153
162
# ('Mammalia', 'Hominidae', 'Homo', 'Homo sapiens')
155
164
# Clearly with limited information we can't do a perfect job, but we can try
156
# and do a reasonable one.
162
# All that said, let's just do the trivial implementation now and see how
163
# bad it is! (assumes ranks are unique except for 'no rank')
166
my $first_lineage = $self->{db}->{node_ids} ? 0 : 1;
168
my $ancestor_node_id;
165
# and do a reasonable one. So, let's just do the trivial implementation now
166
# and see how bad it is! (assumes ranks are unique except for 'no rank')
168
my $ancestors = $self->{ancestors};
169
my $node_data = $self->{node_data};
170
my $name_to_id = $self->{name_to_id};
171
my $children = $self->{children};
173
my $my_ancestor_id = '';
170
for my $i (0..$#names) {
171
my $name = $names[$i];
172
my $rank = $ranks[$i];
175
for my $i (0 .. $names_idx) {
176
my $name = $names->[$i];
177
my $rank = $ranks->[$i]; # if undef, this node has 'no rank'
174
179
# This is a new node with a new id if we haven't seen this name before.
175
180
# It's also always a new node if this is the first lineage going into
178
183
# We need to handle, however, situations in the future where we try to
179
184
# merge in a new lineage but we have non-unique names in the lineage
180
# and possible missing classes in some lineages
182
# '... Anophelinae, Anopheles, Anopheles, Angusticorn, Anopheles...'
185
# and possible missing classes in some lineages, e.g.
186
# '... Anophelinae, Anopheles, Anopheles, Angusticorn, Anopheles...'
184
# '... Anophelinae, Anopheles, Angusticorn, Anopheles...'),
188
# '... Anophelinae, Anopheles, Angusticorn, Anopheles...'),
185
189
# but still need the 'tree' to be correct
188
if ($first_lineage || ! exists $self->{db}->{name_to_id}->{$name}) {
191
# Look for a node that is consistent with this lineage
194
my @same_named = @{$self->{db}->{name_to_id}->{$name}};
193
SAME_NAMED: for my $same_id (@{$name_to_id->{$name}}) {
195
# Taxa are the same if it they have the same ancestor or none
196
my $this_ancestor_id = $ancestors->{$same_id} || '';
197
if ($my_ancestor_id eq $this_ancestor_id) {
196
# look for the node that is consistent with this lineage
197
SAME_NAMED: foreach my $s_id (@same_named) {
198
my $this_ancestor_id;
199
if ($ancestor_node_id) {
200
$this_ancestor_id = $self->{db}->{ancestors}->{$s_id};
201
if ($ancestor_node_id eq $this_ancestor_id) {
203
next if $i >= $names_idx; # this taxon has no child
204
my $my_child_name = $names->[$i + 1];
205
#while ( my ($this_child_id, undef) = each %{$children->{$same_id}} ) {
206
for my $this_child_id (keys %{$children->{$same_id}}) {
207
if ($my_child_name eq $node_data->{$this_child_id}->[0]) { # both children have same name
208
if ($my_ancestor_id) {
210
while ($this_ancestor_id = $ancestors->{$this_ancestor_id}) {
211
if ($my_ancestor_id eq $this_ancestor_id) {
212
$my_ancestor_id = $ancestors->{$same_id};
213
push @node_ids, @s_ancestors, $my_ancestor_id;
217
unshift @s_ancestors, $this_ancestor_id;
220
# This new lineage (@$names) doesn't start at the
221
# same root as the existing lineages. Assuming
222
# '$name' corresponds to node $same_id");
207
if ($names[$i + 1]) {
208
my $my_child_name = $names[$i + 1];
209
my @children_ids = keys %{$self->{db}->{children}->{$s_id} || {}};
210
foreach my $c_id (@children_ids) {
211
my $this_child_name = $self->{db}->{node_data}->{$c_id}->[0];
212
if ($my_child_name eq $this_child_name) {
214
if ($ancestor_node_id) {
216
while ($this_ancestor_id = $self->{db}->{ancestors}->{$this_ancestor_id}) {
217
if ($ancestor_node_id eq $this_ancestor_id) {
219
$ancestor_node_id = $self->{db}->{ancestors}->{$s_id};
220
push(@node_ids, @s_ancestors, $ancestor_node_id);
223
unshift(@s_ancestors, $this_ancestor_id);
227
#$self->warn("This new lineage (@names) doesn't start at the same root as the existing lineages.".
228
# "\nI'm assuming '$name' corresponds to node $s_id");
237
$node_id || $is_new++;
241
my $next_num = ++$self->{db}->{node_ids};
242
# 'list' so definitely not confused with ncbi taxonomy ids
243
$node_id = 'list'.$next_num;
244
push(@{$self->{db}->{name_to_id}->{$name}}, $node_id);
247
unless (exists $self->{db}->{node_data}->{$node_id}) {
248
$self->{db}->{node_data}->{$node_id} = [($name, '')];
250
my $node_data = $self->{db}->{node_data}->{$node_id};
252
if (!$node_data->[1] || ($node_data->[1] eq 'no rank' && $rank ne 'no rank')) {
253
$node_data->[1] = $rank;
256
if ($ancestor_node_id) {
257
if ($self->{db}->{ancestors}->{$node_id} && $self->{db}->{ancestors}->{$node_id} ne $ancestor_node_id) {
258
$self->throw("This lineage (".join(', ', @names).") and a previously computed lineage share a node name but have different ancestries for that node. Can't cope!");
260
$self->{db}->{ancestors}->{$node_id} = $ancestor_node_id;
263
$ancestor_node_id = $node_id;
264
push(@node_ids, $node_id);
230
if (not defined $node_id) {
231
# This is a new node. Add it to the database, using the prefix 'list'
232
# for its ID to distinguish it from the IDs from other taxonomies.
233
my $next_num = ++$self->{node_ids};
234
$node_id = $prefix.$next_num;
235
push @{$self->{name_to_id}->{$name}}, $node_id;
236
$self->{node_data}->{$node_id}->[0] = $name;
239
if ( (defined $rank) && (not defined $node_data->{$node_id}->[1]) ) {
240
# Save rank if node in database has no rank but the current node has one
241
$self->{node_data}->{$node_id}->[1] = $rank;
244
if ($my_ancestor_id) {
245
if ($self->{ancestors}->{$node_id} && $self->{ancestors}->{$node_id} ne $my_ancestor_id) {
246
$self->throw("The lineage '".join(', ', @$names)."' and a ".
247
"previously stored lineage share a node name but have ".
248
"different ancestries for that node. Can't cope!");
250
$self->{ancestors}->{$node_id} = $my_ancestor_id;
253
$my_ancestor_id = $node_id;
254
push @node_ids, $node_id;
267
# go through the lineage in reverse so we can remember the children
269
foreach my $node_id (reverse @node_ids) {
271
$child_id = $node_id;
275
$self->{db}->{children}->{$node_id}->{$child_id} = 1;
276
$child_id = $node_id;
257
# Go through the lineage in reverse so we can remember the children
258
for (my $i = $names_idx - 1; $i >= 0; $i--) {
259
$self->{children}->{$node_ids[$i]}->{$node_ids[$i+1]} = undef;
280
265
=head2 Bio::DB::Taxonomy Interface implementation
270
Usage : my $num = $db->get_num_taxa();
271
Function: Get the number of taxa stored in the database.
279
return $self->{node_ids} || 0;
286
285
Title : get_taxon
287
286
Usage : my $taxon = $db->get_taxon(-taxonid => $taxonid)
288
287
Function: Get a Bio::Taxon object from the database.
289
288
Returns : Bio::Taxon object
290
Args : just a single value which is the database id, OR named args:
291
-taxonid => taxonomy id (to query by taxonid; NB: these are not
292
NCBI taxonomy ids but 'list' pre-fixed ids unique to the
295
-name => string (to query by a taxonomy name)
289
Args : A single value which is the ID of the taxon to retrieve
290
OR named args, as follows:
291
-taxonid => Taxonomy ID (NB: these are not NCBI taxonomy ids but
292
'list' pre-fixed ids unique to the list database).
294
-name => String (to query by a taxonomy name). A given taxon name
295
can match different taxonomy objects. When that is the
296
case, a warning is displayed and the first matching taxon
297
is reported. See get_taxonids() to get all matching taxon
300
-names => Array ref of lineage names, like in add_lineage(). To
301
overcome the limitations of -name, you can use -names to
302
provide the full lineage of the taxon you want and get a
303
unique, unambiguous taxon object.
301
my ($taxonid, $name);
304
($taxonid, $name) = $self->_rearrange([qw(TAXONID NAME)],@_);
308
my ($self, @args) = @_;
311
if (scalar @args == 1) {
312
# Argument is a taxon ID
315
# Got named arguments
317
($taxonid, $name, $names) = $self->_rearrange([qw(TAXONID NAME NAMES)], @args);
306
($taxonid, my @others) = $self->get_taxonids($name);
307
$self->warn("There were multiple ids ($taxonid @others) matching '$name', using '$taxonid'") if @others > 0;
314
my $node = $self->{db}->{node_data}->{$taxonid} || return;
315
my ($sci_name, $rank) = @{$node};
317
my $taxon = Bio::Taxon->new(
319
-object_id => $taxonid, # since this is NOT a real ncbi taxid, set it as simply the object id
321
# we can't use -dbh or the db_handle() method ourselves or we'll go
322
# infinite on the merge attempt
323
$taxon->{'db_handle'} = $self;
325
$self->_handle_internal_id($taxon, 1);
322
$name = $names->[-1];
324
my @taxonids = $self->get_taxonids($name);
325
$taxonid = $taxonids[0];
327
# Use provided lineage to find correct ID amongst several matching IDs
328
if ( (scalar @taxonids > 1) && (scalar @$names > 1) ) {
329
for my $query_taxonid (@taxonids) {
331
my $db_ancestor = $self->get_taxon($query_taxonid);
332
for (my $i = $#$names-1; $i >= 0; $i--) {
333
my $query_ancestor_name = $names->[$i];
334
$db_ancestor = $db_ancestor->ancestor;
335
my $db_ancestor_name = '';
337
$db_ancestor_name = $db_ancestor->node_name;
339
if (not ($query_ancestor_name eq $db_ancestor_name) ) {
341
last; # done testing this taxonid
345
@taxonids = [$query_taxonid];
346
$taxonid = $query_taxonid;
347
last; # done testing all taxonids
352
# Warn if several taxon IDs matched
353
if (scalar @taxonids > 1) {
354
$self->warn("There were multiple ids (@taxonids) matching '$name',".
355
" using '$taxonid'") if scalar @taxonids > 1;
361
# Now that we have the taxon ID, retrieve the corresponding Taxon object
363
my $node = $self->{node_data}->{$taxonid};
365
my ($sci_name, $rank) = @$node;
366
$taxon = Bio::Taxon->new(
368
-object_id => $taxonid, # not an ncbi taxid, simply an object id
375
# we can't use -dbh or the db_handle() method ourselves or we'll go
376
# infinite on the merge attempt
377
$taxon->{'db_handle'} = $self;
379
$self->_handle_internal_id($taxon, 1);
330
385
*get_Taxonomy_Node = \&get_taxon;
332
388
=head2 get_taxonids
334
390
Title : get_taxonids