22
24
ok my $db_entrez = Bio::DB::Taxonomy->new(-source => 'entrez');
24
ok my $db_flatfile = Bio::DB::Taxonomy->new(-source => 'flatfile',
25
-directory => $temp_dir,
26
-nodesfile => test_input_file('taxdump', 'nodes.dmp'),
27
-namesfile => test_input_file('taxdump','names.dmp'),
25
isa_ok $db_entrez, 'Bio::DB::Taxonomy::entrez';
26
isa_ok $db_entrez, 'Bio::DB::Taxonomy';
28
ok my $db_flatfile = Bio::DB::Taxonomy->new(
29
-source => 'flatfile',
30
-nodesfile => test_input_file('taxdump', 'nodes.dmp'),
31
-namesfile => test_input_file('taxdump','names.dmp'),
33
isa_ok $db_flatfile, 'Bio::DB::Taxonomy::flatfile';
34
isa_ok $db_flatfile, 'Bio::DB::Taxonomy';
36
ok $db_flatfile = Bio::DB::Taxonomy->new(
37
-source => 'flatfile',
38
-directory => $temp_dir,
39
-nodesfile => test_input_file('taxdump', 'nodes.dmp'),
40
-namesfile => test_input_file('taxdump','names.dmp'),
31
foreach my $db ($db_entrez, $db_flatfile) {
45
for my $db ($db_entrez, $db_flatfile) {
33
test_skip(-tests => 38, -requires_networking => 1) if $db eq $db_entrez;
47
test_skip(-tests => 46, -requires_networking => 1) if $db eq $db_entrez;
50
if ($db eq $db_entrez) {
51
cmp_ok $db->get_num_taxa, '>', 880_000; # 886,907 as of 08-May-2012
53
is $db->get_num_taxa, 189;
35
56
eval { $id = $db->get_taxonid('Homo sapiens');};
36
57
skip "Unable to connect to entrez database; no network or server busy?", 38 if $@;
40
61
# easy test on human, try out the main Taxon methods
41
62
ok $n = $db->get_taxon(9606);
44
65
is $n->ncbi_taxid, $n->id;
45
66
is $n->parent_id, 9605;
46
67
is $n->rank, 'species';
48
69
is $n->node_name, 'Homo sapiens';
49
70
is $n->scientific_name, $n->node_name;
50
71
is ${$n->name('scientific')}[0], $n->node_name;
52
73
my %common_names = map { $_ => 1 } $n->common_names;
53
74
is keys %common_names, 3, ref($db).": common names";
54
75
ok exists $common_names{human};
55
76
ok exists $common_names{man};
57
78
is $n->division, 'Primates';
58
79
is $n->genetic_code, 1;
59
80
is $n->mitochondrial_genetic_code, 2;
63
84
ok defined $n->create_date;
64
85
ok defined $n->update_date;
67
88
# briefly test some Bio::Tree::NodeI methods
68
89
ok my $ancestor = $n->ancestor;
69
90
is $ancestor->scientific_name, 'Homo';
70
91
# unless set explicitly, Bio::Taxon doesn't return anything for
71
92
# each_Descendent; must ask the database directly
72
ok my @children = $ancestor->db_handle->each_Descendent($ancestor);
93
ok my @children = $ancestor->db_handle->each_Descendent($ancestor);
94
cmp_ok @children, '>', 0;
75
96
sleep(3) if $db eq $db_entrez;
77
98
# do some trickier things...
78
99
ok my $n2 = $db->get_Taxonomy_Node('89593');
79
100
is $n2->scientific_name, 'Craniata';
81
102
# briefly check we can use some Tree methods
82
103
my $tree = Bio::Tree::Tree->new();
83
104
is $tree->get_lca($n, $n2)->scientific_name, 'Craniata';
107
my @nodes = $tree->get_nodes;
108
is scalar(@nodes), 0;
110
@lineage_nodes = $tree->get_lineage_nodes($n->id); # read ID, only works if nodes have been added to tree
111
is scalar @lineage_nodes, 0;
112
@lineage_nodes = $tree->get_lineage_nodes($n); # node object always works
113
cmp_ok(scalar @lineage_nodes, '>', 20);
116
like($tree->get_lineage_string($n), qr/cellular organisms;Eukaryota/);
117
like($tree->get_lineage_string($n,'-'), qr/cellular organisms-Eukaryota/);
118
like($tree->get_lineage_string($n2), qr/cellular organisms;Eukaryota/);
85
120
# can we actually form a Tree and use other Tree methods?
86
121
ok $tree = Bio::Tree::Tree->new(-node => $n);
87
is $tree->number_nodes, 30;
88
is $tree->get_nodes, 30;
122
cmp_ok($tree->number_nodes, '>', 20);
123
cmp_ok(scalar($tree->get_nodes), '>', 20);
89
124
is $tree->find_node(-rank => 'genus')->scientific_name, 'Homo';
91
126
# check that getting the ancestor still works now we have explitly set the
92
127
# ancestor by making a Tree
93
128
is $n->ancestor->scientific_name, 'Homo';
95
130
sleep(3) if $db eq $db_entrez;
97
132
ok $n = $db->get_Taxonomy_Node('1760');
98
133
is $n->scientific_name, 'Actinobacteria';
100
135
sleep(3) if $db eq $db_entrez;
102
137
# entrez isn't as good at searching as flatfile, so we have to special-case
103
my @ids = $db->get_taxonids('Chloroflexi');
104
$db eq $db_entrez ? (is @ids, 1) : (is @ids, 2);
138
my @ids = sort $db->get_taxonids('Chloroflexi');
140
is_deeply \@ids, [200795, 32061];
105
142
$id = $db->get_taxonids('Chloroflexi (class)');
143
$db eq $db_entrez ? is($id, undef) : is($id, 32061);
108
145
@ids = $db->get_taxonids('Rhodotorula');
109
146
cmp_ok @ids, '>=' , 8;
110
147
@ids = $db->get_taxonids('Rhodotorula <Microbotryomycetidae>');
116
154
# Test the list database
156
ok my $db_list = Bio::DB::Taxonomy->new(-source => 'list');
157
isa_ok $db_list, 'Bio::DB::Taxonomy::list';
158
isa_ok $db_list, 'Bio::DB::Taxonomy';
117
160
my @ranks = qw(superkingdom class genus species);
118
161
my @h_lineage = ('Eukaryota', 'Mammalia', 'Homo', 'Homo sapiens');
119
my $db_list = Bio::DB::Taxonomy->new(-source => 'list', -names => \@h_lineage,
162
ok $db_list = Bio::DB::Taxonomy->new(
164
-names => \@h_lineage,
167
is $db_list->get_num_taxa, 4;
170
ok @taxa = map {$db_list->get_taxon(-name=>$_)} @h_lineage;
171
is_deeply [map {ref($_)} @taxa], [('Bio::Taxon')x4];
172
is_deeply [map {$_->rank} @taxa], \@ranks, 'Ranks';
174
@h_lineage = ('Eukaryota', 'Mammalia', 'Homo', 'Homo erectus');
175
$db_list->add_lineage(-names => \@h_lineage, -ranks => \@ranks);
177
ok @taxa = map {$db_list->get_taxon(-name=>$_)} @h_lineage;
178
is_deeply [map {ref($_)} @taxa], [('Bio::Taxon')x4];
179
is_deeply [map {$_->rank} @taxa], \@ranks, 'Ranks';
182
ok my $tree = $db_list->get_tree('Homo sapiens', 'Homo erectus');
183
isa_ok $tree, 'Bio::Tree::TreeI';
184
is $tree->number_nodes, 5;
185
is $tree->total_branch_length, 4;
186
ok my $node1 = $tree->find_node( -scientific_name => 'Homo sapiens' );
187
ok my $node2 = $tree->find_node( -scientific_name => 'Homo erectus' );
188
is $tree->distance($node1, $node2), 2;
123
190
ok my $h_list = $db_list->get_taxon(-name => 'Homo sapiens');
124
191
ok my $h_flat = $db_flatfile->get_taxon(-name => 'Homo sapiens');
159
226
# get_lca should work on nodes from different databases
161
test_skip(-tests => 5, -requires_networking => 1);
228
test_skip(-tests => 9, -requires_networking => 1);
230
# check that the result is the same as if we are retrieving from the same DB
162
232
$h_flat = $db_flatfile->get_taxon(-name => 'Homo');
233
my $h_flat2 = $db_flatfile->get_taxon(-name => 'Homo sapiens');
234
ok my $tree_functions = Bio::Tree::Tree->new();
235
is $tree_functions->get_lca($h_flat, $h_flat2)->scientific_name, 'Homo', 'get_lca() within flatfile db';
164
239
eval { $h_entrez = $db_entrez->get_taxon(-name => 'Homo sapiens');};
165
skip "Unable to connect to entrez database; no network or server busy?", 5 if $@;
167
ok my $tree_functions = Bio::Tree::Tree->new();
168
is $tree_functions->get_lca($h_flat, $h_entrez)->scientific_name, 'Homo';
240
skip "Unable to connect to entrez database; no network or server busy?", 7 if $@;
242
eval { $h_entrez2 = $db_entrez->get_taxon(-name => 'Homo');};
243
skip "Unable to connect to entrez database; no network or server busy?", 7 if $@;
244
ok $tree_functions = Bio::Tree::Tree->new();
245
is $tree_functions->get_lca($h_entrez, $h_entrez2)->scientific_name, 'Homo', 'get_lca() within entrez db';
247
ok $tree_functions = Bio::Tree::Tree->new();
248
# mixing entrez and flatfile
250
local $TODO = 'Mixing databases for get_lca() not working, see bug #3416';
251
is $tree_functions->get_lca($h_flat, $h_entrez)->scientific_name, 'Homo', 'get_lca() mixing flatfile and remote db';
170
253
# even though the species taxa for Homo sapiens from list and flat databases
171
254
# have the same internal id, get_lca won't work because they have different
172
255
# roots and descendents
203
286
my $ids = join(",", map { $_->id } $tree->get_nodes);
204
287
is $ids, '131567,9606';
289
# More thorough tests of merge_lineage
290
ok my $node = $db_list->get_taxon(-name => 'Eukaryota');
291
$tree = Bio::Tree::Tree->new(-node => $node);
292
ok $node = $db_list->get_taxon(-name => 'Homo erectus');
293
ok $tree->merge_lineage($node);
294
for my $name ('Eukaryota', 'Mammalia', 'Homo', 'Homo erectus') {
295
ok $node = $tree->find_node(-scientific_name => $name);
206
298
# we can recursively fetch all descendents of a taxon
208
300
test_skip(-tests => 1, -requires_networking => 1);
209
301
eval {$db_entrez->get_taxon(10090);};
210
302
skip "Unable to connect to entrez database; no network or server busy?", 1 if $@;
212
304
my $lca = $db_entrez->get_taxon(314146);
213
305
my @descs = $db_entrez->get_all_Descendents($lca);
214
306
cmp_ok @descs, '>=', 17;
218
310
$db_list = Bio::DB::Taxonomy->new(-source => 'list',
220
312
(split(/,\s+/, "cellular organisms, Eukaryota, Fungi/Metazoa group,
221
313
Metazoa, Eumetazoa, Bilateria, Coelomata, Protostomia, Panarthropoda,
222
314
Arthropoda, Mandibulata, Pancrustacea, Hexapoda, Insecta, Dicondylia,
225
317
Anopheles, maculipennis group, maculipennis species complex, Anopheles daciae"))]);
227
319
my @taxonids = $db_list->get_taxonids('Anopheles');
320
is @taxonids, 3, 'List context';
322
my $taxonid = $db_list->get_taxonids('Anopheles');
323
isa_ok \$taxonid, 'SCALAR', 'Scalar context';
324
ok exists { map({$_ => undef} @taxonids) }->{$taxonid};
230
326
# but we should still be able to merge in an incomplete lineage of a sister
231
327
# species and have the 'tree' remain consistent:
234
330
$db_list->add_lineage(-names => [
235
331
(split(/,\s+/, "Anophelinae, Anopheles, Anopheles, Angusticorn,
236
332
maculipennis group, maculipennis species complex, Anopheles labranchiae"))]);
237
my $node = $db_list->get_taxon(-name => 'Anopheles labranchiae');
333
$node = $db_list->get_taxon(-name => 'Anopheles labranchiae');
238
334
is $node->ancestor->ancestor->ancestor->ancestor->ancestor->ancestor->ancestor->scientific_name, 'Anophelinae';
335
is $node->rank, undef;
240
337
# missing 'subgenus' Anopheles
241
338
$db_list->add_lineage(-names => [
252
349
is $node->ancestor->ancestor->ancestor->ancestor->scientific_name, 'Angusticorn';
254
351
@taxonids = $db_list->get_taxonids('Anopheles');
352
is scalar @taxonids, 3;
354
# bug: duplicate topmost taxa
355
$db_list = Bio::DB::Taxonomy->new( -source => 'list',
356
-names => ['Bacteria', 'Tenericutes'] );
357
$db_list->add_lineage( -names => ['Bacteria'] );
358
@taxonids = $db_list->get_taxonids('Bacteria');
359
is scalar @taxonids, 1;
361
# Disambiguate between taxa with same name using -names
362
ok $db_list = Bio::DB::Taxonomy->new( -source => 'list' ), 'DB with ambiguous names';
363
ok $db_list->add_lineage( -names => ['c__Gammaproteobacteria', 'o__Oceanospirillales', 'f__Alteromonadaceae', 'g__Spongiibacter'] );
364
ok $db_list->add_lineage( -names => ['c__Gammaproteobacteria', 'o__Alteromonadales' , 'f__Alteromonadaceae', 'g__Alteromonas' ] );
366
ok @taxonids = $db_list->get_taxonids('f__Alteromonadaceae');
367
is scalar @taxonids, 2; # multiple taxa would match using $db_list->get_taxon(-name => 'f__Alteromonadaceae')
369
ok $node = $db_list->get_taxon( -names => ['c__Gammaproteobacteria', 'o__Alteromonadales' , 'f__Alteromonadaceae'] );
370
is $node->ancestor->node_name, 'o__Alteromonadales';
371
my $iid = $node->internal_id;
373
ok $node = $db_list->get_taxon( -names => ['c__Gammaproteobacteria', 'o__Oceanospirillales', 'f__Alteromonadaceae'] );
374
is $node->ancestor->node_name, 'o__Oceanospirillales';
375
isnt $node->internal_id, $iid;
378
# More tests with ambiguous names, internal IDs and multiple databases
379
my ($node3, $node4, $db_list_2);
380
ok $db_list = Bio::DB::Taxonomy->new( -source => 'list' );
381
ok $db_list->add_lineage( -names => [ 'o__Enterobacteriales', 'g__Escherichia' ] );
382
ok $db_list->add_lineage( -names => [ 'o__Pseudomonadales' , 'g__Pseudomonas' ] );
383
ok $db_list->add_lineage( -names => [ 'o__Chroococcales' , 'g__Microcoleus' ] );
384
ok $node1 = $db_list->get_taxon( -names => [ 'k__Chroococcales', 'g__Microcoleus' ] );
386
ok $db_list_2 = Bio::DB::Taxonomy->new( -source => 'list' );
387
ok $db_list_2->add_lineage( -names => [ 'o__Chroococcales', 'g__Microcoleus' ] );
388
ok $node2 = $db_list_2->get_taxon( -names => [ 'o__Chroococcales', 'g__Microcoleus' ] );
390
is $node1->scientific_name, 'g__Microcoleus';
391
is $node2->scientific_name, 'g__Microcoleus'; # same taxon name
392
isnt $node1->id, $node2->id; # but different dbs and hence taxids
393
is $node1->internal_id, $node1->internal_id; # but same cross-database internal ID
395
ok $db_list->add_lineage( -names => [ 'o__Oscillatoriales' , 'g__Microcoleus' ] );
396
ok $db_list->add_lineage( -names => [ 'o__Acidobacteriales', 'g__Microcoleus' ] );
398
ok $node1 = $db_list->get_taxon( -names => [ 'o__Chroococcales', 'g__Microcoleus' ] );
399
ok $node2 = $db_list->get_taxon( -names => [ 'o__Oscillatoriales' , 'g__Microcoleus' ] );
400
ok $node3 = $db_list->get_taxon( -names => [ 'o__Acidobacteriales' , 'g__Microcoleus' ] );
401
my @nodes = ($node1, $node2, $node3);
403
is map({$_->id => undef} @nodes), 6; # 3 distinct taxids
404
is map({$_->internal_id => undef} @nodes), 6; # 3 distinct iids
406
ok $db_list->add_lineage( -names => [ 'o__Chroococcales' , 'g__Microcoleus' ] );
407
ok $node2 = $db_list->get_taxon( -names => [ 'o__Chroococcales', 'g__Microcoleus' ] );
408
is $node2->scientific_name, $node1->scientific_name;
409
is $node2->id, $node1->id;
410
is $node2->internal_id, $node1->internal_id;