16
Bio::DB::Taxonomy::flatfile - An implementation of Bio::DB::Taxonomy
17
which uses local flat files
16
Bio::DB::Taxonomy::flatfile - Use the NCBI taxonomy from local indexed flat files
21
20
use Bio::DB::Taxonomy;
23
my $db = Bio::DB::Taxonomy->new(-source => 'flatfile',
24
-nodesfile => $nodesfile,
25
-namesfile => $namefile);
22
my $db = Bio::DB::Taxonomy->new(-source => 'flatfile' ,
23
-nodesfile => 'nodes.dmp',
24
-namesfile => 'names.dmp');
29
This is an implementation which uses local flat files and the DB_File
30
module RECNO data structures to manage a local copy of the NCBI
28
This is an implementation of Bio::DB::Taxonomy which stores and accesses the
29
NCBI taxonomy using flat files stored locally on disk and indexed using the
30
DB_File module RECNO data structure for fast retrieval.
33
Required database files can be obtained from
32
The required database files, nodes.dmp and names.dmp, can be obtained from
34
33
ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz
81
80
# Let the code begin...
83
82
package Bio::DB::Taxonomy::flatfile;
84
use vars qw($DEFAULT_INDEX_DIR $DEFAULT_NODE_INDEX $DEFAULT_NAME2ID_INDEX $DEFAULT_ID2NAME_INDEX
85
$NCBI_TAXONOMY_HOSTNAME $DEFAULT_PARENT_INDEX
86
$NCBI_TAXONOMY_FILE @DIVISIONS);
84
use vars qw($DEFAULT_INDEX_DIR $DEFAULT_NODE_INDEX $DEFAULT_NAME2ID_INDEX
85
$DEFAULT_ID2NAME_INDEX $DEFAULT_PARENT_INDEX @DIVISIONS);
90
use File::Spec::Functions;
91
92
use constant SEPARATOR => ':';
93
$DEFAULT_INDEX_DIR = '/tmp';
94
$DEFAULT_NODE_INDEX = 'nodes';
94
$DEFAULT_INDEX_DIR = $Bio::Root::IO::TEMPDIR; # /tmp
95
$DEFAULT_NODE_INDEX = 'nodes';
95
96
$DEFAULT_NAME2ID_INDEX = 'names2id';
96
97
$DEFAULT_ID2NAME_INDEX = 'id2names';
97
$DEFAULT_PARENT_INDEX = 'parents';
98
$NCBI_TAXONOMY_HOSTNAME = 'ftp.ncbi.nih.gov';
99
$NCBI_TAXONOMY_FILE = '/pub/taxonomy/taxdump.tar.gz';
98
$DEFAULT_PARENT_INDEX = 'parents';
101
100
$DB_BTREE->{'flags'} = R_DUP; # allow duplicate values in DB_File BTREEs
124
123
Args : -directory => name of directory where index files should be created
125
124
-nodesfile => name of file containing nodes (nodes.dmp from NCBI)
126
125
-namesfile => name of the file containing names (names.dmp from NCBI)
127
-force => 1 replace current indexes even if they exist
126
-force => 1 to replace current indexes even if they exist
132
my($class,@args) = @_;
131
my($class, @args) = @_;
134
133
my $self = $class->SUPER::new(@args);
135
my ($dir,$nodesfile,$namesfile,$force) = $self->_rearrange([qw
136
(DIRECTORY NODESFILE NAMESFILE FORCE)], @args);
134
my ($dir,$nodesfile,$namesfile,$force) =
135
$self->_rearrange([qw(DIRECTORY NODESFILE NAMESFILE FORCE)], @args);
138
137
$self->index_directory($dir || $DEFAULT_INDEX_DIR);
139
138
if ( $nodesfile ) {
140
$self->_build_index($nodesfile,$namesfile,$force);
139
$self->_build_index($nodesfile,$namesfile,$force);
143
142
$self->_db_connect;
147
=head2 Bio::DB::Taxonomy Interface implementation
147
=head2 Bio::DB::Taxonomy interface implementation
152
Usage : my $num = $db->get_num_taxa();
153
Function: Get the number of taxa stored in the database.
161
if (not exists $self->{_num_taxa}) {
163
while ( my ($parent, undef) = each %{$self->{_parent2children}} ) {
166
$self->{_num_taxa} = $num;
168
return $self->{_num_taxa};
153
174
Title : get_taxon
249
272
sub get_Children_Taxids {
250
my ($self,$node) = @_;
273
my ($self, $node) = @_;
251
274
$self->warn("get_Children_Taxids is deprecated, use each_Descendent instead");
253
276
if( ref($node) ) {
254
277
if( $node->can('object_id') ) {
255
$id = $node->object_id;
278
$id = $node->object_id;
256
279
} elsif( $node->can('ncbi_taxid') ) {
257
$id = $node->ncbi_taxid;
280
$id = $node->ncbi_taxid;
259
$self->warn("Don't know how to extract a taxon id from the object of type ".ref($node)."\n");
282
$self->warn("Don't know how to extract a taxon id from the object of type ".ref($node)."\n");
262
285
} else { $id = $node }
263
286
my @vals = $self->{'_parentbtree'}->get_dup($id);
307
332
$self->throw("Must supply a Bio::Taxon") unless ref($taxon) && $taxon->isa('Bio::Taxon');
308
333
$self->throw("The supplied Taxon must belong to this database") unless $taxon->db_handle && $taxon->db_handle eq $self;
309
334
my $id = $taxon->id || $self->throw("The supplied Taxon is missing its id!");
311
336
my @desc_ids = $self->{'_parentbtree'}->get_dup($id);
313
338
foreach my $desc_id (@desc_ids) {
314
339
push(@descs, $self->get_taxon($desc_id) || next);
319
345
=head2 Helper methods
323
349
# internal method which does the indexing
324
350
sub _build_index {
325
my ($self,$nodesfile,$namesfile,$force) = @_;
351
my ($self, $nodesfile, $namesfile, $force) = @_;
327
my ($dir) = ($self->index_directory);
328
my $nodeindex = "$dir/$DEFAULT_NODE_INDEX";
329
my $name2idindex = "$dir/$DEFAULT_NAME2ID_INDEX";
330
my $id2nameindex = "$dir/$DEFAULT_ID2NAME_INDEX";
331
my $parent2childindex = "$dir/$DEFAULT_PARENT_INDEX";
332
$self->{'_nodes'} = [];
333
$self->{'_id2name'} = [];
334
$self->{'_name2id'} = {};
353
my $dir = $self->index_directory;
354
my $nodeindex = catfile($dir, $DEFAULT_NODE_INDEX);
355
my $name2idindex = catfile($dir, $DEFAULT_NAME2ID_INDEX);
356
my $id2nameindex = catfile($dir, $DEFAULT_ID2NAME_INDEX);
357
my $parent2childindex = catfile($dir, $DEFAULT_PARENT_INDEX);
358
$self->{'_nodes'} = [];
359
$self->{'_id2name'} = [];
360
$self->{'_name2id'} = {};
335
361
$self->{'_parent2children'} = {};
337
363
if (! -e $nodeindex || $force) {
342
368
unlink $nodeindex;
343
369
unlink $parent2childindex;
344
370
my $nh = tie ( @nodes, 'DB_File', $nodeindex, O_RDWR|O_CREAT, 0644, $DB_RECNO) ||
345
$self->throw("Cannot open file '$nodeindex': $!");
371
$self->throw("Cannot open file '$nodeindex': $!");
346
372
my $btree = tie( %parent2children, 'DB_File', $parent2childindex, O_RDWR|O_CREAT, 0644, $DB_BTREE) ||
347
$self->throw("Cannot open file '$parent2childindex': $!");
373
$self->throw("Cannot tie to file '$parent2childindex': $!");
349
375
while (<NODES>) {
351
378
my ($taxid,$parent,$rank,$code,$divid,undef,$gen_code,undef,$mito) = split(/\t\|\t/,$_);
352
# don't include the fake root node 'root' with id 1; we essentially have multiple roots here
379
# don't include the fake root node 'root' with id 1; we essentially have multiple roots here
358
385
# keep this stringified
359
386
$nodes[$taxid] = join(SEPARATOR, ($taxid,$parent,$rank,$code,$divid,$gen_code,$mito));
360
387
$btree->put($parent,$taxid);
374
401
unlink $id2nameindex;
375
402
my (@id2name,%name2id);
376
403
my $idh = tie (@id2name, 'DB_File', $id2nameindex, O_RDWR|O_CREAT, 0644, $DB_RECNO) ||
377
$self->throw("Cannot open file '$id2nameindex': $!");
404
$self->throw("Cannot tie to file '$id2nameindex': $!");
378
405
my $nameh = tie ( %name2id, 'DB_File', $name2idindex, O_RDWR|O_CREAT, 0644, $DB_HASH) ||
379
$self->throw("Cannot open file '$name2idindex': $!");
406
$self->throw("Cannot tie to file '$name2idindex': $!");
381
408
while (<NAMES>) {
383
411
my ($taxid, $name, $unique_name, $class) = split(/\t\|\t/,$_);
384
# don't include the fake root node 'root' or 'all' with id 1
412
# don't include the fake root node 'root' or 'all' with id 1
387
415
$class =~ s/\s+\|\s*$//;
388
416
my $lc_name = lc($name);
389
417
my $orig_name = $name;
438
467
# connect the internal db handle
439
468
sub _db_connect {
440
469
my $self = shift;
441
470
return if $self->{'_initialized'};
443
$self->{'_nodes'} = [];
444
$self->{'_id2name'} = [];
445
$self->{'_name2id'} = {};
447
my ($dir) = ($self->index_directory);
448
my $nodeindex = "$dir/$DEFAULT_NODE_INDEX";
449
my $name2idindex = "$dir/$DEFAULT_NAME2ID_INDEX";
450
my $id2nameindex = "$dir/$DEFAULT_ID2NAME_INDEX";
451
my $parent2childindex = "$dir/$DEFAULT_PARENT_INDEX";
472
my $dir = $self->index_directory;
473
my $nodeindex = catfile($dir, $DEFAULT_NODE_INDEX);
474
my $name2idindex = catfile($dir, $DEFAULT_NAME2ID_INDEX);
475
my $id2nameindex = catfile($dir, $DEFAULT_ID2NAME_INDEX);
476
my $parent2childindex = catfile($dir, $DEFAULT_PARENT_INDEX);
477
$self->{'_nodes'} = [];
478
$self->{'_id2name'} = [];
479
$self->{'_name2id'} = {};
480
$self->{'_parent2children'} = {};
453
482
if( ! -e $nodeindex ||
454
! -e $name2idindex ||
455
! -e $id2nameindex ) {
456
$self->warn("Index files have not been created");
483
! -e $name2idindex ||
484
! -e $id2nameindex ) {
485
$self->warn("Index files have not been created");
459
488
tie ( @{$self->{'_nodes'}}, 'DB_File', $nodeindex, O_RDWR,undef, $DB_RECNO)
460
|| $self->throw("$! $nodeindex");
489
|| $self->throw("$! $nodeindex");
461
490
tie (@{$self->{'_id2name'}}, 'DB_File', $id2nameindex,O_RDWR, undef,
462
$DB_RECNO) || $self->throw("$! $id2nameindex");
491
$DB_RECNO) || $self->throw("$! $id2nameindex");
464
493
tie ( %{$self->{'_name2id'}}, 'DB_File', $name2idindex, O_RDWR,undef,
465
$DB_HASH) || $self->throw("$! $name2idindex");
494
$DB_HASH) || $self->throw("$! $name2idindex");
466
495
$self->{'_parentbtree'} = tie( %{$self->{'_parent2children'}},
467
'DB_File', $parent2childindex,
468
O_RDWR, 0644, $DB_BTREE);
469
$self->{'_initialized'} = 1;
496
'DB_File', $parent2childindex,
497
O_RDWR, 0644, $DB_BTREE);
499
$self->{'_initialized'} = 1;