259
my ($self,@args) = @_;
259
my ( $self, @args ) = @_;
261
261
my $builder = $self->sequence_builder();
268
my (@acc, @features);
269
my ($display_id, $annotation);
272
# initialize; we may come here because of starting over
277
%params = (-verbose => $self->verbose); # reset hash
279
while(defined($buffer = $self->_readline())) {
280
last if index($buffer,'LOCUS ') == 0;
282
return unless defined $buffer; # end of file
283
$buffer =~ /^LOCUS\s+(\S.*)$/o ||
284
$self->throw("GenBank stream with bad LOCUS line. Not GenBank in my book. Got '$buffer'");
286
my @tokens = split(' ', $1);
288
# this is important to have the id for display in e.g. FTHelper,
289
# otherwise you won't know which entry caused an error
290
$display_id = shift(@tokens);
291
$params{'-display_id'} = $display_id;
292
# may still be useful if we don't want the seq
293
my $seqlength = shift(@tokens);
294
if (exists $VALID_ALPHABET{$seqlength}) {
295
# moved one token too far. No locus name?
296
$self->warn("Bad LOCUS name? Changing [$params{'-display_id'}] to 'unknown' and length to $display_id");
297
$params{'-display_id'} = 'unknown';
298
$params{'-length'} = $display_id;
300
unshift @tokens, $seqlength;
302
$params{'-length'} = $seqlength;
304
# the alphabet of the entry
305
# shouldn't assign alphabet unless one is specifically designated (such as for rc files)
306
my $alphabet = lc(shift @tokens);
307
$params{'-alphabet'} = (exists $VALID_ALPHABET{$alphabet}) ? $VALID_ALPHABET{$alphabet} :
308
$self->warn("Unknown alphabet: $alphabet");
309
# for aa there is usually no 'molecule' (mRNA etc)
310
if ($params{'-alphabet'} eq 'protein') {
311
$params{'-molecule'} = 'PRT'
313
$params{'-molecule'} = shift(@tokens);
315
# take care of lower case issues
316
if ($params{'-molecule'} eq 'dna' || $params{'-molecule'} eq 'rna') {
317
$params{'-molecule'} = uc $params{'-molecule'};
319
$self->debug("Unrecognized molecule type:".$params{'-molecule'}) if
320
!exists($VALID_MOLTYPE{$params{'-molecule'}});
321
my $circ = shift(@tokens);
322
if ($circ eq 'circular') {
323
$params{'-is_circular'} = 1;
324
$params{'-division'} = shift(@tokens);
326
# 'linear' or 'circular' may actually be omitted altogether
327
$params{'-division'} =
328
(CORE::length($circ) == 3 ) ? $circ : shift(@tokens);
330
my $date = join(' ', @tokens); # we lump together the rest
332
# this is per request bug #1513
338
if($date =~ s/\s*((\d{1,2})-(\w{3})-(\d{2,4})).*/$1/) {
339
if( length($date) < 11 ) {
340
# improperly formatted date
341
# But we'll be nice and fix it for them
342
my ($d,$m,$y) = ($2,$3,$4);
343
if( length($d) == 1 ) {
346
# guess the century here
347
if( length($y) == 2 ) {
348
if( $y > 60 ) { # arbitrarily guess that '60' means 1960
353
$self->warn("Date was malformed, guessing the century for $date to be $y\n");
355
$params{'-dates'} = [join('-',$d,$m,$y)];
357
$params{'-dates'} = [$date];
360
# set them all at once
361
$builder->add_slot_value(%params);
364
# parse the rest if desired, otherwise start over
365
if(! $builder->want_object()) {
366
$builder->make_object();
370
# set up annotation depending on what the builder wants
371
if($builder->want_slot('annotation')) {
372
$annotation = Bio::Annotation::Collection->new();
374
$buffer = $self->_readline();
375
until( !defined ($buffer) ) {
377
# Description line(s)
378
if (/^DEFINITION\s+(\S.*\S)/) {
380
while ( defined($_ = $self->_readline) ) {
381
if( /^\s+(.*)/ ) { push (@desc, $1); next };
384
$builder->add_slot_value(-desc => join(' ', @desc));
385
# we'll continue right here because DEFINITION always comes
386
# at the top of the entry
389
# accession number (there can be multiple accessions)
390
if( /^ACCESSION\s+(\S.*\S)/ ) {
391
push(@acc, split(/\s+/,$1));
392
while( defined($_ = $self->_readline) ) {
393
/^\s+(.*)/ && do { push (@acc, split(/\s+/,$1)); next };
400
elsif( /^PID\s+(\S+)/ ) {
401
$params{'-pid'} = $1;
404
elsif( /^VERSION\s+(\S.+)$/ ) {
405
my ($acc,$gi) = split(' ',$1);
406
if($acc =~ /^\w+\.(\d+)/) {
407
$params{'-version'} = $1;
408
$params{'-seq_version'} = $1;
410
if($gi && (index($gi,"GI:") == 0)) {
411
$params{'-primary_id'} = substr($gi,3);
415
elsif( /^KEYWORDS\s+(\S.*)/ ) {
416
my @kw = split(/\s*\;\s*/,$1);
417
while( defined($_ = $self->_readline) ) {
419
/^\s+(.*)/ && do { push (@kw, split(/\s*\;\s*/,$1)); next };
423
@kw && $kw[-1] =~ s/\.$//;
424
$params{'-keywords'} = \@kw;
428
# Organism name and phylogenetic information
429
elsif (/^SOURCE\s+\S/) {
430
if($builder->want_slot('species')) {
431
$species = $self->_read_GenBank_Species(\$buffer);
432
$builder->add_slot_value(-species => $species);
434
while(defined($buffer = $self->_readline())) {
435
last if substr($buffer,0,1) ne ' ';
441
elsif (/^REFERENCE\s+\S/) {
443
my @refs = $self->_read_GenBank_References(\$buffer);
444
foreach my $ref ( @refs ) {
445
$annotation->add_Annotation('reference',$ref);
448
while(defined($buffer = $self->_readline())) {
449
last if substr($buffer,0,1) ne ' ';
455
elsif (/^PROJECT\s+(\S.*)/) {
457
my $project = Bio::Annotation::SimpleValue->new(-value => $1);
458
$annotation->add_Annotation('project',$project);
462
elsif (/^COMMENT\s+(\S.*)/) {
465
while (defined($_ = $self->_readline)) {
469
$comment =~ s/\n/ /g;
470
$comment =~ s/ +/ /g;
471
$annotation->add_Annotation('comment',
472
Bio::Annotation::Comment->new(-text => $comment,
473
-tagname => 'comment'));
476
while(defined($buffer = $self->_readline())) {
477
last if substr($buffer,0,1) ne ' ';
482
# Corresponding Genbank nucleotide id, Genpept only
483
elsif( /^DB(?:SOURCE|LINK)\s+(\S.+)/ ) {
486
while (defined($_ = $self->_readline)) {
490
# deal with UniProKB dbsources
491
if( $dbsource =~ s/(UniProt(?:KB)?|swissprot):\s+locus\s+(\S+)\,.+\n// ) {
492
$annotation->add_Annotation
494
Bio::Annotation::DBLink->new
497
-tagname => 'dblink'));
498
if( $dbsource =~ s/\s+created:\s+([^\.]+)\.\n// ) {
499
$annotation->add_Annotation
501
Bio::Annotation::SimpleValue->new
502
(-tagname => 'date_created',
505
while( $dbsource =~ s/\s+(sequence|annotation)\s+updated:\s+([^\.]+)\.\n//g ) {
506
$annotation->add_Annotation
508
Bio::Annotation::SimpleValue->new
509
(-tagname => 'date_updated',
512
$dbsource =~ s/\n/ /g;
513
if( $dbsource =~ s/\s+xrefs:\s+((?:\S+,\s+)+\S+)\s+xrefs/xrefs/ ) {
514
# will use $i to determine even or odd
515
# for swissprot the accessions are paired
517
for my $dbsrc ( split(/,\s+/,$1) ) {
518
if( $dbsrc =~ /(\S+)\.(\d+)/ ||
519
$dbsrc =~ /(\S+)/ ) {
520
my ($id,$version) = ($1,$2);
521
$version ='' unless defined $version;
523
if( $id =~ /^\d\S{3}/) {
526
$db = ($i++ % 2 ) ? 'GenPept' : 'GenBank';
528
$annotation->add_Annotation
530
Bio::Annotation::DBLink->new
532
-version => $version,
534
-tagname => 'dblink'));
537
} elsif( $dbsource =~ s/\s+xrefs:\s+(.+)\s+xrefs/xrefs/i ) {
538
# download screwed up and ncbi didn't put acc in for gi numbers
540
for my $id ( split(/\,\s+/,$1) ) {
542
if( $id =~ /gi:\s+(\d+)/ ) {
544
$db = ($i++ % 2 ) ? 'GenPept' : 'GenBank';
545
} elsif( $id =~ /pdb\s+accession\s+(\S+)/ ) {
552
$annotation->add_Annotation
554
Bio::Annotation::DBLink->new
555
(-primary_id => $acc,
557
-tagname => 'dblink'));
560
$self->debug("Cannot match $dbsource\n");
562
if( $dbsource =~ s/xrefs\s+\(non\-sequence\s+databases\):\s+
563
((?:\S+,\s+)+\S+)//x ) {
564
for my $id ( split(/\,\s+/,$1) ) {
566
# this is because GenBank dropped the spaces!!!
567
# I'm sure we're not going to get this right
568
##if( $id =~ s/^://i ) {
571
$db = substr($id,0,index($id,':'));
572
if (! exists $DBSOURCE{ $db }) {
573
$db = ''; # do we want 'GenBank' here?
575
$id = substr($id,index($id,':')+1);
576
$annotation->add_Annotation
578
Bio::Annotation::DBLink->new
581
-tagname => 'dblink'));
586
if( $dbsource =~ /^(\S*?):?\s*accession\s+(\S+)\.(\d+)/ ) {
587
my ($db,$id,$version) = ($1,$2,$3);
588
$annotation->add_Annotation
590
Bio::Annotation::DBLink->new
592
-version => $version,
593
-database => $db || 'GenBank',
594
-tagname => 'dblink'));
595
} elsif ( $dbsource =~ /^(\S*?):?\s*accession\s+(\S+)/ ) {
596
my ($db,$id) = ($1,$2);
597
$annotation->add_Annotation
599
Bio::Annotation::DBLink->new
601
-database => $db || 'GenBank',
602
-tagname => 'dblink'));
603
} elsif ( $dbsource =~ /(\S+)([\.:])\s*(\S+)/ ) {
608
# Genbank 192 release notes say this: "The second field can consist of
609
# multiple comma-separated identifiers, if a sequence record has
610
# multiple DBLINK cross-references of a given type."
611
# For example: DBLINK Project:100,200,300"
612
@ids = split (/,/, $3);
614
($db, $version) = ('GenBank', $3);
618
foreach my $id (@ids) {
619
$annotation->add_Annotation('dblink',
268
my ( @acc, @features );
269
my ( $display_id, $annotation );
272
# initialize; we may come here because of starting over
277
%params = ( -verbose => $self->verbose ); # reset hash
279
while ( defined( $buffer = $self->_readline() ) ) {
280
last if index( $buffer, 'LOCUS ' ) == 0;
282
return unless defined $buffer; # end of file
283
$buffer =~ /^LOCUS\s+(\S.*)$/o
285
"GenBank stream with bad LOCUS line. Not GenBank in my book. Got '$buffer'"
288
my @tokens = split( ' ', $1 );
290
# this is important to have the id for display in e.g. FTHelper,
291
# otherwise you won't know which entry caused an error
292
$display_id = shift(@tokens);
293
$params{'-display_id'} = $display_id;
295
# may still be useful if we don't want the seq
296
my $seqlength = shift(@tokens);
297
if ( exists $VALID_ALPHABET{$seqlength} ) {
299
# moved one token too far. No locus name?
301
"Bad LOCUS name? Changing [$params{'-display_id'}] to 'unknown' and length to $display_id"
303
$params{'-display_id'} = 'unknown';
304
$params{'-length'} = $display_id;
307
unshift @tokens, $seqlength;
310
$params{'-length'} = $seqlength;
313
# the alphabet of the entry
314
# shouldn't assign alphabet unless one is specifically designated (such as for rc files)
315
my $alphabet = lc( shift @tokens );
316
$params{'-alphabet'} =
317
( exists $VALID_ALPHABET{$alphabet} )
318
? $VALID_ALPHABET{$alphabet}
319
: $self->warn("Unknown alphabet: $alphabet");
321
# for aa there is usually no 'molecule' (mRNA etc)
322
if ( $params{'-alphabet'} eq 'protein' ) {
323
$params{'-molecule'} = 'PRT';
326
$params{'-molecule'} = shift(@tokens);
329
# take care of lower case issues
330
if ( $params{'-molecule'} eq 'dna' || $params{'-molecule'} eq 'rna' ) {
331
$params{'-molecule'} = uc $params{'-molecule'};
333
$self->debug( "Unrecognized molecule type:" . $params{'-molecule'} )
334
if !exists( $VALID_MOLTYPE{ $params{'-molecule'} } );
335
my $circ = shift(@tokens);
336
if ( $circ eq 'circular' ) {
337
$params{'-is_circular'} = 1;
338
$params{'-division'} = shift(@tokens);
341
# 'linear' or 'circular' may actually be omitted altogether
342
$params{'-division'} =
343
( CORE::length($circ) == 3 ) ? $circ : shift(@tokens);
345
my $date = join( ' ', @tokens ); # we lump together the rest
347
# this is per request bug #1513
353
if ( $date =~ s/\s*((\d{1,2})-(\w{3})-(\d{2,4})).*/$1/ ) {
354
if ( length($date) < 11 ) {
356
# improperly formatted date
357
# But we'll be nice and fix it for them
358
my ( $d, $m, $y ) = ( $2, $3, $4 );
359
if ( length($d) == 1 ) {
363
# guess the century here
364
if ( length($y) == 2 ) {
365
if ( $y > 60 ) { # arbitrarily guess that '60' means 1960
372
"Date was malformed, guessing the century for $date to be $y\n"
375
$params{'-dates'} = [ join( '-', $d, $m, $y ) ];
378
$params{'-dates'} = [$date];
382
# set them all at once
383
$builder->add_slot_value(%params);
386
# parse the rest if desired, otherwise start over
387
if ( !$builder->want_object() ) {
388
$builder->make_object();
392
# set up annotation depending on what the builder wants
393
if ( $builder->want_slot('annotation') ) {
394
$annotation = Bio::Annotation::Collection->new();
396
$buffer = $self->_readline();
397
until ( !defined($buffer) ) {
400
# Description line(s)
401
if (/^DEFINITION\s+(\S.*\S)/) {
403
while ( defined( $_ = $self->_readline ) ) {
404
if (/^\s+(.*)/) { push( @desc, $1 ); next }
407
$builder->add_slot_value( -desc => join( ' ', @desc ) );
409
# we'll continue right here because DEFINITION always comes
410
# at the top of the entry
414
# accession number (there can be multiple accessions)
415
if (/^ACCESSION\s+(\S.*\S)/) {
416
push( @acc, split( /\s+/, $1 ) );
417
while ( defined( $_ = $self->_readline ) ) {
418
/^\s+(.*)/ && do { push( @acc, split( /\s+/, $1 ) ); next };
426
elsif (/^PID\s+(\S+)/) {
427
$params{'-pid'} = $1;
431
elsif (/^VERSION\s+(\S.+)$/) {
432
my ( $acc, $gi ) = split( ' ', $1 );
433
if ( $acc =~ /^\w+\.(\d+)/ ) {
434
$params{'-version'} = $1;
435
$params{'-seq_version'} = $1;
437
if ( $gi && ( index( $gi, "GI:" ) == 0 ) ) {
438
$params{'-primary_id'} = substr( $gi, 3 );
443
elsif (/^KEYWORDS\s+(\S.*)/) {
444
my @kw = split( /\s*\;\s*/, $1 );
445
while ( defined( $_ = $self->_readline ) ) {
448
&& do { push( @kw, split( /\s*\;\s*/, $1 ) ); next };
452
@kw && $kw[-1] =~ s/\.$//;
453
$params{'-keywords'} = \@kw;
458
# Organism name and phylogenetic information
459
elsif (/^SOURCE\s+\S/) {
460
if ( $builder->want_slot('species') ) {
461
$species = $self->_read_GenBank_Species( \$buffer );
462
$builder->add_slot_value( -species => $species );
465
while ( defined( $buffer = $self->_readline() ) ) {
466
last if substr( $buffer, 0, 1 ) ne ' ';
473
elsif (/^REFERENCE\s+\S/) {
475
my @refs = $self->_read_GenBank_References( \$buffer );
476
foreach my $ref (@refs) {
477
$annotation->add_Annotation( 'reference', $ref );
481
while ( defined( $buffer = $self->_readline() ) ) {
482
last if substr( $buffer, 0, 1 ) ne ' ';
489
elsif (/^PROJECT\s+(\S.*)/) {
492
Bio::Annotation::SimpleValue->new( -value => $1 );
493
$annotation->add_Annotation( 'project', $project );
498
elsif (/^COMMENT\s+(\S.*)/) {
501
while ( defined( $_ = $self->_readline ) ) {
505
$comment =~ s/\n/ /g;
506
$comment =~ s/ +/ /g;
507
$annotation->add_Annotation(
509
Bio::Annotation::Comment->new(
511
-tagname => 'comment'
517
while ( defined( $buffer = $self->_readline() ) ) {
518
last if substr( $buffer, 0, 1 ) ne ' ';
524
# Corresponding Genbank nucleotide id, Genpept only
525
elsif (/^DB(?:SOURCE|LINK)\s+(\S.+)/) {
528
while ( defined( $_ = $self->_readline ) ) {
533
# deal with UniProKB dbsources
535
s/(UniProt(?:KB)?|swissprot):\s+locus\s+(\S+)\,.+\n// )
537
$annotation->add_Annotation(
620
539
Bio::Annotation::DBLink->new(
622
-version => $version,
624
-tagname => 'dblink')
628
$self->warn("Unrecognized DBSOURCE data: $dbsource\n");
634
while(defined($buffer = $self->_readline())) {
635
last if substr($buffer,0,1) ne ' ';
640
# Exit at start of Feature table, or start of sequence
641
last if( /^(FEATURES|ORIGIN)/ );
642
# Get next line and loop again
643
$buffer = $self->_readline;
645
return unless defined $buffer;
647
# add them all at once for efficiency
648
$builder->add_slot_value(-accession_number => shift(@acc),
649
-secondary_accessions => \@acc,
651
$builder->add_slot_value(-annotation => $annotation) if $annotation;
652
%params = (); # reset before possible re-use to avoid setting twice
654
# start over if we don't want to continue with this entry
655
if(! $builder->want_object()) {
656
$builder->make_object();
659
# some "minimal" formats may not necessarily have a feature table
660
if($builder->want_slot('features') && defined($_) && /^FEATURES/o) {
661
# need to read the first line of the feature table
662
$buffer = $self->_readline;
663
# DO NOT read lines in the while condition -- this is done as a side
664
# effect in _read_FTHelper_GenBank!
666
# part of new circular spec:
667
# commented out for now until kinks worked out
669
#$sourceEnd = $2 if ($buffer =~ /(\d+?)\.\.(\d+?)$/);
671
while( defined($buffer) ) {
672
# check immediately -- not at the end of the loop
673
# note: GenPept entries obviously do not have a BASE line
674
last if( $buffer =~ /^BASE|ORIGIN|CONTIG|WGS/o);
676
# slurp in one feature at a time -- at return, the start of
677
# the next feature will have been read already, so we need
678
# to pass a reference, and the called method must set this
679
# to the last line read before returning
681
my $ftunit = $self->_read_FTHelper_GenBank(\$buffer);
683
# implement new circular spec: features that cross the origin are now
684
# seamless instead of being 2 separate joined features
685
# commented out until kinks get worked out
686
#if ((! $args{'-nojoin'}) && $ftunit->{'loc'} =~ /^join\((\d+?)\.\.(\d+?),(\d+?)..(\d+?)\)$/
687
#&& $sourceEnd == $2 && $3 == 1) {
690
#$ftunit->{'loc'} = "$start..$end";
693
# fix suggested by James Diggans
695
if( !defined $ftunit ) {
696
# GRRRR. We have fallen over. Try to recover
697
$self->warn("Unexpected error in feature table for ".$params{'-display_id'}." Skipping feature, attempting to recover");
698
unless( ($buffer =~ /^\s{5,5}\S+/o) or
699
($buffer =~ /^\S+/o)) {
700
$buffer = $self->_readline();
702
next; # back to reading FTHelpers
707
$ftunit->_generic_seqfeature($self->location_factory(),
709
# add taxon_id from source if available
710
if($species && ($feat->primary_tag eq 'source') &&
711
$feat->has_tag('db_xref') && (! $species->ncbi_taxid() ||
712
($species->ncbi_taxid && $species->ncbi_taxid =~ /^list/))) {
713
foreach my $tagval ($feat->get_tag_values('db_xref')) {
714
if(index($tagval,"taxon:") == 0) {
715
$species->ncbi_taxid(substr($tagval,6));
720
# add feature to list of features
721
push(@features, $feat);
723
$builder->add_slot_value(-features => \@features);
730
while($_ !~ m{^//}) { # end of file
731
$_ =~ /^(?:CONTIG)?\s+(.*)/;
733
$_ = $self->_readline;
736
$annotation->add_Annotation(
737
Bio::Annotation::SimpleValue->new(-tagname => 'contig',
741
$self->_pushback($_);
742
} elsif( /^WGS|WGS_SCAFLD\s+/o ) { # catch WGS/WGS_SCAFLD lines
743
while($_ =~ s/(^WGS|WGS_SCAFLD)\s+//){ # gulp lines
745
$annotation->add_Annotation(
746
Bio::Annotation::SimpleValue->new(-value => $_,
747
-tagname => lc($1)));
748
$_ = $self->_readline;
750
} elsif(! m{^(ORIGIN|//)} ) { # advance to the sequence, if any
751
while (defined( $_ = $self->_readline) ) {
752
last if m{^(ORIGIN|//)};
756
if(! $builder->want_object()) {
757
$builder->make_object(); # implicit end-of-object
760
if($builder->want_slot('seq')) {
761
# the fact that we want a sequence does not necessarily mean that
762
# there also is a sequence ...
763
if(defined($_) && s/^ORIGIN\s+//) {
765
if( $annotation && length($_) > 0 ) {
766
$annotation->add_Annotation('origin',
767
Bio::Annotation::SimpleValue->new(-tagname => 'origin',
771
while( defined($_ = $self->_readline) ) {
777
#$self->debug("sequence length is ". length($seqc) ."\n");
778
$builder->add_slot_value(-seq => $seqc);
780
} elsif ( defined($_) && (substr($_,0,2) ne '//')) {
781
# advance to the end of the record
782
while( defined($_ = $self->_readline) ) {
783
last if substr($_,0,2) eq '//';
786
# Unlikely, but maybe the sequence is so weird that we don't want it
787
# anymore. We don't want to return undef if the stream's not exhausted
789
$seq = $builder->make_object();
790
next RECORDSTART unless $seq;
792
} # end while RECORDSTART
545
if ( $dbsource =~ s/\s+created:\s+([^\.]+)\.\n// ) {
546
$annotation->add_Annotation(
548
Bio::Annotation::SimpleValue->new(
549
-tagname => 'date_created',
555
s/\s+(sequence|annotation)\s+updated:\s+([^\.]+)\.\n//g
558
$annotation->add_Annotation(
560
Bio::Annotation::SimpleValue->new(
561
-tagname => 'date_updated',
566
$dbsource =~ s/\n/ /g;
568
s/\s+xrefs:\s+((?:\S+,\s+)+\S+)\s+xrefs/xrefs/ )
570
# will use $i to determine even or odd
571
# for swissprot the accessions are paired
573
for my $dbsrc ( split( /,\s+/, $1 ) ) {
574
if ( $dbsrc =~ /(\S+)\.(\d+)/
575
|| $dbsrc =~ /(\S+)/ )
577
my ( $id, $version ) = ( $1, $2 );
578
$version = '' unless defined $version;
580
if ( $id =~ /^\d\S{3}/ ) {
585
( $i++ % 2 ) ? 'GenPept' : 'GenBank';
587
$annotation->add_Annotation(
589
Bio::Annotation::DBLink->new(
591
-version => $version,
600
$dbsource =~ s/\s+xrefs:\s+(.+)\s+xrefs/xrefs/i )
602
# download screwed up and ncbi didn't put acc in for gi numbers
604
for my $id ( split( /\,\s+/, $1 ) ) {
606
if ( $id =~ /gi:\s+(\d+)/ ) {
608
$db = ( $i++ % 2 ) ? 'GenPept' : 'GenBank';
610
elsif ( $id =~ /pdb\s+accession\s+(\S+)/ ) {
618
$annotation->add_Annotation(
620
Bio::Annotation::DBLink->new(
629
$self->debug("Cannot match $dbsource\n");
633
s/xrefs\s+\(non\-sequence\s+databases\):\s+
637
for my $id ( split( /\,\s+/, $1 ) ) {
640
# this is because GenBank dropped the spaces!!!
641
# I'm sure we're not going to get this right
642
##if( $id =~ s/^://i ) {
645
$db = substr( $id, 0, index( $id, ':' ) );
646
if ( !exists $DBSOURCE{$db} ) {
647
$db = ''; # do we want 'GenBank' here?
649
$id = substr( $id, index( $id, ':' ) + 1 );
650
$annotation->add_Annotation(
652
Bio::Annotation::DBLink->new(
664
/^(\S*?):?\s*accession\s+(\S+)\.(\d+)/ )
666
my ( $db, $id, $version ) = ( $1, $2, $3 );
667
$annotation->add_Annotation(
669
Bio::Annotation::DBLink->new(
671
-version => $version,
672
-database => $db || 'GenBank',
677
elsif ( $dbsource =~ /^(\S*?):?\s*accession\s+(\S+)/ ) {
678
my ( $db, $id ) = ( $1, $2 );
679
$annotation->add_Annotation(
681
Bio::Annotation::DBLink->new(
683
-database => $db || 'GenBank',
688
elsif ( $dbsource =~ /(\S+)([\.:])\s*(\S+)/ ) {
689
my ( $db, $version );
694
# Genbank 192 release notes say this: "The second field can consist of
695
# multiple comma-separated identifiers, if a sequence record has
696
# multiple DBLINK cross-references of a given type."
697
# For example: DBLINK Project:100,200,300"
698
@ids = split( /,/, $3 );
701
( $db, $version ) = ( 'GenBank', $3 );
705
foreach my $id (@ids) {
706
$annotation->add_Annotation(
708
Bio::Annotation::DBLink->new(
710
-version => $version,
719
"Unrecognized DBSOURCE data: $dbsource\n");
726
while ( defined( $buffer = $self->_readline() ) ) {
727
last if substr( $buffer, 0, 1 ) ne ' ';
733
# Exit at start of Feature table, or start of sequence
734
last if (/^(FEATURES|ORIGIN)/);
736
# Get next line and loop again
737
$buffer = $self->_readline;
739
return unless defined $buffer;
741
# add them all at once for efficiency
742
$builder->add_slot_value(
743
-accession_number => shift(@acc),
744
-secondary_accessions => \@acc,
747
$builder->add_slot_value( -annotation => $annotation ) if $annotation;
748
%params = (); # reset before possible re-use to avoid setting twice
750
# start over if we don't want to continue with this entry
751
if ( !$builder->want_object() ) {
752
$builder->make_object();
756
# some "minimal" formats may not necessarily have a feature table
757
if ( $builder->want_slot('features') && defined($_) && /^FEATURES/o ) {
759
# need to read the first line of the feature table
760
$buffer = $self->_readline;
762
# DO NOT read lines in the while condition -- this is done as a side
763
# effect in _read_FTHelper_GenBank!
765
# part of new circular spec:
766
# commented out for now until kinks worked out
768
#$sourceEnd = $2 if ($buffer =~ /(\d+?)\.\.(\d+?)$/);
770
while ( defined($buffer) ) {
772
# check immediately -- not at the end of the loop
773
# note: GenPept entries obviously do not have a BASE line
774
last if ( $buffer =~ /^BASE|ORIGIN|CONTIG|WGS/o );
776
# slurp in one feature at a time -- at return, the start of
777
# the next feature will have been read already, so we need
778
# to pass a reference, and the called method must set this
779
# to the last line read before returning
781
my $ftunit = $self->_read_FTHelper_GenBank( \$buffer );
783
# implement new circular spec: features that cross the origin are now
784
# seamless instead of being 2 separate joined features
785
# commented out until kinks get worked out
786
#if ((! $args{'-nojoin'}) && $ftunit->{'loc'} =~ /^join\((\d+?)\.\.(\d+?),(\d+?)..(\d+?)\)$/
787
#&& $sourceEnd == $2 && $3 == 1) {
790
#$ftunit->{'loc'} = "$start..$end";
793
# fix suggested by James Diggans
795
if ( !defined $ftunit ) {
797
# GRRRR. We have fallen over. Try to recover
798
$self->warn( "Unexpected error in feature table for "
799
. $params{'-display_id'}
800
. " Skipping feature, attempting to recover" );
801
unless ( ( $buffer =~ /^\s{5,5}\S+/o )
802
or ( $buffer =~ /^\S+/o ) )
804
$buffer = $self->_readline();
806
next; # back to reading FTHelpers
811
$ftunit->_generic_seqfeature( $self->location_factory(),
814
# add taxon_id from source if available
817
&& ( $feat->primary_tag eq 'source' )
818
&& $feat->has_tag('db_xref')
820
!$species->ncbi_taxid()
821
|| ( $species->ncbi_taxid
822
&& $species->ncbi_taxid =~ /^list/ )
826
foreach my $tagval ( $feat->get_tag_values('db_xref') ) {
827
if ( index( $tagval, "taxon:" ) == 0 ) {
828
$species->ncbi_taxid( substr( $tagval, 6 ) );
834
# add feature to list of features
835
push( @features, $feat );
837
$builder->add_slot_value( -features => \@features );
842
# CONTIG lines: TODO, this needs to be cleaned up
843
if (/^CONTIG\s+(.*)/o) {
845
while ( defined( $_ = $self->_readline)) {
846
last if m{^ORIGIN|//}o;
851
$annotation->add_Annotation(
852
Bio::Annotation::SimpleValue->new(
853
-tagname => 'contig',
859
elsif (/^WGS|WGS_SCAFLD\s+/o) { # catch WGS/WGS_SCAFLD lines
860
while ( $_ =~ s/(^WGS|WGS_SCAFLD)\s+// ) { # gulp lines
862
$annotation->add_Annotation(
863
Bio::Annotation::SimpleValue->new(
868
$_ = $self->_readline;
871
elsif ( !m{^ORIGIN|//}o ) { # advance to the sequence, if any
872
while ( defined( $_ = $self->_readline ) ) {
873
last if m{^(ORIGIN|//)};
877
if ( !$builder->want_object() ) {
878
$builder->make_object(); # implicit end-of-object
881
if ( $builder->want_slot('seq') ) {
882
# the fact that we want a sequence does not necessarily mean that
883
# there also is a sequence ...
884
if ( defined($_) && s/^ORIGIN\s+// ) {
885
if ( $annotation && length($_) > 0 ) {
886
$annotation->add_Annotation(
888
Bio::Annotation::SimpleValue->new(
889
-tagname => 'origin',
895
while ( defined( $_ = $self->_readline ) ) {
902
$builder->add_slot_value( -seq => $seqc );
905
elsif ( defined($_) && ( substr( $_, 0, 2 ) ne '//' ) ) {
907
# advance to the end of the record
908
while ( defined( $_ = $self->_readline ) ) {
909
last if substr( $_, 0, 2 ) eq '//';
913
# Unlikely, but maybe the sequence is so weird that we don't want it
914
# anymore. We don't want to return undef if the stream's not exhausted
916
$seq = $builder->make_object();
917
next RECORDSTART unless $seq;
919
} # end while RECORDSTART