~ubuntu-branches/ubuntu/trusty/bioperl/trusty

« back to all changes in this revision

Viewing changes to Bio/SeqIO/genbank.pm

  • Committer: Package Import Robot
  • Author(s): Charles Plessy
  • Date: 2014-01-18 11:41:11 UTC
  • mfrom: (3.1.12 sid)
  • Revision ID: package-import@ubuntu.com-20140118114111-zcjaq5edb49dhlat
Tags: 1.6.923-1
* New upstream release.
* Does not need non-free libmath-random-perl anymore.
* Build-depend on libmodule-build-perl (>= 0.420000).  Despite Lintian's
  warning that it is useless, the package does not build without.
* Conforms to Policy version 3.9.5.

Show diffs side-by-side

added added

removed removed

Lines of Context:
256
256
=cut
257
257
 
258
258
sub next_seq {
259
 
    my ($self,@args) = @_;
260
 
    my %args = @args;
 
259
    my ( $self, @args ) = @_;
 
260
    my %args    = @args;
261
261
    my $builder = $self->sequence_builder();
262
262
    my $seq;
263
263
    my %params;
264
264
 
265
265
  RECORDSTART:
266
266
    while (1) {
267
 
        my $buffer;
268
 
        my (@acc, @features);
269
 
        my ($display_id, $annotation);
270
 
        my $species;
271
 
 
272
 
        # initialize; we may come here because of starting over
273
 
        @features = ();
274
 
        $annotation = undef;
275
 
        @acc = ();
276
 
        $species = undef;
277
 
        %params = (-verbose => $self->verbose); # reset hash
278
 
        local($/) = "\n";
279
 
        while(defined($buffer = $self->_readline())) {
280
 
            last if index($buffer,'LOCUS       ') == 0;
281
 
        }
282
 
        return unless defined $buffer; # end of file
283
 
        $buffer =~ /^LOCUS\s+(\S.*)$/o ||
284
 
            $self->throw("GenBank stream with bad LOCUS line. Not GenBank in my book. Got '$buffer'");
285
 
 
286
 
        my @tokens = split(' ', $1);
287
 
    
288
 
        # this is important to have the id for display in e.g. FTHelper,
289
 
        # otherwise you won't know which entry caused an error
290
 
        $display_id = shift(@tokens);
291
 
        $params{'-display_id'} = $display_id;
292
 
        # may still be useful if we don't want the seq
293
 
    my $seqlength = shift(@tokens);
294
 
    if (exists $VALID_ALPHABET{$seqlength}) {
295
 
        # moved one token too far.  No locus name?
296
 
        $self->warn("Bad LOCUS name?  Changing [$params{'-display_id'}] to 'unknown' and length to $display_id");
297
 
        $params{'-display_id'} = 'unknown';
298
 
        $params{'-length'} = $display_id;
299
 
        # add token back...
300
 
        unshift @tokens, $seqlength;
301
 
    } else {
302
 
        $params{'-length'} = $seqlength;
303
 
    }
304
 
        # the alphabet of the entry
305
 
    # shouldn't assign alphabet unless one is specifically designated (such as for rc files)
306
 
    my $alphabet = lc(shift @tokens);
307
 
        $params{'-alphabet'} = (exists $VALID_ALPHABET{$alphabet}) ? $VALID_ALPHABET{$alphabet} :
308
 
                           $self->warn("Unknown alphabet: $alphabet");
309
 
        # for aa there is usually no 'molecule' (mRNA etc)
310
 
    if ($params{'-alphabet'} eq 'protein') {
311
 
            $params{'-molecule'} = 'PRT'
312
 
    } else {
313
 
        $params{'-molecule'} = shift(@tokens);
314
 
    }
315
 
    # take care of lower case issues
316
 
    if ($params{'-molecule'} eq 'dna' || $params{'-molecule'} eq 'rna') {
317
 
        $params{'-molecule'} = uc $params{'-molecule'};
318
 
    }
319
 
    $self->debug("Unrecognized molecule type:".$params{'-molecule'}) if
320
 
        !exists($VALID_MOLTYPE{$params{'-molecule'}});
321
 
        my $circ = shift(@tokens);
322
 
    if ($circ eq 'circular') {
323
 
        $params{'-is_circular'} = 1;
324
 
        $params{'-division'} = shift(@tokens);
325
 
    } else {
326
 
        # 'linear' or 'circular' may actually be omitted altogether
327
 
        $params{'-division'} =
328
 
            (CORE::length($circ) == 3 ) ? $circ : shift(@tokens);
329
 
    }
330
 
        my $date = join(' ', @tokens); # we lump together the rest
331
 
 
332
 
        # this is per request bug #1513
333
 
        # we can handle
334
 
        # 9-10-2003
335
 
        # 9-10-03
336
 
        # 09-10-2003
337
 
        # 09-10-03
338
 
        if($date =~ s/\s*((\d{1,2})-(\w{3})-(\d{2,4})).*/$1/) {
339
 
            if( length($date) < 11 ) {
340
 
            # improperly formatted date
341
 
            # But we'll be nice and fix it for them
342
 
            my ($d,$m,$y) = ($2,$3,$4);
343
 
            if( length($d) == 1 ) {
344
 
                $d = "0$d";
345
 
            }
346
 
            # guess the century here
347
 
            if( length($y) == 2 ) {
348
 
                if( $y > 60 ) { # arbitrarily guess that '60' means 1960
349
 
                    $y = "19$y";
350
 
                } else {
351
 
                    $y = "20$y";
352
 
                }
353
 
                $self->warn("Date was malformed, guessing the century for $date to be $y\n");
354
 
            }
355
 
            $params{'-dates'} = [join('-',$d,$m,$y)];
356
 
        } else {
357
 
            $params{'-dates'} = [$date];
358
 
            }
359
 
        }
360
 
        # set them all at once
361
 
        $builder->add_slot_value(%params);
362
 
        %params = ();
363
 
 
364
 
        # parse the rest if desired, otherwise start over
365
 
        if(! $builder->want_object()) {
366
 
            $builder->make_object();
367
 
            next RECORDSTART;
368
 
        }
369
 
 
370
 
        # set up annotation depending on what the builder wants
371
 
        if($builder->want_slot('annotation')) {
372
 
            $annotation = Bio::Annotation::Collection->new();
373
 
        }
374
 
        $buffer = $self->_readline();
375
 
        until( !defined ($buffer) ) {
376
 
            $_ = $buffer;
377
 
            # Description line(s)
378
 
            if (/^DEFINITION\s+(\S.*\S)/) {
379
 
                my @desc = ($1);
380
 
                while ( defined($_ = $self->_readline) ) {
381
 
                    if( /^\s+(.*)/ ) { push (@desc, $1); next };
382
 
                    last;
383
 
                }
384
 
                $builder->add_slot_value(-desc => join(' ', @desc));
385
 
                                # we'll continue right here because DEFINITION always comes
386
 
                                # at the top of the entry
387
 
                $buffer= $_;
388
 
            }
389
 
            # accession number (there can be multiple accessions)
390
 
            if( /^ACCESSION\s+(\S.*\S)/ ) {
391
 
                push(@acc, split(/\s+/,$1));
392
 
                while( defined($_ = $self->_readline) ) {
393
 
                    /^\s+(.*)/ && do { push (@acc, split(/\s+/,$1)); next };
394
 
                    last;
395
 
                }
396
 
                $buffer = $_;
397
 
                next;
398
 
            }
399
 
            # PID
400
 
            elsif( /^PID\s+(\S+)/ ) {
401
 
                $params{'-pid'} = $1;
402
 
            }
403
 
            # Version number
404
 
            elsif( /^VERSION\s+(\S.+)$/ ) {
405
 
                my ($acc,$gi) = split(' ',$1);
406
 
                if($acc =~ /^\w+\.(\d+)/) {
407
 
                    $params{'-version'} = $1;
408
 
                    $params{'-seq_version'} = $1;
409
 
                }
410
 
                if($gi && (index($gi,"GI:") == 0)) {
411
 
                    $params{'-primary_id'} = substr($gi,3);
412
 
                }
413
 
            }
414
 
            # Keywords
415
 
            elsif( /^KEYWORDS\s+(\S.*)/ ) {
416
 
                my @kw = split(/\s*\;\s*/,$1);
417
 
                while( defined($_ = $self->_readline) ) {
418
 
                    chomp;
419
 
                    /^\s+(.*)/ && do { push (@kw, split(/\s*\;\s*/,$1)); next };
420
 
                    last;
421
 
                }
422
 
 
423
 
                @kw && $kw[-1] =~ s/\.$//;
424
 
                $params{'-keywords'} = \@kw;
425
 
                $buffer = $_;
426
 
                next;
427
 
            }
428
 
            # Organism name and phylogenetic information
429
 
            elsif (/^SOURCE\s+\S/) {
430
 
                if($builder->want_slot('species')) {
431
 
                    $species = $self->_read_GenBank_Species(\$buffer);
432
 
                    $builder->add_slot_value(-species => $species);
433
 
                } else {
434
 
                    while(defined($buffer = $self->_readline())) {
435
 
                        last if substr($buffer,0,1) ne ' ';
436
 
                    }
437
 
                }
438
 
                next;
439
 
            }
440
 
            # References
441
 
            elsif (/^REFERENCE\s+\S/) {
442
 
                if($annotation) {
443
 
                    my @refs = $self->_read_GenBank_References(\$buffer);
444
 
                    foreach my $ref ( @refs ) {
445
 
                        $annotation->add_Annotation('reference',$ref);
446
 
                    }
447
 
                } else {
448
 
                    while(defined($buffer = $self->_readline())) {
449
 
                        last if substr($buffer,0,1) ne ' ';
450
 
                    }
451
 
                }
452
 
                next;
453
 
            }
454
 
                # Project
455
 
                elsif (/^PROJECT\s+(\S.*)/) {
456
 
                        if ($annotation) {
457
 
                                my $project = Bio::Annotation::SimpleValue->new(-value => $1);
458
 
                                $annotation->add_Annotation('project',$project);
459
 
                        }
460
 
                }
461
 
            # Comments
462
 
            elsif (/^COMMENT\s+(\S.*)/) {
463
 
                if($annotation) {
464
 
                    my $comment = $1;
465
 
                    while (defined($_ = $self->_readline)) {
466
 
                        last if (/^\S/);
467
 
                        $comment .= $_;
468
 
                    }
469
 
                    $comment =~ s/\n/ /g;
470
 
                    $comment =~ s/  +/ /g;
471
 
                    $annotation->add_Annotation('comment',
472
 
                                                Bio::Annotation::Comment->new(-text => $comment,
473
 
                                                                              -tagname => 'comment'));
474
 
                    $buffer = $_;
475
 
                } else {
476
 
                    while(defined($buffer = $self->_readline())) {
477
 
                        last if substr($buffer,0,1) ne ' ';
478
 
                    }
479
 
                }
480
 
                next;
481
 
            }
482
 
            # Corresponding Genbank nucleotide id, Genpept only
483
 
            elsif( /^DB(?:SOURCE|LINK)\s+(\S.+)/ ) {
484
 
                if ($annotation) {
485
 
                    my $dbsource = $1;
486
 
                    while (defined($_ = $self->_readline)) {
487
 
                        last if (/^\S/);
488
 
                        $dbsource .= $_;
489
 
                    }
490
 
                                # deal with UniProKB dbsources
491
 
                    if( $dbsource =~ s/(UniProt(?:KB)?|swissprot):\s+locus\s+(\S+)\,.+\n// ) {
492
 
                        $annotation->add_Annotation
493
 
                            ('dblink',
494
 
                             Bio::Annotation::DBLink->new
495
 
                             (-primary_id => $2,
496
 
                              -database => $1,
497
 
                              -tagname => 'dblink'));
498
 
                        if( $dbsource =~ s/\s+created:\s+([^\.]+)\.\n// ) {
499
 
                            $annotation->add_Annotation
500
 
                                ('swissprot_dates',
501
 
                                 Bio::Annotation::SimpleValue->new
502
 
                                 (-tagname => 'date_created',
503
 
                                  -value => $1));
504
 
                        }
505
 
                        while( $dbsource =~ s/\s+(sequence|annotation)\s+updated:\s+([^\.]+)\.\n//g ) {
506
 
                            $annotation->add_Annotation
507
 
                                ('swissprot_dates',
508
 
                                 Bio::Annotation::SimpleValue->new
509
 
                                 (-tagname => 'date_updated',
510
 
                                  -value => $2));
511
 
                        }
512
 
                        $dbsource =~ s/\n/ /g;
513
 
                        if( $dbsource =~ s/\s+xrefs:\s+((?:\S+,\s+)+\S+)\s+xrefs/xrefs/ ) {
514
 
                            # will use $i to determine even or odd
515
 
                            # for swissprot the accessions are paired
516
 
                            my $i = 0;
517
 
                            for my $dbsrc ( split(/,\s+/,$1) ) {
518
 
                                if( $dbsrc =~ /(\S+)\.(\d+)/ ||
519
 
                                    $dbsrc =~ /(\S+)/ ) {
520
 
                                    my ($id,$version) = ($1,$2);
521
 
                                    $version ='' unless defined $version;
522
 
                                    my $db;
523
 
                                    if( $id =~ /^\d\S{3}/) {
524
 
                                        $db = 'PDB';
525
 
                                    } else {
526
 
                                        $db = ($i++ % 2 ) ? 'GenPept' : 'GenBank';
527
 
                                    }
528
 
                                    $annotation->add_Annotation
529
 
                                        ('dblink',
530
 
                                         Bio::Annotation::DBLink->new
531
 
                                         (-primary_id => $id,
532
 
                                          -version => $version,
533
 
                                          -database => $db,
534
 
                                          -tagname => 'dblink'));
535
 
                                }
536
 
                            }
537
 
                        } elsif( $dbsource =~ s/\s+xrefs:\s+(.+)\s+xrefs/xrefs/i ) {
538
 
                            # download screwed up and ncbi didn't put acc in for gi numbers
539
 
                            my $i = 0;
540
 
                            for my $id ( split(/\,\s+/,$1) ) {
541
 
                                my ($acc,$db);
542
 
                                if( $id =~ /gi:\s+(\d+)/ ) {
543
 
                                    $acc= $1;
544
 
                                    $db = ($i++ % 2 ) ? 'GenPept' : 'GenBank';
545
 
                                } elsif( $id =~ /pdb\s+accession\s+(\S+)/ ) {
546
 
                                    $acc= $1;
547
 
                                    $db = 'PDB';
548
 
                                } else {
549
 
                                    $acc= $id;
550
 
                                    $db = '';
551
 
                                }
552
 
                                $annotation->add_Annotation
553
 
                                    ('dblink',
554
 
                                     Bio::Annotation::DBLink->new
555
 
                                     (-primary_id => $acc,
556
 
                                      -database => $db,
557
 
                                      -tagname => 'dblink'));
558
 
                            }
559
 
                        } else {
560
 
                            $self->debug("Cannot match $dbsource\n");
561
 
                        }
562
 
                        if( $dbsource =~ s/xrefs\s+\(non\-sequence\s+databases\):\s+
563
 
                            ((?:\S+,\s+)+\S+)//x ) {
564
 
                            for my $id ( split(/\,\s+/,$1) ) {
565
 
                                my $db;
566
 
                                # this is because GenBank dropped the spaces!!!
567
 
                                # I'm sure we're not going to get this right
568
 
                                ##if( $id =~ s/^://i ) {
569
 
                                ##    $db = $1;
570
 
                                ##}
571
 
                                $db = substr($id,0,index($id,':'));
572
 
                                if (! exists $DBSOURCE{ $db }) {
573
 
                                      $db = '';   # do we want 'GenBank' here?
574
 
                                }
575
 
                                $id = substr($id,index($id,':')+1);
576
 
                                $annotation->add_Annotation
577
 
                                    ('dblink',
578
 
                                     Bio::Annotation::DBLink->new
579
 
                                     (-primary_id => $id,
580
 
                                      -database => $db,
581
 
                                      -tagname => 'dblink'));
582
 
                            }
583
 
                        }
584
 
 
585
 
                    } else {
586
 
                if( $dbsource =~ /^(\S*?):?\s*accession\s+(\S+)\.(\d+)/ ) {
587
 
                    my ($db,$id,$version) = ($1,$2,$3);
588
 
                    $annotation->add_Annotation
589
 
                    ('dblink',
590
 
                     Bio::Annotation::DBLink->new
591
 
                     (-primary_id => $id,
592
 
                      -version => $version,
593
 
                      -database => $db || 'GenBank',
594
 
                      -tagname => 'dblink'));
595
 
                } elsif ( $dbsource =~ /^(\S*?):?\s*accession\s+(\S+)/ ) {
596
 
                    my ($db,$id) = ($1,$2);
597
 
                    $annotation->add_Annotation
598
 
                        ('dblink',
599
 
                         Bio::Annotation::DBLink->new
600
 
                         (-primary_id => $id,
601
 
                          -database => $db || 'GenBank',
602
 
                          -tagname => 'dblink'));
603
 
                } elsif ( $dbsource =~ /(\S+)([\.:])\s*(\S+)/ ) {
604
 
                    my ($db, $version);
605
 
                    my @ids = ();
606
 
                    if ($2 eq ':') {
607
 
                        $db = $1;
608
 
                        # Genbank 192 release notes say this: "The second field can consist of
609
 
                        #     multiple comma-separated identifiers, if a sequence record has
610
 
                        #     multiple DBLINK cross-references of a given type."
611
 
                        #     For example: DBLINK      Project:100,200,300"
612
 
                        @ids = split (/,/, $3);
613
 
                    } else {
614
 
                        ($db, $version) = ('GenBank', $3);
615
 
                        $ids[0] = $1;
616
 
                    }
617
 
                    
618
 
                    foreach my $id (@ids) {
619
 
                        $annotation->add_Annotation('dblink',
 
267
        my $buffer;
 
268
        my ( @acc,        @features );
 
269
        my ( $display_id, $annotation );
 
270
        my $species;
 
271
 
 
272
        # initialize; we may come here because of starting over
 
273
        @features   = ();
 
274
        $annotation = undef;
 
275
        @acc        = ();
 
276
        $species    = undef;
 
277
        %params     = ( -verbose => $self->verbose );    # reset hash
 
278
        local ($/) = "\n";
 
279
        while ( defined( $buffer = $self->_readline() ) ) {
 
280
            last if index( $buffer, 'LOCUS       ' ) == 0;
 
281
        }
 
282
        return unless defined $buffer;                   # end of file
 
283
        $buffer =~ /^LOCUS\s+(\S.*)$/o
 
284
          || $self->throw(
 
285
"GenBank stream with bad LOCUS line. Not GenBank in my book. Got '$buffer'"
 
286
          );
 
287
 
 
288
        my @tokens = split( ' ', $1 );
 
289
 
 
290
        # this is important to have the id for display in e.g. FTHelper,
 
291
        # otherwise you won't know which entry caused an error
 
292
        $display_id = shift(@tokens);
 
293
        $params{'-display_id'} = $display_id;
 
294
 
 
295
        # may still be useful if we don't want the seq
 
296
        my $seqlength = shift(@tokens);
 
297
        if ( exists $VALID_ALPHABET{$seqlength} ) {
 
298
 
 
299
            # moved one token too far.  No locus name?
 
300
            $self->warn(
 
301
"Bad LOCUS name?  Changing [$params{'-display_id'}] to 'unknown' and length to $display_id"
 
302
            );
 
303
            $params{'-display_id'} = 'unknown';
 
304
            $params{'-length'}     = $display_id;
 
305
 
 
306
            # add token back...
 
307
            unshift @tokens, $seqlength;
 
308
        }
 
309
        else {
 
310
            $params{'-length'} = $seqlength;
 
311
        }
 
312
 
 
313
# the alphabet of the entry
 
314
# shouldn't assign alphabet unless one is specifically designated (such as for rc files)
 
315
        my $alphabet = lc( shift @tokens );
 
316
        $params{'-alphabet'} =
 
317
          ( exists $VALID_ALPHABET{$alphabet} )
 
318
          ? $VALID_ALPHABET{$alphabet}
 
319
          : $self->warn("Unknown alphabet: $alphabet");
 
320
 
 
321
        # for aa there is usually no 'molecule' (mRNA etc)
 
322
        if ( $params{'-alphabet'} eq 'protein' ) {
 
323
            $params{'-molecule'} = 'PRT';
 
324
        }
 
325
        else {
 
326
            $params{'-molecule'} = shift(@tokens);
 
327
        }
 
328
 
 
329
        # take care of lower case issues
 
330
        if ( $params{'-molecule'} eq 'dna' || $params{'-molecule'} eq 'rna' ) {
 
331
            $params{'-molecule'} = uc $params{'-molecule'};
 
332
        }
 
333
        $self->debug( "Unrecognized molecule type:" . $params{'-molecule'} )
 
334
          if !exists( $VALID_MOLTYPE{ $params{'-molecule'} } );
 
335
        my $circ = shift(@tokens);
 
336
        if ( $circ eq 'circular' ) {
 
337
            $params{'-is_circular'} = 1;
 
338
            $params{'-division'}    = shift(@tokens);
 
339
        }
 
340
        else {
 
341
            # 'linear' or 'circular' may actually be omitted altogether
 
342
            $params{'-division'} =
 
343
              ( CORE::length($circ) == 3 ) ? $circ : shift(@tokens);
 
344
        }
 
345
        my $date = join( ' ', @tokens );    # we lump together the rest
 
346
 
 
347
        # this is per request bug #1513
 
348
        # we can handle
 
349
        # 9-10-2003
 
350
        # 9-10-03
 
351
        # 09-10-2003
 
352
        # 09-10-03
 
353
        if ( $date =~ s/\s*((\d{1,2})-(\w{3})-(\d{2,4})).*/$1/ ) {
 
354
            if ( length($date) < 11 ) {
 
355
 
 
356
                # improperly formatted date
 
357
                # But we'll be nice and fix it for them
 
358
                my ( $d, $m, $y ) = ( $2, $3, $4 );
 
359
                if ( length($d) == 1 ) {
 
360
                    $d = "0$d";
 
361
                }
 
362
 
 
363
                # guess the century here
 
364
                if ( length($y) == 2 ) {
 
365
                    if ( $y > 60 ) {    # arbitrarily guess that '60' means 1960
 
366
                        $y = "19$y";
 
367
                    }
 
368
                    else {
 
369
                        $y = "20$y";
 
370
                    }
 
371
                    $self->warn(
 
372
"Date was malformed, guessing the century for $date to be $y\n"
 
373
                    );
 
374
                }
 
375
                $params{'-dates'} = [ join( '-', $d, $m, $y ) ];
 
376
            }
 
377
            else {
 
378
                $params{'-dates'} = [$date];
 
379
            }
 
380
        }
 
381
 
 
382
        # set them all at once
 
383
        $builder->add_slot_value(%params);
 
384
        %params = ();
 
385
 
 
386
        # parse the rest if desired, otherwise start over
 
387
        if ( !$builder->want_object() ) {
 
388
            $builder->make_object();
 
389
            next RECORDSTART;
 
390
        }
 
391
 
 
392
        # set up annotation depending on what the builder wants
 
393
        if ( $builder->want_slot('annotation') ) {
 
394
            $annotation = Bio::Annotation::Collection->new();
 
395
        }
 
396
        $buffer = $self->_readline();
 
397
        until ( !defined($buffer) ) {
 
398
            $_ = $buffer;
 
399
 
 
400
            # Description line(s)
 
401
            if (/^DEFINITION\s+(\S.*\S)/) {
 
402
                my @desc = ($1);
 
403
                while ( defined( $_ = $self->_readline ) ) {
 
404
                    if (/^\s+(.*)/) { push( @desc, $1 ); next }
 
405
                    last;
 
406
                }
 
407
                $builder->add_slot_value( -desc => join( ' ', @desc ) );
 
408
 
 
409
                # we'll continue right here because DEFINITION always comes
 
410
                # at the top of the entry
 
411
                $buffer = $_;
 
412
            }
 
413
 
 
414
            # accession number (there can be multiple accessions)
 
415
            if (/^ACCESSION\s+(\S.*\S)/) {
 
416
                push( @acc, split( /\s+/, $1 ) );
 
417
                while ( defined( $_ = $self->_readline ) ) {
 
418
                    /^\s+(.*)/ && do { push( @acc, split( /\s+/, $1 ) ); next };
 
419
                    last;
 
420
                }
 
421
                $buffer = $_;
 
422
                next;
 
423
            }
 
424
 
 
425
            # PID
 
426
            elsif (/^PID\s+(\S+)/) {
 
427
                $params{'-pid'} = $1;
 
428
            }
 
429
 
 
430
            # Version number
 
431
            elsif (/^VERSION\s+(\S.+)$/) {
 
432
                my ( $acc, $gi ) = split( ' ', $1 );
 
433
                if ( $acc =~ /^\w+\.(\d+)/ ) {
 
434
                    $params{'-version'}     = $1;
 
435
                    $params{'-seq_version'} = $1;
 
436
                }
 
437
                if ( $gi && ( index( $gi, "GI:" ) == 0 ) ) {
 
438
                    $params{'-primary_id'} = substr( $gi, 3 );
 
439
                }
 
440
            }
 
441
 
 
442
            # Keywords
 
443
            elsif (/^KEYWORDS\s+(\S.*)/) {
 
444
                my @kw = split( /\s*\;\s*/, $1 );
 
445
                while ( defined( $_ = $self->_readline ) ) {
 
446
                    chomp;
 
447
                    /^\s+(.*)/
 
448
                      && do { push( @kw, split( /\s*\;\s*/, $1 ) ); next };
 
449
                    last;
 
450
                }
 
451
 
 
452
                @kw && $kw[-1] =~ s/\.$//;
 
453
                $params{'-keywords'} = \@kw;
 
454
                $buffer = $_;
 
455
                next;
 
456
            }
 
457
 
 
458
            # Organism name and phylogenetic information
 
459
            elsif (/^SOURCE\s+\S/) {
 
460
                if ( $builder->want_slot('species') ) {
 
461
                    $species = $self->_read_GenBank_Species( \$buffer );
 
462
                    $builder->add_slot_value( -species => $species );
 
463
                }
 
464
                else {
 
465
                    while ( defined( $buffer = $self->_readline() ) ) {
 
466
                        last if substr( $buffer, 0, 1 ) ne ' ';
 
467
                    }
 
468
                }
 
469
                next;
 
470
            }
 
471
 
 
472
            # References
 
473
            elsif (/^REFERENCE\s+\S/) {
 
474
                if ($annotation) {
 
475
                    my @refs = $self->_read_GenBank_References( \$buffer );
 
476
                    foreach my $ref (@refs) {
 
477
                        $annotation->add_Annotation( 'reference', $ref );
 
478
                    }
 
479
                }
 
480
                else {
 
481
                    while ( defined( $buffer = $self->_readline() ) ) {
 
482
                        last if substr( $buffer, 0, 1 ) ne ' ';
 
483
                    }
 
484
                }
 
485
                next;
 
486
            }
 
487
 
 
488
            # Project
 
489
            elsif (/^PROJECT\s+(\S.*)/) {
 
490
                if ($annotation) {
 
491
                    my $project =
 
492
                      Bio::Annotation::SimpleValue->new( -value => $1 );
 
493
                    $annotation->add_Annotation( 'project', $project );
 
494
                }
 
495
            }
 
496
 
 
497
            # Comments
 
498
            elsif (/^COMMENT\s+(\S.*)/) {
 
499
                if ($annotation) {
 
500
                    my $comment = $1;
 
501
                    while ( defined( $_ = $self->_readline ) ) {
 
502
                        last if (/^\S/);
 
503
                        $comment .= $_;
 
504
                    }
 
505
                    $comment =~ s/\n/ /g;
 
506
                    $comment =~ s/  +/ /g;
 
507
                    $annotation->add_Annotation(
 
508
                        'comment',
 
509
                        Bio::Annotation::Comment->new(
 
510
                            -text    => $comment,
 
511
                            -tagname => 'comment'
 
512
                        )
 
513
                    );
 
514
                    $buffer = $_;
 
515
                }
 
516
                else {
 
517
                    while ( defined( $buffer = $self->_readline() ) ) {
 
518
                        last if substr( $buffer, 0, 1 ) ne ' ';
 
519
                    }
 
520
                }
 
521
                next;
 
522
            }
 
523
 
 
524
            # Corresponding Genbank nucleotide id, Genpept only
 
525
            elsif (/^DB(?:SOURCE|LINK)\s+(\S.+)/) {
 
526
                if ($annotation) {
 
527
                    my $dbsource = $1;
 
528
                    while ( defined( $_ = $self->_readline ) ) {
 
529
                        last if (/^\S/);
 
530
                        $dbsource .= $_;
 
531
                    }
 
532
 
 
533
                    # deal with UniProKB dbsources
 
534
                    if ( $dbsource =~
 
535
                        s/(UniProt(?:KB)?|swissprot):\s+locus\s+(\S+)\,.+\n// )
 
536
                    {
 
537
                        $annotation->add_Annotation(
 
538
                            'dblink',
620
539
                            Bio::Annotation::DBLink->new(
621
 
                               -primary_id => $id,
622
 
                               -version => $version,
623
 
                               -database => $db,
624
 
                               -tagname => 'dblink')
 
540
                                -primary_id => $2,
 
541
                                -database   => $1,
 
542
                                -tagname    => 'dblink'
 
543
                            )
625
544
                        );
626
 
                    }
627
 
                } else {
628
 
                    $self->warn("Unrecognized DBSOURCE data: $dbsource\n");
629
 
                }
630
 
                    }
631
 
 
632
 
                    $buffer = $_;
633
 
                } else {
634
 
                    while(defined($buffer = $self->_readline())) {
635
 
                        last if substr($buffer,0,1) ne ' ';
636
 
                    }
637
 
                }
638
 
                next;
639
 
            }
640
 
            # Exit at start of Feature table, or start of sequence
641
 
            last if( /^(FEATURES|ORIGIN)/ );
642
 
            # Get next line and loop again
643
 
            $buffer = $self->_readline;
644
 
        }
645
 
        return unless defined $buffer;
646
 
 
647
 
        # add them all at once for efficiency
648
 
        $builder->add_slot_value(-accession_number => shift(@acc),
649
 
                                 -secondary_accessions => \@acc,
650
 
                                 %params);
651
 
        $builder->add_slot_value(-annotation => $annotation) if $annotation;
652
 
        %params = (); # reset before possible re-use to avoid setting twice
653
 
 
654
 
        # start over if we don't want to continue with this entry
655
 
        if(! $builder->want_object()) {
656
 
            $builder->make_object();
657
 
            next RECORDSTART;
658
 
        }
659
 
        # some "minimal" formats may not necessarily have a feature table
660
 
        if($builder->want_slot('features') && defined($_) && /^FEATURES/o) {
661
 
            # need to read the first line of the feature table
662
 
            $buffer = $self->_readline;
663
 
            # DO NOT read lines in the while condition -- this is done as a side
664
 
            # effect in _read_FTHelper_GenBank!
665
 
 
666
 
#           part of new circular spec: 
667
 
#           commented out for now until kinks worked out
668
 
            #my $sourceEnd = 0;
669
 
            #$sourceEnd = $2 if ($buffer =~ /(\d+?)\.\.(\d+?)$/);
670
 
 
671
 
            while( defined($buffer) ) {
672
 
                                # check immediately -- not at the end of the loop
673
 
                                # note: GenPept entries obviously do not have a BASE line
674
 
                last if( $buffer =~ /^BASE|ORIGIN|CONTIG|WGS/o);
675
 
 
676
 
                                # slurp in one feature at a time -- at return, the start of
677
 
                                # the next feature will have been read already, so we need
678
 
                                # to pass a reference, and the called method must set this
679
 
                                # to the last line read before returning
680
 
 
681
 
                my $ftunit = $self->_read_FTHelper_GenBank(\$buffer);
682
 
 
683
 
#               implement new circular spec: features that cross the origin are now
684
 
#               seamless instead of being 2 separate joined features
685
 
#               commented out until kinks get worked out
686
 
                #if ((! $args{'-nojoin'}) && $ftunit->{'loc'} =~ /^join\((\d+?)\.\.(\d+?),(\d+?)..(\d+?)\)$/ 
687
 
                #&& $sourceEnd == $2 && $3 == 1) {
688
 
                        #my $start = $1;
689
 
                        #my $end = $2 + $4;
690
 
                        #$ftunit->{'loc'} = "$start..$end";
691
 
                #}
692
 
 
693
 
                                # fix suggested by James Diggans
694
 
 
695
 
                if( !defined $ftunit ) {
696
 
                    # GRRRR. We have fallen over. Try to recover
697
 
                    $self->warn("Unexpected error in feature table for ".$params{'-display_id'}." Skipping feature, attempting to recover");
698
 
                    unless( ($buffer =~ /^\s{5,5}\S+/o) or
699
 
                            ($buffer =~ /^\S+/o)) {
700
 
                        $buffer = $self->_readline();
701
 
                    }
702
 
                    next;       # back to reading FTHelpers
703
 
                }
704
 
 
705
 
                                # process ftunit
706
 
                my $feat =
707
 
                    $ftunit->_generic_seqfeature($self->location_factory(),
708
 
                                                 $display_id);
709
 
                                # add taxon_id from source if available
710
 
                if($species && ($feat->primary_tag eq 'source') &&
711
 
                   $feat->has_tag('db_xref') && (! $species->ncbi_taxid() ||
712
 
                                                 ($species->ncbi_taxid && $species->ncbi_taxid =~ /^list/))) {
713
 
                    foreach my $tagval ($feat->get_tag_values('db_xref')) {
714
 
                        if(index($tagval,"taxon:") == 0) {
715
 
                            $species->ncbi_taxid(substr($tagval,6));
716
 
                            last;
717
 
                        }
718
 
                    }
719
 
                }
720
 
                                # add feature to list of features
721
 
                push(@features, $feat);
722
 
            }
723
 
            $builder->add_slot_value(-features => \@features);
724
 
            $_ = $buffer;
725
 
        }
726
 
        if( defined ($_) ) {
727
 
            if( /^CONTIG/o ) {
728
 
                my @contig;
729
 
                my $ctg = '';
730
 
                while($_ !~ m{^//}) { # end of file
731
 
                    $_ =~ /^(?:CONTIG)?\s+(.*)/;
732
 
                    $ctg .= $1;
733
 
                    $_ = $self->_readline;
734
 
                }
735
 
                if ($ctg) {
736
 
                    $annotation->add_Annotation( 
737
 
                        Bio::Annotation::SimpleValue->new(-tagname => 'contig',
738
 
                                                          -value => $ctg )
739
 
                        );
740
 
                }
741
 
                $self->_pushback($_);
742
 
            } elsif( /^WGS|WGS_SCAFLD\s+/o ) { # catch WGS/WGS_SCAFLD lines
743
 
                while($_ =~ s/(^WGS|WGS_SCAFLD)\s+//){ # gulp lines
744
 
                    chomp;
745
 
                    $annotation->add_Annotation(
746
 
                                                Bio::Annotation::SimpleValue->new(-value => $_,
747
 
                                                                                  -tagname => lc($1)));
748
 
                    $_ = $self->_readline;
749
 
                }
750
 
            } elsif(! m{^(ORIGIN|//)} ) { # advance to the sequence, if any
751
 
                while (defined( $_ = $self->_readline) ) {
752
 
                    last if m{^(ORIGIN|//)};
753
 
                }
754
 
            }
755
 
        }
756
 
        if(! $builder->want_object()) {
757
 
            $builder->make_object(); # implicit end-of-object
758
 
            next RECORDSTART;
759
 
        }
760
 
        if($builder->want_slot('seq')) {
761
 
            # the fact that we want a sequence does not necessarily mean that
762
 
            # there also is a sequence ...
763
 
            if(defined($_) && s/^ORIGIN\s+//) {
764
 
                chomp;
765
 
                if( $annotation && length($_) > 0 ) {
766
 
                    $annotation->add_Annotation('origin',
767
 
                                                Bio::Annotation::SimpleValue->new(-tagname => 'origin',
768
 
                                                                                  -value => $_));
769
 
                }
770
 
                my $seqc = '';
771
 
                while( defined($_ = $self->_readline) ) {
772
 
                    m{^//} && last;
773
 
                    $_ = uc($_);
774
 
                    s/[^A-Za-z]//g;
775
 
                    $seqc .= $_;
776
 
                }
777
 
                #$self->debug("sequence length is ". length($seqc) ."\n");
778
 
                $builder->add_slot_value(-seq => $seqc);
779
 
            }
780
 
        } elsif ( defined($_) && (substr($_,0,2) ne '//')) {
781
 
            # advance to the end of the record
782
 
            while( defined($_ = $self->_readline) ) {
783
 
                last if substr($_,0,2) eq '//';
784
 
            }
785
 
        }
786
 
        # Unlikely, but maybe the sequence is so weird that we don't want it
787
 
        # anymore. We don't want to return undef if the stream's not exhausted
788
 
        # yet.
789
 
        $seq = $builder->make_object();
790
 
        next RECORDSTART unless $seq;
791
 
        last RECORDSTART;
792
 
    }                           # end while RECORDSTART
 
545
                        if ( $dbsource =~ s/\s+created:\s+([^\.]+)\.\n// ) {
 
546
                            $annotation->add_Annotation(
 
547
                                'swissprot_dates',
 
548
                                Bio::Annotation::SimpleValue->new(
 
549
                                    -tagname => 'date_created',
 
550
                                    -value   => $1
 
551
                                )
 
552
                            );
 
553
                        }
 
554
                        while ( $dbsource =~
 
555
s/\s+(sequence|annotation)\s+updated:\s+([^\.]+)\.\n//g
 
556
                          )
 
557
                        {
 
558
                            $annotation->add_Annotation(
 
559
                                'swissprot_dates',
 
560
                                Bio::Annotation::SimpleValue->new(
 
561
                                    -tagname => 'date_updated',
 
562
                                    -value   => $2
 
563
                                )
 
564
                            );
 
565
                        }
 
566
                        $dbsource =~ s/\n/ /g;
 
567
                        if ( $dbsource =~
 
568
                            s/\s+xrefs:\s+((?:\S+,\s+)+\S+)\s+xrefs/xrefs/ )
 
569
                        {
 
570
                            # will use $i to determine even or odd
 
571
                            # for swissprot the accessions are paired
 
572
                            my $i = 0;
 
573
                            for my $dbsrc ( split( /,\s+/, $1 ) ) {
 
574
                                if (   $dbsrc =~ /(\S+)\.(\d+)/
 
575
                                    || $dbsrc =~ /(\S+)/ )
 
576
                                {
 
577
                                    my ( $id, $version ) = ( $1, $2 );
 
578
                                    $version = '' unless defined $version;
 
579
                                    my $db;
 
580
                                    if ( $id =~ /^\d\S{3}/ ) {
 
581
                                        $db = 'PDB';
 
582
                                    }
 
583
                                    else {
 
584
                                        $db =
 
585
                                          ( $i++ % 2 ) ? 'GenPept' : 'GenBank';
 
586
                                    }
 
587
                                    $annotation->add_Annotation(
 
588
                                        'dblink',
 
589
                                        Bio::Annotation::DBLink->new(
 
590
                                            -primary_id => $id,
 
591
                                            -version    => $version,
 
592
                                            -database   => $db,
 
593
                                            -tagname    => 'dblink'
 
594
                                        )
 
595
                                    );
 
596
                                }
 
597
                            }
 
598
                        }
 
599
                        elsif (
 
600
                            $dbsource =~ s/\s+xrefs:\s+(.+)\s+xrefs/xrefs/i )
 
601
                        {
 
602
                 # download screwed up and ncbi didn't put acc in for gi numbers
 
603
                            my $i = 0;
 
604
                            for my $id ( split( /\,\s+/, $1 ) ) {
 
605
                                my ( $acc, $db );
 
606
                                if ( $id =~ /gi:\s+(\d+)/ ) {
 
607
                                    $acc = $1;
 
608
                                    $db = ( $i++ % 2 ) ? 'GenPept' : 'GenBank';
 
609
                                }
 
610
                                elsif ( $id =~ /pdb\s+accession\s+(\S+)/ ) {
 
611
                                    $acc = $1;
 
612
                                    $db  = 'PDB';
 
613
                                }
 
614
                                else {
 
615
                                    $acc = $id;
 
616
                                    $db  = '';
 
617
                                }
 
618
                                $annotation->add_Annotation(
 
619
                                    'dblink',
 
620
                                    Bio::Annotation::DBLink->new(
 
621
                                        -primary_id => $acc,
 
622
                                        -database   => $db,
 
623
                                        -tagname    => 'dblink'
 
624
                                    )
 
625
                                );
 
626
                            }
 
627
                        }
 
628
                        else {
 
629
                            $self->debug("Cannot match $dbsource\n");
 
630
                        }
 
631
                        if (
 
632
                            $dbsource =~
 
633
                            s/xrefs\s+\(non\-sequence\s+databases\):\s+
 
634
                ((?:\S+,\s+)+\S+)//x
 
635
                          )
 
636
                        {
 
637
                            for my $id ( split( /\,\s+/, $1 ) ) {
 
638
                                my $db;
 
639
 
 
640
                                # this is because GenBank dropped the spaces!!!
 
641
                                # I'm sure we're not going to get this right
 
642
                                ##if( $id =~ s/^://i ) {
 
643
                                ##    $db = $1;
 
644
                                ##}
 
645
                                $db = substr( $id, 0, index( $id, ':' ) );
 
646
                                if ( !exists $DBSOURCE{$db} ) {
 
647
                                    $db = '';    # do we want 'GenBank' here?
 
648
                                }
 
649
                                $id = substr( $id, index( $id, ':' ) + 1 );
 
650
                                $annotation->add_Annotation(
 
651
                                    'dblink',
 
652
                                    Bio::Annotation::DBLink->new(
 
653
                                        -primary_id => $id,
 
654
                                        -database   => $db,
 
655
                                        -tagname    => 'dblink'
 
656
                                    )
 
657
                                );
 
658
                            }
 
659
                        }
 
660
 
 
661
                    }
 
662
                    else {
 
663
                        if ( $dbsource =~
 
664
                            /^(\S*?):?\s*accession\s+(\S+)\.(\d+)/ )
 
665
                        {
 
666
                            my ( $db, $id, $version ) = ( $1, $2, $3 );
 
667
                            $annotation->add_Annotation(
 
668
                                'dblink',
 
669
                                Bio::Annotation::DBLink->new(
 
670
                                    -primary_id => $id,
 
671
                                    -version    => $version,
 
672
                                    -database   => $db || 'GenBank',
 
673
                                    -tagname    => 'dblink'
 
674
                                )
 
675
                            );
 
676
                        }
 
677
                        elsif ( $dbsource =~ /^(\S*?):?\s*accession\s+(\S+)/ ) {
 
678
                            my ( $db, $id ) = ( $1, $2 );
 
679
                            $annotation->add_Annotation(
 
680
                                'dblink',
 
681
                                Bio::Annotation::DBLink->new(
 
682
                                    -primary_id => $id,
 
683
                                    -database   => $db || 'GenBank',
 
684
                                    -tagname    => 'dblink'
 
685
                                )
 
686
                            );
 
687
                        }
 
688
                        elsif ( $dbsource =~ /(\S+)([\.:])\s*(\S+)/ ) {
 
689
                            my ( $db, $version );
 
690
                            my @ids = ();
 
691
                            if ( $2 eq ':' ) {
 
692
                                $db = $1;
 
693
 
 
694
          # Genbank 192 release notes say this: "The second field can consist of
 
695
          #     multiple comma-separated identifiers, if a sequence record has
 
696
          #     multiple DBLINK cross-references of a given type."
 
697
          #     For example: DBLINK      Project:100,200,300"
 
698
                                @ids = split( /,/, $3 );
 
699
                            }
 
700
                            else {
 
701
                                ( $db, $version ) = ( 'GenBank', $3 );
 
702
                                $ids[0] = $1;
 
703
                            }
 
704
 
 
705
                            foreach my $id (@ids) {
 
706
                                $annotation->add_Annotation(
 
707
                                    'dblink',
 
708
                                    Bio::Annotation::DBLink->new(
 
709
                                        -primary_id => $id,
 
710
                                        -version    => $version,
 
711
                                        -database   => $db,
 
712
                                        -tagname    => 'dblink'
 
713
                                    )
 
714
                                );
 
715
                            }
 
716
                        }
 
717
                        else {
 
718
                            $self->warn(
 
719
                                "Unrecognized DBSOURCE data: $dbsource\n");
 
720
                        }
 
721
                    }
 
722
 
 
723
                    $buffer = $_;
 
724
                }
 
725
                else {
 
726
                    while ( defined( $buffer = $self->_readline() ) ) {
 
727
                        last if substr( $buffer, 0, 1 ) ne ' ';
 
728
                    }
 
729
                }
 
730
                next;
 
731
            }
 
732
 
 
733
            # Exit at start of Feature table, or start of sequence
 
734
            last if (/^(FEATURES|ORIGIN)/);
 
735
 
 
736
            # Get next line and loop again
 
737
            $buffer = $self->_readline;
 
738
        }
 
739
        return unless defined $buffer;
 
740
 
 
741
        # add them all at once for efficiency
 
742
        $builder->add_slot_value(
 
743
            -accession_number     => shift(@acc),
 
744
            -secondary_accessions => \@acc,
 
745
            %params
 
746
        );
 
747
        $builder->add_slot_value( -annotation => $annotation ) if $annotation;
 
748
        %params = ();    # reset before possible re-use to avoid setting twice
 
749
 
 
750
        # start over if we don't want to continue with this entry
 
751
        if ( !$builder->want_object() ) {
 
752
            $builder->make_object();
 
753
            next RECORDSTART;
 
754
        }
 
755
 
 
756
        # some "minimal" formats may not necessarily have a feature table
 
757
        if ( $builder->want_slot('features') && defined($_) && /^FEATURES/o ) {
 
758
 
 
759
            # need to read the first line of the feature table
 
760
            $buffer = $self->_readline;
 
761
 
 
762
            # DO NOT read lines in the while condition -- this is done as a side
 
763
            # effect in _read_FTHelper_GenBank!
 
764
 
 
765
            #       part of new circular spec:
 
766
            #       commented out for now until kinks worked out
 
767
            #my $sourceEnd = 0;
 
768
            #$sourceEnd = $2 if ($buffer =~ /(\d+?)\.\.(\d+?)$/);
 
769
 
 
770
            while ( defined($buffer) ) {
 
771
 
 
772
                # check immediately -- not at the end of the loop
 
773
                # note: GenPept entries obviously do not have a BASE line
 
774
                last if ( $buffer =~ /^BASE|ORIGIN|CONTIG|WGS/o );
 
775
 
 
776
                # slurp in one feature at a time -- at return, the start of
 
777
                # the next feature will have been read already, so we need
 
778
                # to pass a reference, and the called method must set this
 
779
                # to the last line read before returning
 
780
 
 
781
                my $ftunit = $self->_read_FTHelper_GenBank( \$buffer );
 
782
 
 
783
                #               implement new circular spec: features that cross the origin are now
 
784
                #               seamless instead of being 2 separate joined features
 
785
                #               commented out until kinks get worked out
 
786
                #if ((! $args{'-nojoin'}) && $ftunit->{'loc'} =~ /^join\((\d+?)\.\.(\d+?),(\d+?)..(\d+?)\)$/
 
787
                #&& $sourceEnd == $2 && $3 == 1) {
 
788
                #my $start = $1;
 
789
                #my $end = $2 + $4;
 
790
                #$ftunit->{'loc'} = "$start..$end";
 
791
                #}
 
792
 
 
793
                # fix suggested by James Diggans
 
794
 
 
795
                if ( !defined $ftunit ) {
 
796
 
 
797
                    # GRRRR. We have fallen over. Try to recover
 
798
                    $self->warn( "Unexpected error in feature table for "
 
799
                          . $params{'-display_id'}
 
800
                          . " Skipping feature, attempting to recover" );
 
801
                    unless ( ( $buffer =~ /^\s{5,5}\S+/o )
 
802
                        or ( $buffer =~ /^\S+/o ) )
 
803
                    {
 
804
                        $buffer = $self->_readline();
 
805
                    }
 
806
                    next;    # back to reading FTHelpers
 
807
                }
 
808
 
 
809
                # process ftunit
 
810
                my $feat =
 
811
                  $ftunit->_generic_seqfeature( $self->location_factory(),
 
812
                    $display_id );
 
813
 
 
814
                # add taxon_id from source if available
 
815
                if (
 
816
                       $species
 
817
                    && ( $feat->primary_tag eq 'source' )
 
818
                    && $feat->has_tag('db_xref')
 
819
                    && (
 
820
                        !$species->ncbi_taxid()
 
821
                        || (   $species->ncbi_taxid
 
822
                            && $species->ncbi_taxid =~ /^list/ )
 
823
                    )
 
824
                  )
 
825
                {
 
826
                    foreach my $tagval ( $feat->get_tag_values('db_xref') ) {
 
827
                        if ( index( $tagval, "taxon:" ) == 0 ) {
 
828
                            $species->ncbi_taxid( substr( $tagval, 6 ) );
 
829
                            last;
 
830
                        }
 
831
                    }
 
832
                }
 
833
 
 
834
                # add feature to list of features
 
835
                push( @features, $feat );
 
836
            }
 
837
            $builder->add_slot_value( -features => \@features );
 
838
            $_ = $buffer;
 
839
        }
 
840
 
 
841
        if ( defined($_) ) {
 
842
            # CONTIG lines: TODO, this needs to be cleaned up
 
843
            if (/^CONTIG\s+(.*)/o) {
 
844
                my $ctg = $1;
 
845
                while ( defined( $_ = $self->_readline)) {
 
846
                    last if m{^ORIGIN|//}o;
 
847
                    s/\s+(.*)/$1/;
 
848
                    $ctg .= $_;
 
849
                }
 
850
                if ($ctg) {
 
851
                    $annotation->add_Annotation(
 
852
                        Bio::Annotation::SimpleValue->new(
 
853
                            -tagname => 'contig',
 
854
                            -value   => $ctg
 
855
                        )
 
856
                    );
 
857
                }
 
858
            }
 
859
            elsif (/^WGS|WGS_SCAFLD\s+/o) {    # catch WGS/WGS_SCAFLD lines
 
860
                while ( $_ =~ s/(^WGS|WGS_SCAFLD)\s+// ) {    # gulp lines
 
861
                    chomp;
 
862
                    $annotation->add_Annotation(
 
863
                        Bio::Annotation::SimpleValue->new(
 
864
                            -value   => $_,
 
865
                            -tagname => lc($1)
 
866
                        )
 
867
                    );
 
868
                    $_ = $self->_readline;
 
869
                }
 
870
            }
 
871
            elsif ( !m{^ORIGIN|//}o ) {    # advance to the sequence, if any
 
872
                while ( defined( $_ = $self->_readline ) ) {
 
873
                    last if m{^(ORIGIN|//)};
 
874
                }
 
875
            }
 
876
        }
 
877
        if ( !$builder->want_object() ) {
 
878
            $builder->make_object();        # implicit end-of-object
 
879
            next RECORDSTART;
 
880
        }
 
881
        if ( $builder->want_slot('seq') ) {
 
882
            # the fact that we want a sequence does not necessarily mean that
 
883
            # there also is a sequence ...
 
884
            if ( defined($_) && s/^ORIGIN\s+// ) {
 
885
                if ( $annotation && length($_) > 0 ) {
 
886
                    $annotation->add_Annotation(
 
887
                        'origin',
 
888
                        Bio::Annotation::SimpleValue->new(
 
889
                            -tagname => 'origin',
 
890
                            -value   => $_
 
891
                        )
 
892
                    );
 
893
                }
 
894
                my $seqc = '';
 
895
                while ( defined( $_ = $self->_readline ) ) {
 
896
                    last if m{^//};
 
897
                    $_ = uc($_);
 
898
                    s/[^A-Za-z]//g;
 
899
                    $seqc .= $_;
 
900
                }
 
901
 
 
902
                $builder->add_slot_value( -seq => $seqc );
 
903
            }
 
904
        }
 
905
        elsif ( defined($_) && ( substr( $_, 0, 2 ) ne '//' ) ) {
 
906
 
 
907
            # advance to the end of the record
 
908
            while ( defined( $_ = $self->_readline ) ) {
 
909
                last if substr( $_, 0, 2 ) eq '//';
 
910
            }
 
911
        }
 
912
 
 
913
        # Unlikely, but maybe the sequence is so weird that we don't want it
 
914
        # anymore. We don't want to return undef if the stream's not exhausted
 
915
        # yet.
 
916
        $seq = $builder->make_object();
 
917
        next RECORDSTART unless $seq;
 
918
        last RECORDSTART;
 
919
    }    # end while RECORDSTART
793
920
 
794
921
    return $seq;
795
922
}