~ubuntu-branches/ubuntu/trusty/bioperl/trusty

Committer: Package Import Robot
Author(s): Charles Plessy
Date: 2014-01-18 11:41:11 UTC
mfrom: (3.1.12 sid)
Revision ID: package-import@ubuntu.com-20140118114111-zcjaq5edb49dhlat

Tags: 1.6.923-1

* New upstream release.
* Does not need non-free libmath-random-perl anymore.
* Build-depend on libmodule-build-perl (>= 0.420000). Despite Lintian's
warning that it is useless, the package does not build without.
* Conforms to Policy version 3.9.5.

files added:
doc/Deobfuscator/bin/run-deobfuscator-update.pl

t/data/YP_007988852.gp

t/data/bl2seq.tblastn.out

files modified:
.travis.yml

Bio/Align/Utilities.pm

Bio/Assembly/Singlet.pm

Bio/DB/Fasta.pm

Bio/DB/Flat/BDB.pm

Bio/DB/Flat/BDB/fasta.pm

Bio/DB/Flat/BinarySearch.pm

Bio/DB/IndexedBase.pm

Bio/DB/NCBIHelper.pm

Bio/DB/Qual.pm

Bio/DB/Query/GenBank.pm

Bio/DB/SeqFeature/Store/DBI/SQLite.pm

Bio/DB/SeqFeature/Store/LoadHelper.pm

Bio/DB/SeqFeature/Store/Loader.pm

Bio/DB/SeqFeature/Store/berkeleydb.pm

Bio/DB/Taxonomy/entrez.pm

Bio/Factory/FTLocationFactory.pm

Bio/Index/Abstract.pm

Bio/Index/Blast.pm

Bio/Index/BlastTable.pm

Bio/Index/EMBL.pm

Bio/Index/Fasta.pm

Bio/Index/Fastq.pm

Bio/Index/GenBank.pm

Bio/Index/Qual.pm

Bio/Index/Stockholm.pm

Bio/Index/SwissPfam.pm

Bio/Index/Swissprot.pm

Bio/Location/Atomic.pm

Bio/Location/Split.pm

Bio/PhyloNetwork/RandomFactory.pm

Bio/PrimarySeq.pm

Bio/PrimarySeqI.pm

Bio/Root/Build.pm

Bio/Root/Version.pm

Bio/SearchIO/SearchResultEventBuilder.pm

Bio/SearchIO/blast.pm

Bio/SearchIO/blasttable.pm

Bio/SearchIO/hmmer3.pm

Bio/SeqFeature/Generic.pm

Bio/SeqFeatureI.pm

Bio/SeqIO/fasta.pm

Bio/SeqIO/genbank.pm

Bio/SeqIO/interpro.pm

Bio/SeqUtils.pm

Bio/SimpleAlign.pm

Bio/Tools/Alignment/Consed.pm

Bio/Tools/CodonTable.pm

Bio/Tools/IUPAC.pm

Bio/Tools/RandomDistFunctions.pm

Build.PL

Changes

MANIFEST

META.json

META.yml

debian/changelog

debian/control

debian/copyright

doc/Deobfuscator/README

doc/Deobfuscator/cgi-bin/deob_detail.cgi *

doc/Deobfuscator/lib/Deobfuscator.pm *

t/Align/SimpleAlign.t

t/AlignIO/bl2seq.t

t/LocalDB/BioDBGFF.t

t/LocalDB/Fasta.t

t/LocalDB/Index/Index.t

t/LocalDB/SeqFeature.t

t/RemoteDB/EntrezGene.t

t/RemoteDB/MeSH.t

t/RemoteDB/Query/GenBank.t

t/Root/HTTPget.t

t/SearchIO/blast_pull.t

t/Seq/LargePSeq.t

t/Seq/PrimarySeq.t

t/SeqFeature/Generic.t

t/SeqIO/genbank.t

t/SeqIO/interpro.t

t/SeqIO/largefasta.t

t/SeqTools/CodonTable.t

t/SeqTools/SeqUtils.t

t/Tree/TreeIO/nhx.t

Show diffs side-by-side

added added

removed removed

Bio/SeqIO/genbank.pm

256

=cut

257

258

sub next_seq {

259

my ($self,@args) = @_;

260

my %args = @args;

259

my ( $self, @args ) = @_;

260

my %args = @args;

261

my $builder = $self->sequence_builder();

262

my $seq;

263

my %params;

264

265

RECORDSTART:

266

while (1) {

267

my $buffer;

268

my (@acc, @features);

269

my ($display_id, $annotation);

270

my $species;

271

272

# initialize; we may come here because of starting over

273

@features = ();

274

$annotation = undef;

275

@acc = ();

276

$species = undef;

277

%params = (-verbose => $self->verbose); # reset hash

278

local($/) = "\n";

279

while(defined($buffer = $self->_readline())) {

280

last if index($buffer,'LOCUS ') == 0;

281

}

282

return unless defined $buffer; # end of file

283

$buffer =~ /^LOCUS\s+(\S.*)$/o ||

284

$self->throw("GenBank stream with bad LOCUS line. Not GenBank in my book. Got '$buffer'");

285

286

my @tokens = split(' ', $1);

287

288

# this is important to have the id for display in e.g. FTHelper,

289

# otherwise you won't know which entry caused an error

290

$display_id = shift(@tokens);

291

$params{'-display_id'} = $display_id;

292

# may still be useful if we don't want the seq

293

my $seqlength = shift(@tokens);

294

if (exists $VALID_ALPHABET{$seqlength}) {

295

# moved one token too far. No locus name?

296

$self->warn("Bad LOCUS name? Changing [$params{'-display_id'}] to 'unknown' and length to $display_id");

297

$params{'-display_id'} = 'unknown';

298

$params{'-length'} = $display_id;

299

# add token back...

300

unshift @tokens, $seqlength;

301

} else {

302

$params{'-length'} = $seqlength;

303

}

304

# the alphabet of the entry

305

# shouldn't assign alphabet unless one is specifically designated (such as for rc files)

306

my $alphabet = lc(shift @tokens);

307

$params{'-alphabet'} = (exists $VALID_ALPHABET{$alphabet}) ? $VALID_ALPHABET{$alphabet} :

308

$self->warn("Unknown alphabet: $alphabet");

309

# for aa there is usually no 'molecule' (mRNA etc)

310

if ($params{'-alphabet'} eq 'protein') {

311

$params{'-molecule'} = 'PRT'

312

} else {

313

$params{'-molecule'} = shift(@tokens);

314

}

315

# take care of lower case issues

316

if ($params{'-molecule'} eq 'dna' || $params{'-molecule'} eq 'rna') {

317

$params{'-molecule'} = uc $params{'-molecule'};

318

}

319

$self->debug("Unrecognized molecule type:".$params{'-molecule'}) if

320

!exists($VALID_MOLTYPE{$params{'-molecule'}});

321

my $circ = shift(@tokens);

322

if ($circ eq 'circular') {

323

$params{'-is_circular'} = 1;

324

$params{'-division'} = shift(@tokens);

325

} else {

326

# 'linear' or 'circular' may actually be omitted altogether

327

$params{'-division'} =

328

(CORE::length($circ) == 3 ) ? $circ : shift(@tokens);

329

}

330

my $date = join(' ', @tokens); # we lump together the rest

331

332

# this is per request bug #1513

333

# we can handle

334

# 9-10-2003

335

# 9-10-03

336

# 09-10-2003

337

# 09-10-03

338

if($date =~ s/\s*((\d{1,2})-(\w{3})-(\d{2,4})).*/$1/) {

339

if( length($date) < 11 ) {

340

# improperly formatted date

341

# But we'll be nice and fix it for them

342

my ($d,$m,$y) = ($2,$3,$4);

343

if( length($d) == 1 ) {

344

$d = "0$d";

345

}

346

# guess the century here

347

if( length($y) == 2 ) {

348

if( $y > 60 ) { # arbitrarily guess that '60' means 1960

349

$y = "19$y";

350

} else {

351

$y = "20$y";

352

}

353

$self->warn("Date was malformed, guessing the century for $date to be $y\n");

354

}

355

$params{'-dates'} = [join('-',$d,$m,$y)];

356

} else {

357

$params{'-dates'} = [$date];

358

}

359

}

360

# set them all at once

361

$builder->add_slot_value(%params);

362

%params = ();

363

364

# parse the rest if desired, otherwise start over

365

if(! $builder->want_object()) {

366

$builder->make_object();

367

next RECORDSTART;

368

}

369

370

# set up annotation depending on what the builder wants

371

if($builder->want_slot('annotation')) {

372

$annotation = Bio::Annotation::Collection->new();

373

}

374

$buffer = $self->_readline();

375

until( !defined ($buffer) ) {

376

$_ = $buffer;

377

# Description line(s)

378

if (/^DEFINITION\s+(\S.*\S)/) {

379

my @desc = ($1);

380

while ( defined($_ = $self->_readline) ) {

381

if( /^\s+(.*)/ ) { push (@desc, $1); next };

382

last;

383

}

384

$builder->add_slot_value(-desc => join(' ', @desc));

385

# we'll continue right here because DEFINITION always comes

386

# at the top of the entry

387

$buffer= $_;

388

}

389

# accession number (there can be multiple accessions)

390

if( /^ACCESSION\s+(\S.*\S)/ ) {

391

push(@acc, split(/\s+/,$1));

392

while( defined($_ = $self->_readline) ) {

393

/^\s+(.*)/ && do { push (@acc, split(/\s+/,$1)); next };

394

last;

395

}

396

$buffer = $_;

397

next;

398

}

399

# PID

400

elsif( /^PID\s+(\S+)/ ) {

401

$params{'-pid'} = $1;

402

}

403

# Version number

404

elsif( /^VERSION\s+(\S.+)$/ ) {

405

my ($acc,$gi) = split(' ',$1);

406

if($acc =~ /^\w+\.(\d+)/) {

407

$params{'-version'} = $1;

408

$params{'-seq_version'} = $1;

409

}

410

if($gi && (index($gi,"GI:") == 0)) {

411

$params{'-primary_id'} = substr($gi,3);

412

}

413

}

414

# Keywords

415

elsif( /^KEYWORDS\s+(\S.*)/ ) {

416

my @kw = split(/\s*\;\s*/,$1);

417

while( defined($_ = $self->_readline) ) {

418

chomp;

419

/^\s+(.*)/ && do { push (@kw, split(/\s*\;\s*/,$1)); next };

420

last;

421

}

422

423

@kw && $kw[-1] =~ s/\.$//;

424

$params{'-keywords'} = \@kw;

425

$buffer = $_;

426

next;

427

}

428

# Organism name and phylogenetic information

429

elsif (/^SOURCE\s+\S/) {

430

if($builder->want_slot('species')) {

431

$species = $self->_read_GenBank_Species(\$buffer);

432

$builder->add_slot_value(-species => $species);

433

} else {

434

while(defined($buffer = $self->_readline())) {

435

last if substr($buffer,0,1) ne ' ';

436

}

437

}

438

next;

439

}

440

# References

441

elsif (/^REFERENCE\s+\S/) {

442

if($annotation) {

443

my @refs = $self->_read_GenBank_References(\$buffer);

444

foreach my $ref ( @refs ) {

445

$annotation->add_Annotation('reference',$ref);

446

}

447

} else {

448

while(defined($buffer = $self->_readline())) {

449

last if substr($buffer,0,1) ne ' ';

450

}

451

}

452

next;

453

}

454

# Project

455

elsif (/^PROJECT\s+(\S.*)/) {

456

if ($annotation) {

457

my $project = Bio::Annotation::SimpleValue->new(-value => $1);

458

$annotation->add_Annotation('project',$project);

459

}

460

}

461

# Comments

462

elsif (/^COMMENT\s+(\S.*)/) {

463

if($annotation) {

464

my $comment = $1;

465

while (defined($_ = $self->_readline)) {

466

last if (/^\S/);

467

$comment .= $_;

468

}

469

$comment =~ s/\n/ /g;

470

$comment =~ s/ +/ /g;

471

$annotation->add_Annotation('comment',

472

Bio::Annotation::Comment->new(-text => $comment,

473

-tagname => 'comment'));

474

$buffer = $_;

475

} else {

476

while(defined($buffer = $self->_readline())) {

477

last if substr($buffer,0,1) ne ' ';

478

}

479

}

480

next;

481

}

482

# Corresponding Genbank nucleotide id, Genpept only

483

elsif( /^DB(?:SOURCE|LINK)\s+(\S.+)/ ) {

484

if ($annotation) {

485

my $dbsource = $1;

486

while (defined($_ = $self->_readline)) {

487

last if (/^\S/);

488

$dbsource .= $_;

489

}

490

# deal with UniProKB dbsources

491

if( $dbsource =~ s/(UniProt(?:KB)?|swissprot):\s+locus\s+(\S+)\,.+\n// ) {

492

$annotation->add_Annotation

493

('dblink',

494

Bio::Annotation::DBLink->new

495

(-primary_id => $2,

496

-database => $1,

497

-tagname => 'dblink'));

498

if( $dbsource =~ s/\s+created:\s+([^\.]+)\.\n// ) {

499

$annotation->add_Annotation

500

('swissprot_dates',

501

Bio::Annotation::SimpleValue->new

502

(-tagname => 'date_created',

503

-value => $1));

504

}

505

while( $dbsource =~ s/\s+(sequence|annotation)\s+updated:\s+([^\.]+)\.\n//g ) {

506

$annotation->add_Annotation

507

('swissprot_dates',

508

Bio::Annotation::SimpleValue->new

509

(-tagname => 'date_updated',

510

-value => $2));

511

}

512

$dbsource =~ s/\n/ /g;

513

if( $dbsource =~ s/\s+xrefs:\s+((?:\S+,\s+)+\S+)\s+xrefs/xrefs/ ) {

514

# will use $i to determine even or odd

515

# for swissprot the accessions are paired

516

my $i = 0;

517

for my $dbsrc ( split(/,\s+/,$1) ) {

518

if( $dbsrc =~ /(\S+)\.(\d+)/ ||

519

$dbsrc =~ /(\S+)/ ) {

520

my ($id,$version) = ($1,$2);

521

$version ='' unless defined $version;

522

my $db;

523

if( $id =~ /^\d\S{3}/) {

524

$db = 'PDB';

525

} else {

526

$db = ($i++ % 2 ) ? 'GenPept' : 'GenBank';

527

}

528

$annotation->add_Annotation

529

('dblink',

530

Bio::Annotation::DBLink->new

531

(-primary_id => $id,

532

-version => $version,

533

-database => $db,

534

-tagname => 'dblink'));

535

}

536

}

537

} elsif( $dbsource =~ s/\s+xrefs:\s+(.+)\s+xrefs/xrefs/i ) {

538

# download screwed up and ncbi didn't put acc in for gi numbers

539

my $i = 0;

540

for my $id ( split(/\,\s+/,$1) ) {

541

my ($acc,$db);

542

if( $id =~ /gi:\s+(\d+)/ ) {

543

$acc= $1;

544

$db = ($i++ % 2 ) ? 'GenPept' : 'GenBank';

545

} elsif( $id =~ /pdb\s+accession\s+(\S+)/ ) {

546

$acc= $1;

547

$db = 'PDB';

548

} else {

549

$acc= $id;

550

$db = '';

551

}

552

$annotation->add_Annotation

553

('dblink',

554

Bio::Annotation::DBLink->new

555

(-primary_id => $acc,

556

-database => $db,

557

-tagname => 'dblink'));

558

}

559

} else {

560

$self->debug("Cannot match $dbsource\n");

561

}

562

if( $dbsource =~ s/xrefs\s+$non\-sequence\s+databases$:\s+

563

((?:\S+,\s+)+\S+)//x ) {

564

for my $id ( split(/\,\s+/,$1) ) {

565

my $db;

566

# this is because GenBank dropped the spaces!!!

567

# I'm sure we're not going to get this right

568

##if( $id =~ s/^://i ) {

569

## $db = $1;

570

##}

571

$db = substr($id,0,index($id,':'));

572

if (! exists $DBSOURCE{ $db }) {

573

$db = ''; # do we want 'GenBank' here?

574

}

575

$id = substr($id,index($id,':')+1);

576

$annotation->add_Annotation

577

('dblink',

578

Bio::Annotation::DBLink->new

579

(-primary_id => $id,

580

-database => $db,

581

-tagname => 'dblink'));

582

}

583

}

584

585

} else {

586

if( $dbsource =~ /^(\S*?):?\s*accession\s+(\S+)\.(\d+)/ ) {

587

my ($db,$id,$version) = ($1,$2,$3);

588

$annotation->add_Annotation

589

('dblink',

590

Bio::Annotation::DBLink->new

591

(-primary_id => $id,

592

-version => $version,

593

-database => $db || 'GenBank',

594

-tagname => 'dblink'));

595

} elsif ( $dbsource =~ /^(\S*?):?\s*accession\s+(\S+)/ ) {

596

my ($db,$id) = ($1,$2);

597

$annotation->add_Annotation

598

('dblink',

599

Bio::Annotation::DBLink->new

600

(-primary_id => $id,

601

-database => $db || 'GenBank',

602

-tagname => 'dblink'));

603

} elsif ( $dbsource =~ /(\S+)([\.:])\s*(\S+)/ ) {

604

my ($db, $version);

605

my @ids = ();

606

if ($2 eq ':') {

607

$db = $1;

608

# Genbank 192 release notes say this: "The second field can consist of

609

# multiple comma-separated identifiers, if a sequence record has

610

# multiple DBLINK cross-references of a given type."

611

# For example: DBLINK Project:100,200,300"

612

@ids = split (/,/, $3);

613

} else {

614

($db, $version) = ('GenBank', $3);

615

$ids[0] = $1;

616

}

617

618

foreach my $id (@ids) {

619

$annotation->add_Annotation('dblink',

267

my $buffer;

268

my ( @acc, @features );

269

my ( $display_id, $annotation );

270

my $species;

271

272

# initialize; we may come here because of starting over

273

@features = ();

274

$annotation = undef;

275

@acc = ();

276

$species = undef;

277

%params = ( -verbose => $self->verbose ); # reset hash

278

local ($/) = "\n";

279

while ( defined( $buffer = $self->_readline() ) ) {

280

last if index( $buffer, 'LOCUS ' ) == 0;

281

}

282

return unless defined $buffer; # end of file

283

$buffer =~ /^LOCUS\s+(\S.*)$/o

284

|| $self->throw(

285

"GenBank stream with bad LOCUS line. Not GenBank in my book. Got '$buffer'"

286

);

287

288

my @tokens = split( ' ', $1 );

289

290

# this is important to have the id for display in e.g. FTHelper,

291

# otherwise you won't know which entry caused an error

292

$display_id = shift(@tokens);

293

$params{'-display_id'} = $display_id;

294

295

# may still be useful if we don't want the seq

296

my $seqlength = shift(@tokens);

297

if ( exists $VALID_ALPHABET{$seqlength} ) {

298

299

# moved one token too far. No locus name?

300

$self->warn(

301

"Bad LOCUS name? Changing [$params{'-display_id'}] to 'unknown' and length to $display_id"

302

);

303

$params{'-display_id'} = 'unknown';

304

$params{'-length'} = $display_id;

305

306

# add token back...

307

unshift @tokens, $seqlength;

308

}

309

else {

310

$params{'-length'} = $seqlength;

311

}

312

313

# the alphabet of the entry

314

# shouldn't assign alphabet unless one is specifically designated (such as for rc files)

315

my $alphabet = lc( shift @tokens );

316

$params{'-alphabet'} =

317

( exists $VALID_ALPHABET{$alphabet} )

318

? $VALID_ALPHABET{$alphabet}

319

: $self->warn("Unknown alphabet: $alphabet");

320

321

# for aa there is usually no 'molecule' (mRNA etc)

322

if ( $params{'-alphabet'} eq 'protein' ) {

323

$params{'-molecule'} = 'PRT';

324

}

325

else {

326

$params{'-molecule'} = shift(@tokens);

327

}

328

329

# take care of lower case issues

330

if ( $params{'-molecule'} eq 'dna' || $params{'-molecule'} eq 'rna' ) {

331

$params{'-molecule'} = uc $params{'-molecule'};

332

}

333

$self->debug( "Unrecognized molecule type:" . $params{'-molecule'} )

334

if !exists( $VALID_MOLTYPE{ $params{'-molecule'} } );

335

my $circ = shift(@tokens);

336

if ( $circ eq 'circular' ) {

337

$params{'-is_circular'} = 1;

338

$params{'-division'} = shift(@tokens);

339

}

340

else {

341

# 'linear' or 'circular' may actually be omitted altogether

342

$params{'-division'} =

343

( CORE::length($circ) == 3 ) ? $circ : shift(@tokens);

344

}

345

my $date = join( ' ', @tokens ); # we lump together the rest

346

347

# this is per request bug #1513

348

# we can handle

349

# 9-10-2003

350

# 9-10-03

351

# 09-10-2003

352

# 09-10-03

353

if ( $date =~ s/\s*((\d{1,2})-(\w{3})-(\d{2,4})).*/$1/ ) {

354

if ( length($date) < 11 ) {

355

356

# improperly formatted date

357

# But we'll be nice and fix it for them

358

my ( $d, $m, $y ) = ( $2, $3, $4 );

359

if ( length($d) == 1 ) {

360

$d = "0$d";

361

}

362

363

# guess the century here

364

if ( length($y) == 2 ) {

365

if ( $y > 60 ) { # arbitrarily guess that '60' means 1960

366

$y = "19$y";

367

}

368

else {

369

$y = "20$y";

370

}

371

$self->warn(

372

"Date was malformed, guessing the century for $date to be $y\n"

373

);

374

}

375

$params{'-dates'} = [ join( '-', $d, $m, $y ) ];

376

}

377

else {

378

$params{'-dates'} = [$date];

379

}

380

}

381

382

# set them all at once

383

$builder->add_slot_value(%params);

384

%params = ();

385

386

# parse the rest if desired, otherwise start over

387

if ( !$builder->want_object() ) {

388

$builder->make_object();

389

next RECORDSTART;

390

}

391

392

# set up annotation depending on what the builder wants

393

if ( $builder->want_slot('annotation') ) {

394

$annotation = Bio::Annotation::Collection->new();

395

}

396

$buffer = $self->_readline();

397

until ( !defined($buffer) ) {

398

$_ = $buffer;

399

400

# Description line(s)

401

if (/^DEFINITION\s+(\S.*\S)/) {

402

my @desc = ($1);

403

while ( defined( $_ = $self->_readline ) ) {

404

if (/^\s+(.*)/) { push( @desc, $1 ); next }

405

last;

406

}

407

$builder->add_slot_value( -desc => join( ' ', @desc ) );

408

409

# we'll continue right here because DEFINITION always comes

410

# at the top of the entry

411

$buffer = $_;

412

}

413

414

# accession number (there can be multiple accessions)

415

if (/^ACCESSION\s+(\S.*\S)/) {

416

push( @acc, split( /\s+/, $1 ) );

417

while ( defined( $_ = $self->_readline ) ) {

418

/^\s+(.*)/ && do { push( @acc, split( /\s+/, $1 ) ); next };

419

last;

420

}

421

$buffer = $_;

422

next;

423

}

424

425

# PID

426

elsif (/^PID\s+(\S+)/) {

427

$params{'-pid'} = $1;

428

}

429

430

# Version number

431

elsif (/^VERSION\s+(\S.+)$/) {

432

my ( $acc, $gi ) = split( ' ', $1 );

433

if ( $acc =~ /^\w+\.(\d+)/ ) {

434

$params{'-version'} = $1;

435

$params{'-seq_version'} = $1;

436

}

437

if ( $gi && ( index( $gi, "GI:" ) == 0 ) ) {

438

$params{'-primary_id'} = substr( $gi, 3 );

439

}

440

}

441

442

# Keywords

443

elsif (/^KEYWORDS\s+(\S.*)/) {

444

my @kw = split( /\s*\;\s*/, $1 );

445

while ( defined( $_ = $self->_readline ) ) {

446

chomp;

447

/^\s+(.*)/

448

&& do { push( @kw, split( /\s*\;\s*/, $1 ) ); next };

449

last;

450

}

451

452

@kw && $kw[-1] =~ s/\.$//;

453

$params{'-keywords'} = \@kw;

454

$buffer = $_;

455

next;

456

}

457

458

# Organism name and phylogenetic information

459

elsif (/^SOURCE\s+\S/) {

460

if ( $builder->want_slot('species') ) {

461

$species = $self->_read_GenBank_Species( \$buffer );

462

$builder->add_slot_value( -species => $species );

463

}

464

else {

465

while ( defined( $buffer = $self->_readline() ) ) {

466

last if substr( $buffer, 0, 1 ) ne ' ';

467

}

468

}

469

next;

470

}

471

472

# References

473

elsif (/^REFERENCE\s+\S/) {

474

if ($annotation) {

475

my @refs = $self->_read_GenBank_References( \$buffer );

476

foreach my $ref (@refs) {

477

$annotation->add_Annotation( 'reference', $ref );

478

}

479

}

480

else {

481

while ( defined( $buffer = $self->_readline() ) ) {

482

last if substr( $buffer, 0, 1 ) ne ' ';

483

}

484

}

485

next;

486

}

487

488

# Project

489

elsif (/^PROJECT\s+(\S.*)/) {

490

if ($annotation) {

491

my $project =

492

Bio::Annotation::SimpleValue->new( -value => $1 );

493

$annotation->add_Annotation( 'project', $project );

494

}

495

}

496

497

# Comments

498

elsif (/^COMMENT\s+(\S.*)/) {

499

if ($annotation) {

500

my $comment = $1;

501

while ( defined( $_ = $self->_readline ) ) {

502

last if (/^\S/);

503

$comment .= $_;

504

}

505

$comment =~ s/\n/ /g;

506

$comment =~ s/ +/ /g;

507

$annotation->add_Annotation(

508

'comment',

509

Bio::Annotation::Comment->new(

510

-text => $comment,

511

-tagname => 'comment'

512

)

513

);

514

$buffer = $_;

515

}

516

else {

517

while ( defined( $buffer = $self->_readline() ) ) {

518

last if substr( $buffer, 0, 1 ) ne ' ';

519

}

520

}

521

next;

522

}

523

524

# Corresponding Genbank nucleotide id, Genpept only

525

elsif (/^DB(?:SOURCE|LINK)\s+(\S.+)/) {

526

if ($annotation) {

527

my $dbsource = $1;

528

while ( defined( $_ = $self->_readline ) ) {

529

last if (/^\S/);

530

$dbsource .= $_;

531

}

532

533

# deal with UniProKB dbsources

534

if ( $dbsource =~

535

s/(UniProt(?:KB)?|swissprot):\s+locus\s+(\S+)\,.+\n// )

536

{

537

$annotation->add_Annotation(

538

'dblink',

620

539

Bio::Annotation::DBLink->new(

621

-primary_id => $id,

622

-version => $version,

623

-database => $db,

624

-tagname => 'dblink')

540

-primary_id => $2,

541

-database => $1,

542

-tagname => 'dblink'

543

)

625

544

);

626

}

627

} else {

628

$self->warn("Unrecognized DBSOURCE data: $dbsource\n");

629

}

630

}

631

632

$buffer = $_;

633

} else {

634

while(defined($buffer = $self->_readline())) {

635

last if substr($buffer,0,1) ne ' ';

636

}

637

}

638

next;

639

}

640

# Exit at start of Feature table, or start of sequence

641

last if( /^(FEATURES|ORIGIN)/ );

642

# Get next line and loop again

643

$buffer = $self->_readline;

644

}

645

return unless defined $buffer;

646

647

# add them all at once for efficiency

648

$builder->add_slot_value(-accession_number => shift(@acc),

649

-secondary_accessions => \@acc,

650

%params);

651

$builder->add_slot_value(-annotation => $annotation) if $annotation;

652

%params = (); # reset before possible re-use to avoid setting twice

653

654

# start over if we don't want to continue with this entry

655

if(! $builder->want_object()) {

656

$builder->make_object();

657

next RECORDSTART;

658

}

659

# some "minimal" formats may not necessarily have a feature table

660

if($builder->want_slot('features') && defined($_) && /^FEATURES/o) {

661

# need to read the first line of the feature table

662

$buffer = $self->_readline;

663

# DO NOT read lines in the while condition -- this is done as a side

664

# effect in _read_FTHelper_GenBank!

665

666

# part of new circular spec:

667

# commented out for now until kinks worked out

668

#my $sourceEnd = 0;

669

#$sourceEnd = $2 if ($buffer =~ /(\d+?)\.\.(\d+?)$/);

670

671

while( defined($buffer) ) {

672

# check immediately -- not at the end of the loop

673

# note: GenPept entries obviously do not have a BASE line

674

last if( $buffer =~ /^BASE|ORIGIN|CONTIG|WGS/o);

675

676

# slurp in one feature at a time -- at return, the start of

677

# the next feature will have been read already, so we need

678

# to pass a reference, and the called method must set this

679

# to the last line read before returning

680

681

my $ftunit = $self->_read_FTHelper_GenBank(\$buffer);

682

683

# implement new circular spec: features that cross the origin are now

684

# seamless instead of being 2 separate joined features

685

# commented out until kinks get worked out

686

#if ((! $args{'-nojoin'}) && $ftunit->{'loc'} =~ /^join$(\d+?)\.\.(\d+?),(\d+?)..(\d+?)$$/

687

#&& $sourceEnd == $2 && $3 == 1) {

688

#my $start = $1;

689

#my $end = $2 + $4;

690

#$ftunit->{'loc'} = "$start..$end";

691

692

693

# fix suggested by James Diggans

694

695

if( !defined $ftunit ) {

696

# GRRRR. We have fallen over. Try to recover

697

$self->warn("Unexpected error in feature table for ".$params{'-display_id'}." Skipping feature, attempting to recover");

698

unless( ($buffer =~ /^\s{5,5}\S+/o) or

699

($buffer =~ /^\S+/o)) {

700

$buffer = $self->_readline();

701

}

702

next; # back to reading FTHelpers

703

}

704

705

# process ftunit

706

my $feat =

707

$ftunit->_generic_seqfeature($self->location_factory(),

708

$display_id);

709

# add taxon_id from source if available

710

if($species && ($feat->primary_tag eq 'source') &&

711

$feat->has_tag('db_xref') && (! $species->ncbi_taxid() ||

712

($species->ncbi_taxid && $species->ncbi_taxid =~ /^list/))) {

713

foreach my $tagval ($feat->get_tag_values('db_xref')) {

714

if(index($tagval,"taxon:") == 0) {

715

$species->ncbi_taxid(substr($tagval,6));

716

last;

717

}

718

}

719

}

720

# add feature to list of features

721

push(@features, $feat);

722

}

723

$builder->add_slot_value(-features => \@features);

724

$_ = $buffer;

725

}

726

if( defined ($_) ) {

727

if( /^CONTIG/o ) {

728

my @contig;

729

my $ctg = '';

730

while($_ !~ m{^//}) { # end of file

731

$_ =~ /^(?:CONTIG)?\s+(.*)/;

732

$ctg .= $1;

733

$_ = $self->_readline;

734

}

735

if ($ctg) {

736

$annotation->add_Annotation(

737

Bio::Annotation::SimpleValue->new(-tagname => 'contig',

738

-value => $ctg )

739

);

740

}

741

$self->_pushback($_);

742

} elsif( /^WGS|WGS_SCAFLD\s+/o ) { # catch WGS/WGS_SCAFLD lines

743

while($_ =~ s/(^WGS|WGS_SCAFLD)\s+//){ # gulp lines

744

chomp;

745

$annotation->add_Annotation(

746

Bio::Annotation::SimpleValue->new(-value => $_,

747

-tagname => lc($1)));

748

$_ = $self->_readline;

749

}

750

} elsif(! m{^(ORIGIN|//)} ) { # advance to the sequence, if any

751

while (defined( $_ = $self->_readline) ) {

752

last if m{^(ORIGIN|//)};

753

}

754

}

755

}

756

if(! $builder->want_object()) {

757

$builder->make_object(); # implicit end-of-object

758

next RECORDSTART;

759

}

760

if($builder->want_slot('seq')) {

761

# the fact that we want a sequence does not necessarily mean that

762

# there also is a sequence ...

763

if(defined($_) && s/^ORIGIN\s+//) {

764

chomp;

765

if( $annotation && length($_) > 0 ) {

766

$annotation->add_Annotation('origin',

767

Bio::Annotation::SimpleValue->new(-tagname => 'origin',

768

-value => $_));

769

}

770

my $seqc = '';

771

while( defined($_ = $self->_readline) ) {

772

m{^//} && last;

773

$_ = uc($_);

774

s/[^A-Za-z]//g;

775

$seqc .= $_;

776

}

777

#$self->debug("sequence length is ". length($seqc) ."\n");

778

$builder->add_slot_value(-seq => $seqc);

779

}

780

} elsif ( defined($_) && (substr($_,0,2) ne '//')) {

781

# advance to the end of the record

782

while( defined($_ = $self->_readline) ) {

783

last if substr($_,0,2) eq '//';

784

}

785

}

786

# Unlikely, but maybe the sequence is so weird that we don't want it

787

# anymore. We don't want to return undef if the stream's not exhausted

788

# yet.

789

$seq = $builder->make_object();

790

next RECORDSTART unless $seq;

791

last RECORDSTART;

792

} # end while RECORDSTART

545

if ( $dbsource =~ s/\s+created:\s+([^\.]+)\.\n// ) {

546

$annotation->add_Annotation(

547

'swissprot_dates',

548

Bio::Annotation::SimpleValue->new(

549

-tagname => 'date_created',

550

-value => $1

551

)

552

);

553

}

554

while ( $dbsource =~

555

s/\s+(sequence|annotation)\s+updated:\s+([^\.]+)\.\n//g

556

)

557

{

558

$annotation->add_Annotation(

559

'swissprot_dates',

560

Bio::Annotation::SimpleValue->new(

561

-tagname => 'date_updated',

562

-value => $2

563

)

564

);

565

}

566

$dbsource =~ s/\n/ /g;

567

if ( $dbsource =~

568

s/\s+xrefs:\s+((?:\S+,\s+)+\S+)\s+xrefs/xrefs/ )

569

{

570

# will use $i to determine even or odd

571

# for swissprot the accessions are paired

572

my $i = 0;

573

for my $dbsrc ( split( /,\s+/, $1 ) ) {

574

if ( $dbsrc =~ /(\S+)\.(\d+)/

575

|| $dbsrc =~ /(\S+)/ )

576

{

577

my ( $id, $version ) = ( $1, $2 );

578

$version = '' unless defined $version;

579

my $db;

580

if ( $id =~ /^\d\S{3}/ ) {

581

$db = 'PDB';

582

}

583

else {

584

$db =

585

( $i++ % 2 ) ? 'GenPept' : 'GenBank';

586

}

587

$annotation->add_Annotation(

588

'dblink',

589

Bio::Annotation::DBLink->new(

590

-primary_id => $id,

591

-version => $version,

592

-database => $db,

593

-tagname => 'dblink'

594

)

595

);

596

}

597

}

598

}

599

elsif (

600

$dbsource =~ s/\s+xrefs:\s+(.+)\s+xrefs/xrefs/i )

601

{

602

# download screwed up and ncbi didn't put acc in for gi numbers

603

my $i = 0;

604

for my $id ( split( /\,\s+/, $1 ) ) {

605

my ( $acc, $db );

606

if ( $id =~ /gi:\s+(\d+)/ ) {

607

$acc = $1;

608

$db = ( $i++ % 2 ) ? 'GenPept' : 'GenBank';

609

}

610

elsif ( $id =~ /pdb\s+accession\s+(\S+)/ ) {

611

$acc = $1;

612

$db = 'PDB';

613

}

614

else {

615

$acc = $id;

616

$db = '';

617

}

618

$annotation->add_Annotation(

619

'dblink',

620

Bio::Annotation::DBLink->new(

621

-primary_id => $acc,

622

-database => $db,

623

-tagname => 'dblink'

624

)

625

);

626

}

627

}

628

else {

629

$self->debug("Cannot match $dbsource\n");

630

}

631

if (

632

$dbsource =~

633

s/xrefs\s+$non\-sequence\s+databases$:\s+

634

((?:\S+,\s+)+\S+)//x

635

)

636

{

637

for my $id ( split( /\,\s+/, $1 ) ) {

638

my $db;

639

640

# this is because GenBank dropped the spaces!!!

641

# I'm sure we're not going to get this right

642

##if( $id =~ s/^://i ) {

643

## $db = $1;

644

##}

645

$db = substr( $id, 0, index( $id, ':' ) );

646

if ( !exists $DBSOURCE{$db} ) {

647

$db = ''; # do we want 'GenBank' here?

648

}

649

$id = substr( $id, index( $id, ':' ) + 1 );

650

$annotation->add_Annotation(

651

'dblink',

652

Bio::Annotation::DBLink->new(

653

-primary_id => $id,

654

-database => $db,

655

-tagname => 'dblink'

656

)

657

);

658

}

659

}

660

661

}

662

else {

663

if ( $dbsource =~

664

/^(\S*?):?\s*accession\s+(\S+)\.(\d+)/ )

665

{

666

my ( $db, $id, $version ) = ( $1, $2, $3 );

667

$annotation->add_Annotation(

668

'dblink',

669

Bio::Annotation::DBLink->new(

670

-primary_id => $id,

671

-version => $version,

672

-database => $db || 'GenBank',

673

-tagname => 'dblink'

674

)

675

);

676

}

677

elsif ( $dbsource =~ /^(\S*?):?\s*accession\s+(\S+)/ ) {

678

my ( $db, $id ) = ( $1, $2 );

679

$annotation->add_Annotation(

680

'dblink',

681

Bio::Annotation::DBLink->new(

682

-primary_id => $id,

683

-database => $db || 'GenBank',

684

-tagname => 'dblink'

685

)

686

);

687

}

688

elsif ( $dbsource =~ /(\S+)([\.:])\s*(\S+)/ ) {

689

my ( $db, $version );

690

my @ids = ();

691

if ( $2 eq ':' ) {

692

$db = $1;

693

694

# Genbank 192 release notes say this: "The second field can consist of

695

# multiple comma-separated identifiers, if a sequence record has

696

# multiple DBLINK cross-references of a given type."

697

# For example: DBLINK Project:100,200,300"

698

@ids = split( /,/, $3 );

699

}

700

else {

701

( $db, $version ) = ( 'GenBank', $3 );

702

$ids[0] = $1;

703

}

704

705

foreach my $id (@ids) {

706

$annotation->add_Annotation(

707

'dblink',

708

Bio::Annotation::DBLink->new(

709

-primary_id => $id,

710

-version => $version,

711

-database => $db,

712

-tagname => 'dblink'

713

)

714

);

715

}

716

}

717

else {

718

$self->warn(

719

"Unrecognized DBSOURCE data: $dbsource\n");

720

}

721

}

722

723

$buffer = $_;

724

}

725

else {

726

while ( defined( $buffer = $self->_readline() ) ) {

727

last if substr( $buffer, 0, 1 ) ne ' ';

728

}

729

}

730

next;

731

}

732

733

# Exit at start of Feature table, or start of sequence

734

last if (/^(FEATURES|ORIGIN)/);

735

736

# Get next line and loop again

737

$buffer = $self->_readline;

738

}

739

return unless defined $buffer;

740

741

# add them all at once for efficiency

742

$builder->add_slot_value(

743

-accession_number => shift(@acc),

744

-secondary_accessions => \@acc,

745

%params

746

);

747

$builder->add_slot_value( -annotation => $annotation ) if $annotation;

748

%params = (); # reset before possible re-use to avoid setting twice

749

750

# start over if we don't want to continue with this entry

751

if ( !$builder->want_object() ) {

752

$builder->make_object();

753

next RECORDSTART;

754

}

755

756

# some "minimal" formats may not necessarily have a feature table

757

if ( $builder->want_slot('features') && defined($_) && /^FEATURES/o ) {

758

759

# need to read the first line of the feature table

760

$buffer = $self->_readline;

761

762

# DO NOT read lines in the while condition -- this is done as a side

763

# effect in _read_FTHelper_GenBank!

764

765

# part of new circular spec:

766

# commented out for now until kinks worked out

767

#my $sourceEnd = 0;

768

#$sourceEnd = $2 if ($buffer =~ /(\d+?)\.\.(\d+?)$/);

769

770

while ( defined($buffer) ) {

771

772

# check immediately -- not at the end of the loop

773

# note: GenPept entries obviously do not have a BASE line

774

last if ( $buffer =~ /^BASE|ORIGIN|CONTIG|WGS/o );

775

776

# slurp in one feature at a time -- at return, the start of

777

# the next feature will have been read already, so we need

778

# to pass a reference, and the called method must set this

779

# to the last line read before returning

780

781

my $ftunit = $self->_read_FTHelper_GenBank( \$buffer );

782

783

# implement new circular spec: features that cross the origin are now

784

# seamless instead of being 2 separate joined features

785

# commented out until kinks get worked out

786

#if ((! $args{'-nojoin'}) && $ftunit->{'loc'} =~ /^join$(\d+?)\.\.(\d+?),(\d+?)..(\d+?)$$/

787

#&& $sourceEnd == $2 && $3 == 1) {

788

#my $start = $1;

789

#my $end = $2 + $4;

790

#$ftunit->{'loc'} = "$start..$end";

791

792

793

# fix suggested by James Diggans

794

795

if ( !defined $ftunit ) {

796

797

# GRRRR. We have fallen over. Try to recover

798

$self->warn( "Unexpected error in feature table for "

799

. $params{'-display_id'}

800

. " Skipping feature, attempting to recover" );

801

unless ( ( $buffer =~ /^\s{5,5}\S+/o )

802

or ( $buffer =~ /^\S+/o ) )

803

{

804

$buffer = $self->_readline();

805

}

806

next; # back to reading FTHelpers

807

}

808

809

# process ftunit

810

my $feat =

811

$ftunit->_generic_seqfeature( $self->location_factory(),

812

$display_id );

813

814

# add taxon_id from source if available

815

if (

816

$species

817

&& ( $feat->primary_tag eq 'source' )

818

&& $feat->has_tag('db_xref')

819

&& (

820

!$species->ncbi_taxid()

821

|| ( $species->ncbi_taxid

822

&& $species->ncbi_taxid =~ /^list/ )

823

)

824

)

825

{

826

foreach my $tagval ( $feat->get_tag_values('db_xref') ) {

827

if ( index( $tagval, "taxon:" ) == 0 ) {

828

$species->ncbi_taxid( substr( $tagval, 6 ) );

829

last;

830

}

831

}

832

}

833

834

# add feature to list of features

835

push( @features, $feat );

836

}

837

$builder->add_slot_value( -features => \@features );

838

$_ = $buffer;

839

}

840

841

if ( defined($_) ) {

842

# CONTIG lines: TODO, this needs to be cleaned up

843

if (/^CONTIG\s+(.*)/o) {

844

my $ctg = $1;

845

while ( defined( $_ = $self->_readline)) {

846

last if m{^ORIGIN|//}o;

847

s/\s+(.*)/$1/;

848

$ctg .= $_;

849

}

850

if ($ctg) {

851

$annotation->add_Annotation(

852

Bio::Annotation::SimpleValue->new(

853

-tagname => 'contig',

854

-value => $ctg

855

)

856

);

857

}

858

}

859

elsif (/^WGS|WGS_SCAFLD\s+/o) { # catch WGS/WGS_SCAFLD lines

860

while ( $_ =~ s/(^WGS|WGS_SCAFLD)\s+// ) { # gulp lines

861

chomp;

862

$annotation->add_Annotation(

863

Bio::Annotation::SimpleValue->new(

864

-value => $_,

865

-tagname => lc($1)

866

)

867

);

868

$_ = $self->_readline;

869

}

870

}

871

elsif ( !m{^ORIGIN|//}o ) { # advance to the sequence, if any

872

while ( defined( $_ = $self->_readline ) ) {

873

last if m{^(ORIGIN|//)};

874

}

875

}

876

}

877

if ( !$builder->want_object() ) {

878

$builder->make_object(); # implicit end-of-object

879

next RECORDSTART;

880

}

881

if ( $builder->want_slot('seq') ) {

882

# the fact that we want a sequence does not necessarily mean that

883

# there also is a sequence ...

884

if ( defined($_) && s/^ORIGIN\s+// ) {

885

if ( $annotation && length($_) > 0 ) {

886

$annotation->add_Annotation(

887

'origin',

888

Bio::Annotation::SimpleValue->new(

889

-tagname => 'origin',

890

-value => $_

891

)

892

);

893

}

894

my $seqc = '';

895

while ( defined( $_ = $self->_readline ) ) {

896

last if m{^//};

897

$_ = uc($_);

898

s/[^A-Za-z]//g;

899

$seqc .= $_;

900

}

901

902

$builder->add_slot_value( -seq => $seqc );

903

}

904

}

905

elsif ( defined($_) && ( substr( $_, 0, 2 ) ne '//' ) ) {

906

907

# advance to the end of the record

908

while ( defined( $_ = $self->_readline ) ) {

909

last if substr( $_, 0, 2 ) eq '//';

910

}

911

}

912

913

# Unlikely, but maybe the sequence is so weird that we don't want it

914

# anymore. We don't want to return undef if the stream's not exhausted

915

# yet.

916

$seq = $builder->make_object();

917

next RECORDSTART unless $seq;

918

last RECORDSTART;

919

} # end while RECORDSTART

793

920

794

921

return $seq;

795

922

}

Older »