1
1
# -*-Perl-*- mode (to keep my emacs happy)
2
# $Id: SeqIO.t,v 1.77 2003/12/17 13:14:28 heikki Exp $
2
# $Id: SeqIO.t,v 1.99 2005/08/28 13:56:45 bosborne Exp $
5
use vars qw($DEBUG $TESTCOUNT);
6
$DEBUG = $ENV{'BIOPERLDEBUG'} || 0;
9
eval { require Test; };
15
plan tests => $TESTCOUNT;
7
eval { require Test; };
20
use Bio::SeqIO::MultiFile;
22
use Bio::Annotation::Collection;
26
19
# Set to -1 for release version, so warnings aren't printed
27
my $verbosity = $DEBUG ? 1 : -1;
30
# Basic read and/or write tests
20
my $verbose = $ENV{'BIOPERLDEBUG'} || 0;
22
# Basic read and/or write tests for SeqIO. Specific tests for
23
# given module should go into their own file.
25
my @formats = qw(gcg fasta raw pir tab ace );
26
# The following files or formats are failing: swiss genbank interpro embl
28
foreach my $format (@formats) {
29
print "======== $format ========\n" if $verbose;
38
my $str = Bio::SeqIO->new(-file=> Bio::Root::IO->catfile("t","data","test.$format"),
39
'-format' => $format);
40
ok $seq = $str->next_seq();
41
print "Sequence 1 of 2 from $format stream:\n", $seq->seq, "\n\n" if $DEBUG;
43
unless ($format eq 'raw') {
44
ok $seq->id, 'roa1_drome',"ID for format $format";
48
unless ($format eq 'gcg') { # GCG file can contain only one sequence
49
ok $seq = $str->next_seq();
50
print "Sequence 2 of 2 from $format stream:\n", $seq->seq, $seq->seq, "\n" if $DEBUG;
53
my $out = Bio::SeqIO->new('-file'=> ">". Bio::Root::IO->catfile("t","data","$format.out"),
54
'-format' => $format);
55
ok $out->write_seq($seq);
56
if ($format eq 'fasta') {
58
ok($id_type = $out->preferred_id_type('accession.version'), 'accession.version');
63
my @formats = qw(gcg fasta raw pir tab);
65
foreach my $format (@formats) {
66
print "======== $format ========\n" if $DEBUG;
36
my $str = Bio::SeqIO->new(-file=> Bio::Root::IO->catfile
37
("t","data","test.$format"),
39
ok $seq = $str->next_seq();
40
print "Sequence 1 of 2 from $format stream:\n", $seq->seq, "\n\n"
42
unless ($format eq 'raw') {
43
ok $seq->id, 'roa1_drome',"ID for format $format";
47
unless ($format eq 'gcg') { # GCG file can contain only one sequence
48
ok $seq = $str->next_seq();
49
print "Sequence 2 of 2 from $format stream:\n", $seq->seq,
50
$seq->seq, "\n" if $verbose;
53
my $out = Bio::SeqIO->new(-file => ">". Bio::Root::IO->catfile
54
("t","data","$format.out"),
56
ok $out->write_seq($seq);
57
if ($format eq 'fasta') {
59
ok($id_type = $out->preferred_id_type('accession.version'),
72
map { unlink Bio::Root::IO->catfile("t","data","$_.out") } @formats
76
my ($str, $seq,$ast,$temp,$mf,$ent,$out); # predeclare variables for strict
79
$str = Bio::SeqIO->new('-file' => Bio::Root::IO->catfile("t","data","seqfile.pir"),
82
$out = new Bio::SeqIO(-format => 'pir', -fh => \*STDOUT);
84
while( $seq = $str->next_seq()) {
85
# ok($seq->id, qr /^[PF]1/ );
87
$out->write_seq($seq) if $verbosity > 0;
90
$ast = Bio::SeqIO->new( '-format' => 'GenBank' ,
91
'-file' => Bio::Root::IO->catfile("t","data","roa1.genbank"));
92
$ast->verbose($verbosity);
93
my $as = $ast->next_seq();
94
ok $as->molecule, 'mRNA';
95
ok $as->alphabet, 'dna';
96
ok($as->primary_id, 3598416);
99
$ast = Bio::SeqIO->new( '-format' => 'genbank' ,
100
'-file' => Bio::Root::IO->catfile("t","data",
102
$ast->verbose($verbosity);
103
$as = $ast->next_seq();
104
ok $as->molecule, 'DNA';
105
ok $as->alphabet, 'dna';
106
ok($as->primary_id, 37539616);
107
ok($as->accession_number, 'NT_021877');
109
my ($cds) = grep { $_->primary_tag eq 'CDS' } $as->get_SeqFeatures();
110
ok(($cds->get_tag_values('transl_except'))[1],
111
'(pos:complement(4224..4226),aa:OTHER)');
113
$ast = Bio::SeqIO->new( '-format' => 'embl' ,
114
'-file' => Bio::Root::IO->catfile("t","data","roa1.dat"));
115
$ast->verbose($verbosity);
116
$as = $ast->next_seq();
118
ok($as->display_id, 'HSHNCPA1');
119
ok($as->accession_number, 'X79536');
120
ok($as->seq_version, 1);
122
ok($as->desc, 'H.sapiens mRNA for hnRNPcore protein A1');
123
ok($as->molecule, 'RNA');
124
ok($as->alphabet, 'rna');
125
ok(scalar $as->all_SeqFeatures(), 4);
126
ok($as->length, 1198);
127
ok($as->species->binomial(), 'Homo sapiens');
129
$ent = Bio::SeqIO->new( '-file' => Bio::Root::IO->catfile("t","data","test.embl"),
130
'-format' => 'embl');
132
$seq = $ent->next_seq();
134
ok(defined $seq->seq(), 1,
135
'failure to read Embl with ^ location and badly split double quotes');
136
ok(scalar $seq->annotation->get_Annotations('reference'), 3);
138
$out = Bio::SeqIO->new('-file'=> ">embl.out",
139
'-format' => 'embl');
141
ok($out->write_seq($seq),1,
142
'failure to write Embl format with ^ < and > locations');
146
my $kegg = Bio::SeqIO->new( '-format' => 'kegg' ,
147
'-file' => Bio::Root::IO->catfile("t","data","AHCYL1.kegg"));
150
$kegg = $kegg->next_seq();
152
ok($kegg->accession, '10768');
153
ok($kegg->display_id, 'AHCYL1');
154
ok($kegg->alphabet, 'dna');
156
ok($kegg->translate->seq);
157
ok(($kegg->annotation->get_Annotations('description'))[0]->text,
158
'S-adenosylhomocysteine hydrolase-like 1 [EC:3.3.1.1]');
159
ok( (grep {$_->database eq 'KO'}
160
$kegg->annotation->get_Annotations('dblink'))[0]->comment,
161
'adenosylhomocysteinase' );
162
ok( (grep {$_->database eq 'PATH'}
163
$kegg->annotation->get_Annotations('dblink'))[0]->primary_id,
167
$mf = Bio::SeqIO::MultiFile->new( '-format' => 'Fasta' ,
169
[ Bio::Root::IO->catfile("t","data","multi_1.fa"),
170
Bio::Root::IO->catfile("t","data","multi_2.fa")]);
175
while( $seq = $mf->next_seq() ) {
177
$temp = $seq->display_id;
184
$ast = Bio::SeqIO->new( '-verbosity' => $verbosity,
185
'-format' => 'swiss' ,
186
'-file' => Bio::Root::IO->catfile("t","data","roa1.swiss"));
187
$as = $ast->next_seq();
190
ok($as->id, 'ROA1_HUMAN', "id is ".$as->id);
191
#ok($as->primary_id, 'ROA1');
192
skip($as->primary_id =~ /^Bio::Seq::/, $as->primary_id, 'ROA1');
193
ok($as->length, 371);
194
ok($as->alphabet, 'protein');
195
ok($as->division, 'HUMAN');
196
ok(scalar $as->all_SeqFeatures(), 16);
198
ok(scalar $as->annotation->get_Annotations('reference'), 11);
200
($ent, $seq, $out,$as) = undef;
207
my $t_file = Bio::Root::IO->catfile("t","data","test.ace");
212
open BEFORE, $t_file;
217
my $a_in = Bio::SeqIO->new( -FILE => $t_file, -FORMAT => 'ace');
219
while (my $a = $a_in->next_seq) {
223
ok @a_seq, 3, 'wrong number of sequence objects';
225
my $esc_name = $a_seq[1]->display_id;
226
ok( $esc_name , 'Name; 4% strewn with \ various / escaped characters',
227
"bad unescaping of characters, $esc_name");
229
ok $a_seq[0]->alphabet, 'protein', 'alphabets incorrectly detected';
230
ok $a_seq[1]->alphabet, 'dna', 'alphabets incorrectly detected';
232
my $o_file = Bio::Root::IO->catfile("t","data","test.out.ace");
233
my $a_out = Bio::SeqIO->new( -FILE => "> $o_file", -FORMAT => 'ace');
235
foreach my $a (@a_seq) {
236
$a_out->write_seq($a) or $a_out_ok = 0;
238
undef($a_out); # Flush to disk
239
ok $a_out_ok,1,'error writing sequence';
251
ok( ($before and $after and ($before eq $after)),1,
252
'test output file differs from input');
256
# streaming & Bio::RichSeq creation
259
my $stream = Bio::SeqIO->new('-file' => Bio::Root::IO->catfile("t","data","test.genbank"),
260
'-format' => 'GenBank');
261
$stream->verbose($verbosity);
266
my @ids = qw(DDU63596 DDU63595 HUMBDNF);
267
my @tids = (44689, 44689, 9606);
268
my @tnames = ("Dictyostelium discoideum","Dictyostelium discoideum","Homo sapiens");
269
while($seq = $stream->next_seq()) {
271
ok $seq->display_id(), $ids[$seqnum];
272
$species = $seq->species();
273
@cl = $species->classification();
274
ok( $species->binomial(), $tnames[$seqnum],
275
'species parsing incorrect for genbank');
276
ok( $cl[3] ne $species->genus(), 1,
277
'genus duplicated in genbank parsing');
278
ok( $species->ncbi_taxid, $tids[$seqnum] );
283
ok $lasts->display_id(), "HUMBETGLOA";
284
my ($ref) = $lasts->annotation->get_Annotations('reference');
285
ok($ref->medline, 94173918);
288
$stream = Bio::SeqIO->new(
289
'-file' => Bio::Root::IO->catfile("t","data","test.genbank.noseq"),
290
'-format' => 'GenBank',
293
while($seq = $stream->next_seq()) {
295
ok $seq->display_id(), $ids[$seqnum];
296
} elsif( $seq->display_id eq 'M37762') {
297
ok( ($seq->get_keywords())[0], 'neurotrophic factor');
301
ok $seqnum, 5, "Total number of sequences in test file";
303
$ent = Bio::SeqIO->new( '-file' => Bio::Root::IO->catfile("t","data","test.embl"),
304
'-format' => 'embl');
305
$ent->verbose($verbosity);
306
$seq = $ent->next_seq();
307
$species = $seq->species();
308
@cl = $species->classification();
309
ok( $cl[3] ne $species->genus(), 1, 'genus duplicated in EMBL parsing');
312
$seq = Bio::SeqIO->new( '-format' => 'GenBank' ,
313
-file => Bio::Root::IO->catfile("t","data","testfuzzy.genbank"));
314
$seq->verbose($verbosity);
315
ok(defined($as = $seq->next_seq()));
317
my @features = $as->all_SeqFeatures();
319
my $lastfeature = pop @features;
321
# this is a split location; the root doesn't have strand
322
ok($lastfeature->strand, undef);
323
my $location = $lastfeature->location;
324
$location->verbose(-1); # silence the warning of undef seq_id()
325
# see above; splitlocs roots do not have a strand really
326
ok($location->strand, undef);
327
ok($location->start, 83202);
328
ok($location->end, 84996);
330
my @sublocs = $location->sub_Location();
333
my $loc = shift @sublocs;
334
ok($loc->start, 83202);
335
ok($loc->end, 83329);
336
ok($loc->strand, -1);
338
$loc = shift @sublocs;
339
ok($loc->start, 84248);
340
ok($loc->end, 84996);
343
$seq = Bio::SeqIO->new( '-format' => 'GenBank' ,
344
-file => ">".Bio::Root::IO->catfile("t","data","genbank.fuzzyout"));
345
$seq->verbose($verbosity);
346
ok($seq->write_seq($as));
347
unlink(Bio::Root::IO->catfile("t","data","genbank.fuzzyout"));
349
my $seqio = Bio::SeqIO->new( '-format' => 'swiss' ,
350
-file => Bio::Root::IO->catfile("t","data","swiss.dat"));
352
ok(defined( $seq = $seqio->next_seq));
354
# more tests to verify we are actually parsing correctly
355
skip($seq->primary_id =~ /^Bio::Seq/, $seq->primary_id, 'MA32');
356
ok($seq->display_id, 'MA32_HUMAN');
357
ok($seq->length, 282);
358
ok($seq->division, 'HUMAN');
359
ok($seq->alphabet, 'protein');
360
ok(scalar $seq->all_SeqFeatures(), 2);
362
my @genenames = qw(GC1QBP HABP1 SF2P32 C1QBP);
363
my ($ann) = $seq->annotation->get_Annotations('gene_name');
364
foreach my $gn ( $ann->get_all_values() ) {
365
ok ($gn, shift(@genenames));
367
ok $ann->value(-joins => [" AND "," OR "]), "GC1QBP OR HABP1 OR SF2P32 OR C1QBP";
369
# test for feature locations like ?..N
370
ok(defined( $seq = $seqio->next_seq));
372
skip($seq->primary_id =~ /^Bio::Seq/, $seq->primary_id, 'ACON');
373
ok($seq->display_id, 'ACON_CAEEL');
374
ok($seq->length, 788);
375
ok($seq->division, 'CAEEL');
376
ok($seq->alphabet, 'protein');
377
ok(scalar $seq->all_SeqFeatures(), 5);
379
foreach my $gn ( $seq->annotation->get_Annotations('gene_name') ) {
380
ok ($gn->value, 'F54H12.1');
384
# test species in swissprot -- this can be a n:n nightmare
385
ok ($seq = $seqio->next_seq());
386
my @sec_acc = $seq->get_secondary_accessions();
387
ok ($sec_acc[0], 'P29360');
388
ok ($sec_acc[1], 'Q63631');
389
ok ($seq->accession_number, 'P42655');
390
my @kw = $seq->get_keywords;
391
ok( $kw[0], 'Brain');
392
ok( $kw[1], 'Neurone');
393
ok ($kw[3], 'Multigene family');
394
ok ($seq->display_id, '143E_HUMAN');
395
ok ($seq->species->binomial, "Homo sapiens");
396
ok ($seq->species->common_name, "Human");
397
ok ($seq->species->ncbi_taxid, 9606);
399
ok ($seq = $seqio->next_seq());
400
ok ($seq->species->binomial, "Bos taurus");
401
ok ($seq->species->common_name, "Bovine");
402
ok ($seq->species->ncbi_taxid, 9913);
404
# multiple genes in swissprot
405
ok ($seq = $seqio->next_seq());
407
($ann) = $seq->annotation->get_Annotations("gene_name");
408
@genenames = qw(CALM1 CAM1 CALM CAM CALM2 CAM2 CAMB CALM3 CAM3 CAMC);
409
foreach my $gn ( $ann->get_all_values() ) {
410
ok ($gn, shift(@genenames));
412
ok $ann->value(-joins => [" AND "," OR "]),
413
"(CALM1 OR CAM1 OR CALM OR CAM) AND (CALM2 OR CAM2 OR CAMB) AND (CALM3 OR CAM3 OR CAMC)";
415
# test dos Linefeeds in gcg parser
416
$str = Bio::SeqIO->new('-file' => Bio::Root::IO->catfile("t","data","test_badlf.gcg"),
420
ok ( $seq = $str->next_seq());
421
ok(length($seq->seq) > 0 );
422
print "Sequence 1 of 1 from GCG stream:\n", $seq->seq, "\n" if( $DEBUG);
425
$str = new Bio::SeqIO(-format => 'genbank',
426
-file => Bio::Root::IO->catfile("t","data",
428
-verbose => $verbosity);
430
$seq = $str->next_seq;
431
@features = $seq->all_SeqFeatures();
433
ok($features[0]->start, 1);
434
ok($features[0]->end, 226);
435
$location = $features[1]->location;
436
ok($location->isa('Bio::Location::SplitLocationI'));
437
@sublocs = $location->sub_Location();
440
# version and primary ID - believe it or not, this wasn't working
441
ok ($seq->version, 1);
442
ok ($seq->seq_version, 1);
443
ok ($seq->primary_id, "5734104");
446
# test embl writing of a PrimarySeq
448
my $primaryseq = new Bio::PrimarySeq( -seq => 'AGAGAGAGATA',
452
-accession_number => 'myaccession');
454
my $embl = new Bio::SeqIO(-format => 'embl',
455
-verbose => $verbosity,
456
-file => ">primaryseq.embl");
458
ok($embl->write_seq($primaryseq));
461
$embl->write_seq($scalar);
465
unlink("primaryseq.embl");
468
# revcomp split location
469
my $gb = new Bio::SeqIO(-format => 'genbank',
470
-file => Bio::Root::IO->catfile(qw(t data revcomp_mrna.gb)));
472
$seq = $gb->next_seq();
474
$gb = new Bio::SeqIO(-format => 'genbank',
475
-file => ">tmp_revcomp_mrna.gb");
477
$gb->write_seq($seq);
479
ok(! -z "tmp_revcomp_mrna.gb");
481
# INSERT DIFFING CODE HERE
483
unlink("tmp_revcomp_mrna.gb");
486
# test secondary accessions in EMBL (bug #1332)
488
$seqio = new Bio::SeqIO(-format =>'embl', -file => Bio::Root::IO->catfile
489
( qw(t data ECAPAH02.embl)));
490
$seq = $seqio->next_seq;
492
ok($seq->accession_number, 'D10483');
493
ok($seq->seq_version, 2);
494
my @accs = $seq->get_secondary_accessions();
495
ok($accs[0], 'J01597');
496
ok($accs[-1], 'X56742');
498
$seqio = new Bio::SeqIO(-format => 'genbank',
499
-file => Bio::Root::IO->catfile(qw(t data
502
$seq = $seqio->next_seq;
503
@kw = $seq->get_keywords;
506
@sec_acc = $seq->get_secondary_accessions();
507
ok(scalar @sec_acc,14);
508
ok($sec_acc[-1], 'X56742');
511
### TPA TESTS - Thanks to Richard Adams ###
513
### test Third Party Annotation entries in EMBL/Gb format
514
# to ensure compatability with parsers.
517
$str = new Bio::SeqIO(-format =>'embl', -file => Bio::Root::IO->catfile
518
( qw(t data BN000066-tpa.embl)));
519
$seq = $str->next_seq;
521
ok($seq->accession_number, 'BN000066');
522
ok($seq->alphabet, 'dna');
523
ok($seq->display_id, 'AGA000066');
524
ok($seq->length, 5195);
525
ok($seq->division, 'INV');
526
ok($seq->get_dates, 2);
527
ok($seq->keywords, 'acetylcholinesterase; achE1 gene; Third Party Annotation; TPA.');
528
ok($seq->seq_version, 1);
529
ok($seq->feature_count, 15);
531
my $spec_obj = $seq->species;
532
ok ($spec_obj->common_name, 'African malaria mosquito');
533
ok ($spec_obj->species, 'gambiae');
534
ok ($spec_obj->genus, 'Anopheles');
535
ok ($spec_obj->binomial, 'Anopheles gambiae');
537
my $ac = $seq->annotation;
538
my $reference = ($ac->get_Annotations('reference') )[1];
539
ok ($reference->title,'"A novel acetylcholinesterase gene in mosquitoes codes for the insecticide target and is non-homologous to the ace gene in Drosophila"');
540
ok ($reference->authors,'Weill M., Fort P., Berthomi eu A., Dubois M.P., Pasteur N., Raymond M.');
541
my $cmmnt = ($ac->get_Annotations('comment') )[0];
542
ok($cmmnt->text, 'see also AJ488492 for achE-1 from Kisumu strain Third Party Annotation Database: This TPA record uses Anopheles gambiae trace archive data (http://trace.ensembl.org) ');
546
$str = new Bio::SeqIO(-format =>'genbank', -file => Bio::Root::IO->catfile
547
( qw(t data BK000016-tpa.gbk)));
548
$seq = $str->next_seq;
550
ok(defined $seq->seq);
551
ok($seq->accession_number, 'BK000016');
552
ok($seq->alphabet, 'dna');
553
ok($seq->display_id, 'BK000016');
554
ok($seq->length, 1162);
555
ok($seq->division, 'ROD');
556
ok($seq->get_dates, 1);
557
ok($seq->keywords, 'Third Party Annotation; TPA');
558
ok($seq->desc, 'TPA: Mus musculus pantothenate kinase 4 mRNA, partial cds.');
559
ok($seq->seq_version, 1);
560
ok($seq->feature_count, 2);
561
$spec_obj = $seq->species;
562
ok ($spec_obj->common_name, 'Mus musculus (house mouse)');
563
ok ($spec_obj->species, 'musculus');
564
ok ($spec_obj->genus, 'Mus');
565
ok ($spec_obj->binomial, 'Mus musculus');
566
$ac = $seq->annotation;
567
$reference = ($ac->get_Annotations('reference') )[0];
568
ok ($reference->pubmed, '11479594');
569
ok ($reference->medline, '21372465');
571
# validate that what is written is what is read
573
my $testfile = "testtpa.gbk";
574
$out = new Bio::SeqIO(-file => ">$testfile",
575
-format => 'genbank');
576
$out->write_seq($seq);
579
$str = new Bio::SeqIO(-format =>'genbank',
581
$seq = $str->next_seq;
583
ok(defined $seq->seq);
584
ok($seq->accession_number, 'BK000016');
585
ok($seq->alphabet, 'dna');
586
ok($seq->display_id, 'BK000016');
587
ok($seq->length, 1162);
588
ok($seq->division, 'ROD');
589
ok($seq->get_dates, 1);
590
ok($seq->keywords, 'Third Party Annotation; TPA');
591
ok($seq->desc, 'TPA: Mus musculus pantothenate kinase 4 mRNA, partial cds.');
592
ok($seq->seq_version, 1);
593
ok($seq->feature_count, 2);
594
$spec_obj = $seq->species;
595
ok ($spec_obj->common_name, 'Mus musculus (house mouse)');
596
ok ($spec_obj->species, 'musculus');
597
ok ($spec_obj->genus, 'Mus');
598
ok ($spec_obj->binomial, 'Mus musculus');
599
$ac = $seq->annotation;
600
$reference = ($ac->get_Annotations('reference') )[0];
601
ok ($reference->pubmed, '11479594');
602
ok ($reference->medline, '21372465');
608
$str = new Bio::SeqIO(-verbose => $verbosity,
609
-file => Bio::Root::IO->catfile
610
(qw(t data D12555.gbk)));
612
$seq = $str->next_seq;
65
map { unlink Bio::Root::IO->catfile("t","data","$_.out") } @formats