2
## Bioperl Test Harness Script for Modules
3
## $Id: swiss.t,v 1.3.4.4 2006/11/09 00:40:53 cjfields Exp $
5
# Before `make install' is performed this script should be runnable with
6
# `make test'. After `make install' it should work as `perl test.t'
13
# to handle systems with no installed Test module
14
# we include the t dir (where a copy of Test.pm is located)
16
# Try to load Test::More; a load failure is trapped by the eval.
# NOTE(review): nothing visible in this view inspects $@ afterwards — confirm against the full file.
eval { require Test::More; };
30
# Delete the two scratch files by name (presumably leftovers from an earlier run — confirm).
unlink(qw (swiss_unk.dat test.swiss));
34
# Bio::Root::IO provides the catfile() calls used below to build data-file paths.
use_ok('Bio::Root::IO');
35
# Verbosity handed to the Bio::SeqIO constructors; taken from the BIOPERLDEBUG environment variable.
my $verbose = $ENV{'BIOPERLDEBUG'};
37
# Open a test data file that lives under t/data.
# NOTE(review): this constructor call is cut off in this view (its remaining
# arguments and closing parentheses are not visible) — confirm against the full file.
my $seqio = new Bio::SeqIO( -verbose => $verbose,
39
-file => Bio::Root::IO->catfile('t','data',
42
isa_ok($seqio, 'Bio::SeqIO');
43
my $seq = $seqio->next_seq;
44
# Keep the gene_name annotations of the entry as first read; they are compared
# with the re-read entry's gene names further down.
my @gns = $seq->annotation->get_Annotations('gene_name');
46
# Re-bind $seqio to a second stream and write the entry through it.
# NOTE(review): the catfile arguments are not visible here; presumably the
# target is test.swiss, which is read back right afterwards — confirm.
$seqio = new Bio::SeqIO( -verbose => $verbose,
48
-file => Bio::Root::IO->catfile
51
$seqio->write_seq($seq);
53
# reads it in once again: parse test.swiss and run the checks below on that entry
$seqio = new Bio::SeqIO( -verbose => $verbose,
56
-file => Bio::Root::IO->catfile('test.swiss'));
58
$seq = $seqio->next_seq;
59
# The species must be a Bio::Taxon carrying NCBI taxon id 6239.
isa_ok($seq->species, 'Bio::Taxon');
60
is($seq->species->ncbi_taxid, 6239);
62
# version, seq_update, dates (5 tests)
63
is($seq->version, 40);
64
# NOTE(review): $ann is fetched here but no assertion on it is visible in this view.
my ($ann) = $seq->get_Annotations('seq_update');
66
my @dates = $seq->get_dates;
67
# Expected dates, consumed front to back as the returned dates are iterated.
my @date_check = qw(01-NOV-1997 01-NOV-1997 16-OCT-2001);
68
# NOTE(review): the closing brace of this loop is not visible in this view.
for my $date (@dates) {
69
is($date, shift @date_check);
72
my @gns2 = $seq->annotation->get_Annotations('gene_name');
73
# check gene name is preserved (was losing suffix in worm gene names)
74
# Passes only if exactly one gene_name annotation was read back ($#gns2 == 0)
# and its value equals the value of the first annotation captured in @gns.
ok($#gns2 == 0 && $gns[0]->value eq $gns2[0]->value);
76
# test swissprot multiple RP lines
# Reads the first entry of t/data/P33897 (no -format or -verbose is passed here).
77
my $str = Bio::SeqIO->new(-file => Bio::Root::IO->catfile
78
(qw(t data P33897) ));
79
$seq = $str->next_seq;
80
isa_ok($seq, 'Bio::Seq::RichSeqI');
81
my @refs = $seq->annotation->get_Annotations('reference');
83
# The 21st reference (index 20) must report this exact rp() text.
is($refs[20]->rp, 'VARIANTS X-ALD LEU-98; ASP-99; GLU-217; GLN-518; ASP-608; ILE-633 AND PRO-660, AND VARIANT THR-13.');
85
# version, seq_update, dates (5 tests)
86
is($seq->version, 44);
87
# NOTE(review): $ann is fetched here but no assertion on it is visible in this view.
($ann) = $seq->get_Annotations('seq_update');
89
@dates = $seq->get_dates;
90
# Expected dates, consumed front to back as the returned dates are iterated.
@date_check = qw(01-FEB-1994 01-FEB-1994 15-JUN-2004);
91
# NOTE(review): the closing brace of this loop is not visible in this view.
for my $date (@dates) {
92
is($date, shift @date_check);
95
# Parse the first entry of t/data/roa1.swiss and check its basic properties.
my $ast = Bio::SeqIO->new(-verbose => $verbose,
97
-file => Bio::Root::IO->catfile("t","data","roa1.swiss"));
98
my $as = $ast->next_seq();
101
# The third argument is the test name; it embeds the id that was actually parsed.
is($as->id, 'ROA1_HUMAN', "id is ".$as->id);
102
# primary_id must contain the text "Bio::PrimarySeq" (qr(...) uses parentheses as delimiters).
like($as->primary_id, qr(Bio::PrimarySeq));
103
is($as->length, 371);
104
is($as->alphabet, 'protein');
105
is($as->division, 'HUMAN');
106
# scalar() turns the returned lists into counts: 16 features, 11 references.
is(scalar $as->all_SeqFeatures(), 16);
107
is(scalar $as->annotation->get_Annotations('reference'), 11);
109
# version, seq_update, dates (5 tests)
110
is($as->version, 35);
111
# NOTE(review): $ann is fetched here but no assertion on it is visible in this view.
($ann) = $as->get_Annotations('seq_update');
113
@dates = $as->get_dates;
114
# Expected dates, consumed front to back as the returned dates are iterated.
@date_check = qw(01-MAR-1989 01-AUG-1990 01-NOV-1997);
115
# NOTE(review): the closing brace of this loop is not visible in this view.
for my $date (@dates) {
116
is($date, shift @date_check);
119
# Declares $ent and $out, both left undefined.
# NOTE(review): neither variable is used anywhere in the visible part of the file.
my ($ent,$out) = undef;
122
# Open t/data/swiss.dat with an explicit 'swiss' format; the entries of this
# file are read one by one with successive next_seq calls.
$seqio = Bio::SeqIO->new(-format => 'swiss' ,
123
-verbose => $verbose,
124
-file => Bio::Root::IO->catfile
125
("t","data","swiss.dat"));
126
$seq = $seqio->next_seq;
127
isa_ok($seq, 'Bio::Seq::RichSeqI');
129
# more tests to verify we are actually parsing correctly
130
# primary_id must contain the text "Bio::PrimarySeq" (qr(...) uses parentheses as delimiters).
like($seq->primary_id, qr(Bio::PrimarySeq));
131
is($seq->display_id, 'MA32_HUMAN');
132
is($seq->length, 282);
133
is($seq->division, 'HUMAN');
134
is($seq->alphabet, 'protein');
135
my @f = $seq->all_SeqFeatures();
137
# The second feature (index 1) must be a CHAIN whose first 'description' tag value matches.
is($f[1]->primary_tag, 'CHAIN');
138
is(($f[1]->get_tag_values('description'))[0], 'COMPLEMENT COMPONENT 1, Q SUBCOMPONENT BINDING PROTEIN');
140
# version, seq_update, dates (5 tests)
141
is($seq->version, 40);
142
# NOTE(review): $ann is fetched here but no assertion on it is visible in this view.
($ann) = $seq->get_Annotations('seq_update');
144
@dates = $seq->get_dates;
145
# Expected dates, consumed front to back as the returned dates are iterated.
@date_check = qw(01-FEB-1995 01-FEB-1995 01-OCT-2000);
146
# NOTE(review): the closing brace of this loop is not visible in this view.
for my $date (@dates) {
147
is($date, shift @date_check);
150
# Gene names of the current entry: every value held by the first 'gene_name'
# annotation must equal the expected name at the same position.
my @genenames = qw(GC1QBP HABP1 SF2P32 C1QBP);
($ann) = $seq->annotation->get_Annotations('gene_name');
foreach my $gn ( $ann->get_all_values() ) {
    # is() compares against the expected value. Test::More's ok() would take
    # the second argument as the test name and only check that $gn is true.
    is($gn, shift(@genenames));
}
# The flattened text form must list the alternatives joined by " OR ".
is($ann->value(-joins => [" AND "," OR "]), "GC1QBP OR HABP1 OR SF2P32 OR C1QBP");
157
# test for feature locations like ?..N
# Advances $seqio to its next entry and checks that entry's basic properties.
158
$seq = $seqio->next_seq();
159
isa_ok($seq, 'Bio::Seq::RichSeqI');
160
# primary_id must contain the text "Bio::PrimarySeq" (qr(...) uses parentheses as delimiters).
like($seq->primary_id, qr(Bio::PrimarySeq));
161
is($seq->display_id, 'ACON_CAEEL');
162
is($seq->length, 788);
163
is($seq->division, 'CAEEL');
164
is($seq->alphabet, 'protein');
165
# scalar() turns the returned feature list into a count.
is(scalar $seq->all_SeqFeatures(), 5);
167
# Every gene_name annotation on the current entry must carry the value 'F54H12.1'.
foreach my $gn ( $seq->annotation->get_Annotations('gene_name') ) {
    # is() performs the comparison. Test::More's ok() would only test that the
    # value is true and would use 'F54H12.1' as the test name.
    is($gn->value, 'F54H12.1');
}
171
# test species in swissprot -- this can be a n:n nightmare
# Advances $seqio to its next entry and checks accessions, keywords and species.
172
$seq = $seqio->next_seq();
173
isa_ok($seq, 'Bio::Seq::RichSeqI');
174
# primary_id must contain the text "Bio::PrimarySeq" (qr(...) uses parentheses as delimiters).
like($seq->primary_id, qr(Bio::PrimarySeq));
175
# The first two secondary accessions and the primary accession number.
my @sec_acc = $seq->get_secondary_accessions();
176
is($sec_acc[0], 'P29360');
177
is($sec_acc[1], 'Q63631');
178
is($seq->accession_number, 'P42655');
179
# Keywords are checked by position; index 2 is not checked in the visible code.
my @kw = $seq->get_keywords;
180
is( $kw[0], 'Brain');
181
is( $kw[1], 'Neurone');
182
is($kw[3], 'Multigene family');
183
is($seq->display_id, '143E_HUMAN');
184
# Species of this entry: binomial name, common name and NCBI taxon id.
is($seq->species->binomial, "Homo sapiens");
185
is($seq->species->common_name, "Human");
186
is($seq->species->ncbi_taxid, 9606);
188
# Next entry from the same stream: the same three species checks with bovine values.
$seq = $seqio->next_seq();
189
isa_ok($seq, 'Bio::Seq::RichSeqI');
190
like($seq->primary_id, qr(Bio::PrimarySeq));
191
is($seq->species->binomial, "Bos taurus");
192
is($seq->species->common_name, "Bovine");
193
is($seq->species->ncbi_taxid, 9913);
195
# multiple genes in swissprot
# Advances $seqio to its next entry and checks the values of its first gene_name annotation.
196
$seq = $seqio->next_seq();
197
isa_ok($seq, 'Bio::Seq::RichSeqI');
198
# primary_id must contain the text "Bio::PrimarySeq" (qr(...) uses parentheses as delimiters).
like($seq->primary_id, qr(Bio::PrimarySeq));
200
($ann) = $seq->annotation->get_Annotations("gene_name");
201
# Expected individual names, in order; @genenames is copied again further down
# for the calm.swiss checks, so it is not consumed directly.
@genenames = qw(CALM1 CAM1 CALM CAM CALM2 CAM2 CAMB CALM3 CAM3 CAMC);
202
# Expected flattened text when value() is called with the " AND " / " OR " joins used below.
my $flatnames = "(CALM1 OR CAM1 OR CALM OR CAM) AND (CALM2 OR CAM2 OR CAMB) AND (CALM3 OR CAM3 OR CAMC)";
204
my @names = @genenames; # copy array
205
my @ann_names = $ann->get_all_values();
207
# Same number of names first, then each name in order.
is(scalar(@ann_names), scalar(@names));
208
# NOTE(review): the closing brace of this loop is not visible in this view.
foreach my $gn (@ann_names) {
209
is($gn, shift(@names));
211
is($ann->value(-joins => [" AND "," OR "]), $flatnames);
213
# same entry as before, but with the new gene names format
# Reads t/data/calm.swiss and repeats the gene-name checks against the same
# expected values (@genenames and $flatnames).
214
$seqio = Bio::SeqIO->new(-format => 'swiss',
215
-verbose => $verbose,
216
-file => Bio::Root::IO->catfile
217
("t","data","calm.swiss"));
218
$seq = $seqio->next_seq();
219
isa_ok($seq, 'Bio::Seq::RichSeqI');
220
# primary_id must contain the text "Bio::PrimarySeq" (qr(...) uses parentheses as delimiters).
like($seq->primary_id, qr(Bio::PrimarySeq));
221
($ann) = $seq->annotation->get_Annotations("gene_name");
222
my @ann_names2 = $ann->get_all_values();
223
@names = @genenames; # copy array
224
is(scalar(@ann_names2), scalar(@names));
225
# NOTE(review): the closing brace of this loop is not visible in this view.
foreach my $gn (@ann_names2) {
226
is($gn, shift(@names));
228
is($ann->value(-joins => [" AND "," OR "]), $flatnames);
230
# test proper parsing of references
231
my @litrefs = $seq->annotation->get_Annotations('reference');
232
# The entry must yield 17 reference annotations.
is(scalar(@litrefs), 17);
235
# Expected reference titles, in order; the embedded double quotes are part of each expected value.
# NOTE(review): the statement that opens this list is not visible in this view;
# presumably it fills @titles, which the loop at the end consumes — confirm.
# NOTE(review): only 16 titles are visible here although 17 references are
# expected and 17 locations follow — one title line appears to be missing from this view.
'"Complete amino acid sequence of human brain calmodulin."',
236
'"Multiple divergent mRNAs code for a single human calmodulin."',
237
'"Molecular analysis of human and rat calmodulin complementary DNA clones. Evidence for additional active genes in these species."',
238
'"Isolation and nucleotide sequence of a cDNA encoding human calmodulin."',
239
'"Structure of the human CALM1 calmodulin gene and identification of two CALM1-related pseudogenes CALM1P1 and CALM1P2."',
241
'"Characterization of the human CALM2 calmodulin gene and comparison of the transcriptional activity of CALM1, CALM2 and CALM3."',
242
'"Cloning of human full-length CDSs in BD Creator(TM) system donor vector."',
243
'"The DNA sequence and analysis of human chromosome 14."',
244
'"Generation and initial analysis of more than 15,000 full-length human and mouse cDNA sequences."',
245
'"Alpha-helix nucleation by a calcium-binding peptide loop."',
246
'"Solution structure of Ca(2+)-calmodulin reveals flexible hand-like properties of its domains."',
247
'"Calmodulin structure refined at 1.7 A resolution."',
248
'"Drug binding by calmodulin: crystal structure of a calmodulin-trifluoperazine complex."',
249
'"Structural basis for the activation of anthrax adenylyl cyclase exotoxin by calmodulin."',
250
'"Physiological calcium concentrations regulate calmodulin binding and catalysis of adenylyl cyclase exotoxins."',
251
'"Crystal structure of a MARCKS peptide containing the calmodulin-binding domain in complex with Ca2+-calmodulin."',
255
# Expected reference locations (journal citations or submission notes), in order.
# NOTE(review): the opener of this list is not visible either; presumably it fills @locs — confirm.
"Biochemistry 21:2565-2569(1982).",
256
"J. Biol. Chem. 263:17055-17062(1988).",
257
"J. Biol. Chem. 262:16663-16670(1987).",
258
"Biochem. Int. 9:177-185(1984).",
259
"Eur. J. Biochem. 225:71-82(1994).",
260
"Submitted (FEB-1995) to the EMBL/GenBank/DDBJ databases.",
261
"Cell Calcium 23:323-338(1998).",
262
"Submitted (MAY-2003) to the EMBL/GenBank/DDBJ databases.",
263
"Nature 421:601-607(2003).",
264
"Proc. Natl. Acad. Sci. U.S.A. 99:16899-16903(2002).",
265
"Proc. Natl. Acad. Sci. U.S.A. 96:903-908(1999).",
266
"Nat. Struct. Biol. 8:990-997(2001).",
267
"J. Mol. Biol. 228:1177-1192(1992).",
268
"Biochemistry 33:15259-15265(1994).",
269
"Nature 415:396-402(2002).",
270
"EMBO J. 21:6721-6732(2002).",
271
"Nat. Struct. Biol. 10:226-231(2003).",
294
# For each parsed reference, compare title and location with the next expected
# value, then start and end with the next two values taken from @positions.
# NOTE(review): the definition of @positions and the closing brace of this loop
# are not visible in this view.
foreach my $litref (@litrefs) {
295
is($litref->title, shift(@titles));
296
is($litref->location, shift(@locs));
297
is($litref->start, shift(@positions));
298
is($litref->end, shift(@positions));
301
# format parsing changes (pre-rel 9.0)
# NOTE(review): the constructor call below is cut off in this view (the data-file
# name and the closing parentheses are not visible) — confirm against the full file.
303
$seqio = new Bio::SeqIO( -verbose => $verbose,
305
-file => Bio::Root::IO->catfile('t','data',
309
$seq = $seqio->next_seq;
310
isa_ok($seq->species, 'Bio::Taxon');
311
# The expected taxon id is written as a string here; is() compares values as strings.
is($seq->species->ncbi_taxid, "6239");
313
# version, seq_update, dates (5 tests)
314
is($seq->version, 44);
315
# NOTE(review): $ann is fetched here but no assertion on it is visible in this view.
($ann) = $seq->get_Annotations('seq_update');
317
@dates = $seq->get_dates;
318
# Expected dates, consumed front to back as the returned dates are iterated.
@date_check = qw(01-NOV-1997 01-NOV-1996 30-MAY-2006 );
319
# NOTE(review): the closing brace of this loop is not visible in this view.
for my $date (@dates) {
320
is($date, shift @date_check);
323
# Expected primary ids of the entry's 'dblink' annotations, in the order the
# annotations are returned.
my @idcheck = qw(Z66513 T22647 Cel.30446 Q06319 Q20772 F54D5.7 WBGene00010052
324
F54D5.7 GO:0005515 IPR006089 IPR006091 IPR006090
325
IPR006092 IPR009075 IPR009100 IPR013764 PF00441
326
PF02770 PF02771 PS00072 PS00073);
328
# NOTE(review): the closing brace of this loop is not visible in this view.
for my $dblink ( $seq->annotation->get_Annotations('dblink') ) {
329
is($dblink->primary_id, shift @idcheck);
332
# Open another file under t/data and check the namespace reported by each entry.
# NOTE(review): the constructor call is cut off in this view (file name and
# closing parentheses are not visible) — confirm against the full file.
$seqio = new Bio::SeqIO( -verbose => $verbose,
334
-file => Bio::Root::IO->catfile('t','data',
337
# Expected namespace per entry, in reading order.
my @namespaces = qw(Swiss-Prot TrEMBL TrEMBL);
339
# The loop variable is a fresh lexical $seq that shadows the outer one inside the loop.
# NOTE(review): the closing brace of this loop is not visible in this view.
while (my $seq = $seqio->next_seq) {
340
is($seq->namespace, shift @namespaces);
343
# format parsing changes (rel 9.0, Oct 2006)
# NOTE(review): the constructor call below is cut off in this view (the data-file
# name and the closing parentheses are not visible) — confirm against the full file.
345
$seqio = new Bio::SeqIO( -verbose => $verbose,
347
-file => Bio::Root::IO->catfile('t','data',
351
$seq = $seqio->next_seq;
352
isa_ok($seq->species, 'Bio::Taxon');
353
is($seq->species->ncbi_taxid, 6239);
355
is($seq->version, 47);
356
# NOTE(review): $ann is fetched here but no assertion on it is visible in this view.
($ann) = $seq->get_Annotations('seq_update');
358
@dates = $seq->get_dates;
359
# Expected dates, consumed front to back as the returned dates are iterated.
@date_check = qw(01-NOV-1997 01-NOV-1996 31-OCT-2006 );
360
# NOTE(review): the closing brace of this loop is not visible in this view.
for my $date (@dates) {
361
is($date, shift @date_check);
364
# Expected primary ids of the entry's 'dblink' annotations, in the order the
# annotations are returned.
@idcheck = qw(Z66513 T22647 Cel.30446 Q06319 Q20772 F54D5.7 cel:F54D5.7
365
WBGene00010052 F54D5.7 GO:0005515 IPR006089 IPR006091 IPR006090
366
IPR006092 IPR009075 IPR013786 IPR009100 IPR013764 PF00441 PF02770
367
PF02771 PS00072 PS00073 );
369
# NOTE(review): the closing brace of this loop is not visible in this view.
for my $dblink ( $seq->annotation->get_Annotations('dblink') ) {
370
is($dblink->primary_id, shift @idcheck);
373
# Open another file under t/data and check the namespace reported by each entry.
# NOTE(review): the constructor call is cut off in this view (file name and
# closing parentheses are not visible) — confirm against the full file.
$seqio = new Bio::SeqIO( -verbose => $verbose,
375
-file => Bio::Root::IO->catfile('t','data',
378
# Expected namespace per entry, in reading order.
@namespaces = qw(Swiss-Prot TrEMBL TrEMBL);
380
# The loop variable is a fresh lexical $seq that shadows the outer one inside the loop.
# NOTE(review): the closing brace of this loop is not visible in this view.
while (my $seq = $seqio->next_seq) {
381
is($seq->namespace, shift @namespaces);