1
# -*-Perl-*- Test Harness script for Bioperl
2
# $Id: SearchIO_blast_pull.t 11525 2007-06-27 10:16:38Z sendu $
10
test_begin(-tests => 289);
12
use_ok('Bio::SearchIO');
16
my $searchio = Bio::SearchIO->new(-format => 'blast_pull', -file => test_input_file('new_blastn.txt'));
18
my $result = $searchio->next_result;
19
is $result->database_name, 'All GenBank+EMBL+DDBJ+PDB sequences (but no EST, STS,GSS,environmental samples or phase 0, 1 or 2 HTGS sequences)';
20
is $result->database_entries, 3742891;
21
is $result->database_letters, 16670205594;
22
is $result->algorithm, 'BLASTN';
23
is $result->algorithm_version, '2.2.13 [Nov-27-2005]';
24
is $result->query_name, 'pyrR,';
25
is $result->query_length, 558;
26
is $result->get_statistic('kappa'), 0.711;
27
is $result->get_statistic('kappa_gapped'), 0.711;
28
is $result->get_statistic('lambda'), 1.37;
29
is $result->get_statistic('lambda_gapped'), 1.37;
30
is $result->get_statistic('entropy'), 1.31;
31
is $result->get_statistic('entropy_gapped'), 1.31;
32
is $result->get_statistic('dbletters'), -509663586;
33
is $result->get_statistic('dbentries'), 3742891;
34
is $result->get_statistic('effectivespace'), 8935230198384;
35
is $result->get_parameter('matrix'), 'blastn matrix:1 -3';
36
is $result->get_parameter('gapopen'), 5;
37
is $result->get_parameter('gapext'), 2;
38
is $result->get_statistic('S2'), 60;
39
is $result->get_statistic('S2_bits'), 119.4;
40
float_is $result->get_parameter('expect'), 1e-23;
41
is $result->get_statistic('num_extensions'), 117843;
43
my @valid = ( [ 'gi|41400296|gb|AE016958.1|', 4829781, 'AE016958', '6e-059', 236],
44
[ 'gi|54013472|dbj|AP006618.1|', 6021225, 'AP006618', '4e-026', 127],
45
[ 'gi|57546753|dbj|BA000030.2|', 9025608, 'BA000030', '1e-023', 119]);
47
while (my $hit = $result->next_hit ) {
50
is $hit->name, shift @$d;
51
is $hit->length, shift @$d;
52
is $hit->accession, shift @$d;
53
float_is($hit->significance, shift @$d);
54
is $hit->raw_score, shift @$d;
58
while( my $hsp = $hit->next_hsp ) {
59
is $hsp->query->start, 262;
60
is $hsp->query->end, 552;
61
is $hsp->hit->start, 1166897;
62
is $hsp->hit->end, 1167187;
63
is $hsp->length('hsp'), 291;
64
is $hsp->start('hit'), $hsp->hit->start;
65
is $hsp->end('query'), $hsp->query->end;
66
is $hsp->strand('sbjct'), $hsp->subject->strand;# alias for hit
67
float_is($hsp->evalue, 6e-59);
70
is sprintf("%.1f",$hsp->percent_identity), 85.2;
71
is sprintf("%.3f",$hsp->frac_identical('query')), 0.852;
72
is sprintf("%.3f",$hsp->frac_identical('hit')), 0.852;
78
last if( $count++ > @valid );
83
$searchio = Bio::SearchIO->new('-format' => 'blast_pull',
84
'-file' => test_input_file('blast_no_hit_desc.txt'));
86
$result = $searchio->next_result;
87
my $hit = $result->next_hit;
88
is $hit->name, 'chr1';
89
is $hit->description, '';
91
# further (NCBI blastn/p) tests taken from SearchIO.t
92
$searchio = Bio::SearchIO->new('-format' => 'blast_pull',
93
'-file' => test_input_file('ecolitst.bls'));
95
$result = $searchio->next_result;
97
is($result->database_name, 'ecoli.aa', 'database_name()');
98
is($result->database_entries, 4289);
99
is($result->database_letters, 1358990);
101
is($result->algorithm, 'BLASTP');
102
like($result->algorithm_version, qr/^2\.1\.3/);
103
like($result->query_name, qr/gi|1786183|gb|AAC73113.1| (AE000111) aspartokinase I,\s+homoserine dehydrogenase I [Escherichia coli]/);
104
is($result->query_length, 820);
105
is($result->get_statistic('kappa'), '0.135');
106
is($result->get_statistic('kappa_gapped'), '0.0410');
107
is($result->get_statistic('lambda'), '0.319');
108
is($result->get_statistic('lambda_gapped'), '0.267');
109
is($result->get_statistic('entropy'), '0.383');
110
is($result->get_statistic('entropy_gapped'), '0.140');
112
is($result->get_statistic('dbletters'), 1358990);
113
is($result->get_statistic('dbentries'), 4289);
114
is($result->get_statistic('effective_hsplength'), 47);
115
is($result->get_statistic('effectivespace'), 894675611);
116
is($result->get_parameter('matrix'), 'BLOSUM62');
117
is($result->get_parameter('gapopen'), 11);
118
is($result->get_parameter('gapext'), 1);
119
is($result->get_statistic('S2'), '92');
120
is($result->get_statistic('S2_bits'), '40.0');
121
float_is($result->get_parameter('expect'), '1.0e-03');
122
is($result->get_statistic('num_extensions'), '82424');
125
@valid = ( [ 'gb|AAC73113.1|', 820, 'AAC73113', '0', 1567],
126
[ 'gb|AAC76922.1|', 810, 'AAC76922', '1e-91', 332],
127
[ 'gb|AAC76994.1|', 449, 'AAC76994', '3e-47', 184]);
129
while (my $hit = $result->next_hit) {
130
my $d = shift @valid;
132
is($hit->name, shift @$d);
133
is($hit->length, shift @$d);
134
is($hit->accession, shift @$d);
135
float_is($hit->significance, shift @$d);
136
is($hit->raw_score, shift @$d );
140
while (my $hsp = $hit->next_hsp) {
141
is($hsp->query->start, 1);
142
is($hsp->query->end, 820);
143
is($hsp->hit->start, 1);
144
is($hsp->hit->end, 820);
145
is($hsp->length('hsp'), 820);
146
is($hsp->start('hit'), $hsp->hit->start);
147
is($hsp->end('query'), $hsp->query->end);
148
is($hsp->strand('sbjct'), $hsp->subject->strand);# alias for hit
149
float_is($hsp->evalue, 0.0);
150
is($hsp->score, 4058);
152
is(sprintf("%.2f",$hsp->percent_identity), 98.29);
153
is(sprintf("%.4f",$hsp->frac_identical('query')), 0.9829);
154
is(sprintf("%.4f",$hsp->frac_identical('hit')), 0.9829);
160
last if( $count++ > @valid );
164
$searchio = Bio::SearchIO->new('-format' => 'blast_pull',
165
'-file' => test_input_file('a_thaliana.blastn'));
167
$result = $searchio->next_result;
168
is($result->database_name, 'All GenBank+EMBL+DDBJ+PDB sequences (but no EST, STS, GSS,or phase 0, 1 or 2 HTGS sequences)');
169
is($result->database_letters, 4677375331);
170
is($result->database_entries, 1083200);
171
is($result->algorithm, 'BLASTN');
172
like($result->algorithm_version, qr/^2\.2\.1/);
173
is($result->query_name, '');
174
is($result->query_length, 60);
175
is($result->get_parameter('gapopen'), 5);
176
is($result->get_parameter('gapext'), 2);
177
is($result->get_parameter('ktup'), undef);
179
is($result->get_statistic('lambda'), 1.37);
180
is($result->get_statistic('kappa'), 0.711);
181
is($result->get_statistic('entropy'),1.31 );
182
is($result->get_statistic('T'), 0);
183
is($result->get_statistic('A'), 30);
184
is($result->get_statistic('X1'), '6');
185
is($result->get_statistic('X1_bits'), 11.9);
186
is($result->get_statistic('X2'), 15);
187
is($result->get_statistic('X2_bits'), 29.7);
188
is($result->get_statistic('S1'), 12);
189
is($result->get_statistic('S1_bits'), 24.3);
190
is($result->get_statistic('S2'), 17);
191
is($result->get_statistic('S2_bits'), 34.2);
193
is($result->get_statistic('dbentries'), 1083200);
195
@valid = ( [ 'gb|AY052359.1|', 2826, 'AY052359', '3e-18', 96, 1, 60,
197
[ 'gb|AC002329.2|AC002329', 76170, 'AC002329', '3e-18', 96, 1, 60,
199
[ 'gb|AF132318.1|AF132318', 5383, 'AF132318', '0.04', 42, 35, 55,
203
while( my $hit = $result->next_hit ) {
204
my $d = shift @valid;
205
is($hit->name, shift @$d);
206
is($hit->length, shift @$d);
207
is($hit->accession, shift @$d);
208
float_is($hit->significance, shift @$d);
209
is($hit->raw_score, shift @$d );
210
is($hit->start, shift @$d);
211
is($hit->end,shift @$d);
212
is(sprintf("%.4f",$hit->frac_aligned_query), shift @$d);
215
while( my $hsp = $hit->next_hsp ) {
216
is($hsp->query->start, 1);
217
is($hsp->query->end, 60);
218
is($hsp->query->strand, 1);
219
is($hsp->hit->start, 154);
220
is($hsp->hit->end, 212);
221
is($hsp->hit->strand, 1);
222
is($hsp->length('hsp'), 60);
223
float_is($hsp->evalue, 3e-18);
226
is(sprintf("%.2f",$hsp->percent_identity), 96.67);
227
is(sprintf("%.4f",$hsp->frac_identical('query')), 0.9667);
228
is(sprintf("%.4f",$hsp->frac_identical('hit')), 0.9831);
229
is($hsp->query->frame(), 0);
230
is($hsp->hit->frame(), 0);
231
is($hsp->query->seq_id, undef);
232
is($hsp->hit->seq_id, 'gb|AY052359.1|');
233
is($hsp->gaps('query'), 0);
234
is($hsp->gaps('hit'), 1);
236
is($hsp->query_string, 'aggaatgctgtttaattggaatcgtacaatggagaatttgacggaaatagaatcaacgat');
237
is($hsp->hit_string, 'aggaatgctgtttaattggaatca-acaatggagaatttgacggaaatagaatcaacgat');
238
is($hsp->homology_string, '||||||||||||||||||||||| |||||||||||||||||||||||||||||||||||');
239
is(sprintf("%.2f",$hsp->get_aln->overall_percentage_identity), 96.67);
240
is(sprintf("%.2f",$hsp->get_aln->percentage_identity), 98.31);
245
last if( $count++ > @valid );
249
$searchio = Bio::SearchIO->new('-format' => 'blast_pull',
250
'-file' => test_input_file('frac_problems.blast'));
251
my @expected = ("1.000", "0.943");
252
while (my $result = $searchio->next_result) {
253
my $hit = $result->next_hit;
254
if (@expected == 2) {
255
is($hit->frac_identical, shift @expected);
259
local $TODO = 'frac_identical failing!';
260
is($hit->frac_identical, shift @expected);
266
# And even more: frac_aligned_query should never be over 1!
267
$searchio = Bio::SearchIO->new('-format' => 'blast_pull',
268
'-file' => test_input_file('frac_problems2.blast'));
269
$result = $searchio->next_result;
270
$hit = $result->next_hit;
271
is $hit->frac_aligned_query, 0.97;
273
# Also, start and end should be sane
274
$searchio = Bio::SearchIO->new('-format' => 'blast_pull',
275
'-file' => test_input_file('frac_problems3.blast'));
276
$result = $searchio->next_result;
277
$hit = $result->next_hit;
278
is $hit->start('sbjct'), 207;
279
is $hit->end('sbjct'), 1051;
281
# Do a multiblast report test
282
$searchio = Bio::SearchIO->new('-format' => 'blast_pull',
283
'-file' => test_input_file('multi_blast.bls'));
285
@expected = qw(CATH_RAT CATL_HUMAN CATL_RAT PAPA_CARPA);
286
my $results_left = 4;
287
while( my $result = $searchio->next_result ) {
288
is($result->query_name, shift @expected, "Multiblast query test");
291
is($results_left, 0);
293
# Web blast result parsing
294
$searchio = Bio::SearchIO->new(-format => 'blast_pull',
295
-file => test_input_file('catalase-webblast.BLASTP'));
296
ok($result = $searchio->next_result);
297
ok($hit = $result->next_hit);
298
is($hit->name, 'gi|40747822|gb|EAA66978.1|', 'full hit name');
299
is($hit->accession, 'EAA66978', 'hit accession');
300
ok(my $hsp = $hit->next_hsp);
301
is($hsp->query->start, 1, 'query start');
302
is($hsp->query->end, 528, 'query start');
304
# tests for new BLAST 2.2.13 output
305
$searchio = Bio::SearchIO->new(-format => 'blast_pull',
306
-file => test_input_file('new_blastn.txt'));
308
$result = $searchio->next_result;
309
is($result->database_name, 'All GenBank+EMBL+DDBJ+PDB sequences (but no EST, STS,GSS,environmental samples or phase 0, 1 or 2 HTGS sequences)');
310
is($result->database_entries, 3742891);
311
is($result->database_letters, 16670205594);
312
is($result->algorithm, 'BLASTN');
313
is($result->algorithm_version, '2.2.13 [Nov-27-2005]');
314
is($result->query_name, 'pyrR,');
315
is($result->query_length, 558);
316
is($result->get_statistic('kappa'), '0.711');
317
is($result->get_statistic('kappa_gapped'), '0.711');
318
is($result->get_statistic('lambda'), '1.37');
319
is($result->get_statistic('lambda_gapped'), '1.37');
320
is($result->get_statistic('entropy'), '1.31');
321
is($result->get_statistic('entropy_gapped'), '1.31');
322
is($result->get_statistic('dbletters'), '-509663586');
323
is($result->get_statistic('dbentries'), 3742891);
324
is($result->get_statistic('effective_hsplength'), undef);
325
is($result->get_statistic('effectivespace'), 8935230198384);
326
is($result->get_parameter('matrix'), 'blastn matrix:1 -3');
327
is($result->get_parameter('gapopen'), 5);
328
is($result->get_parameter('gapext'), 2);
329
is($result->get_statistic('S2'), '60');
330
is($result->get_statistic('S2_bits'), '119.4');
331
float_is($result->get_parameter('expect'), '1e-23');
332
is($result->get_statistic('num_extensions'), '117843');
334
@valid = ( [ 'gi|41400296|gb|AE016958.1|', 4829781, 'AE016958', '6e-059', 236],
335
[ 'gi|54013472|dbj|AP006618.1|', 6021225, 'AP006618', '4e-026', 127],
336
[ 'gi|57546753|dbj|BA000030.2|', 9025608, 'BA000030', '1e-023', 119]);
338
while( $hit = $result->next_hit ) {
339
my $d = shift @valid;
341
is($hit->name, shift @$d);
342
is($hit->length, shift @$d);
343
is($hit->accession, shift @$d);
344
float_is($hit->significance, shift @$d);
345
is($hit->raw_score, shift @$d );
349
while( my $hsp = $hit->next_hsp ) {
350
is($hsp->query->start, 262);
351
is($hsp->query->end, 552);
352
is($hsp->hit->start, 1166897);
353
is($hsp->hit->end, 1167187);
354
is($hsp->length('hsp'), 291);
355
is($hsp->start('hit'), $hsp->hit->start);
356
is($hsp->end('query'), $hsp->query->end);
357
is($hsp->strand('sbjct'), $hsp->subject->strand);# alias for hit
358
float_is($hsp->evalue, 6e-59);
359
is($hsp->score, 119);
361
is(sprintf("%.2f",$hsp->percent_identity), 85.22);
362
is(sprintf("%.4f",$hsp->frac_identical('query')), 0.8522);
363
is(sprintf("%.4f",$hsp->frac_identical('hit')), 0.8522);
369
last if( $count++ > @valid );
374
$searchio = Bio::SearchIO->new(-format => 'blast_pull',
375
-file => test_input_file('blastp2215.blast'));
377
$result = $searchio->next_result;
378
is($result->database_entries, 4460989);
379
is($result->database_letters, 1533424333);
380
is($result->algorithm, 'BLASTP');
381
is($result->algorithm_version, '2.2.15 [Oct-15-2006]');
382
is($result->query_name, 'gi|15608519|ref|NP_215895.1|');
383
is($result->query_length, 193);
384
my @hits = $result->hits;
385
is(scalar(@hits), 10);
386
is($hits[1]->accession,'1W30');
387
float_is($hits[4]->significance,'2e-72');
388
is($hits[7]->score,'254');
389
$result = $searchio->next_result;
390
is($result->database_entries, 4460989);
391
is($result->database_letters, 1533424333);
392
is($result->algorithm, 'BLASTP');
393
is($result->algorithm_version, '2.2.15 [Oct-15-2006]');
394
is($result->query_name, 'gi|15595598|ref|NP_249092.1|');
395
is($result->query_length, 423);
396
@hits = $result->hits;
397
is(scalar(@hits), 10);
398
is($hits[1]->accession,'ZP_00972546');
399
float_is($hits[4]->significance, '0.0');
400
is($hits[7]->score, 624);
403
$searchio = Bio::SearchIO->new(-format => 'blast_pull',
405
-file => test_input_file('bug2246.blast'));
406
$result = $searchio->next_result;
407
$hit = $result->next_hit;
408
is $hit->name, 'UniRef50_Q9X0H5';
409
is $hit->length, undef;
410
is $hit->accession, 'UniRef50_Q9X0H5';
411
is $hit->description, 'Cluster: Histidyl-tRNA synthetase; n=4; Thermoto...';
412
is $hit->raw_score, 23;
413
float_is($hit->significance, 650);
415
#*** need to /fully/ test a multi-result, multi-hit, multi-hsp report!