541
693
ok($hsp->query_string, 'aggaatgctgtttaattggaatcgtacaatggagaatttgacggaaatagaatcaacgat');
542
694
ok($hsp->hit_string, 'aggaatgctgtttaattggaatca-acaatggagaatttgacggaaatagaatcaacgat');
543
695
ok($hsp->homology_string, '||||||||||||||||||||||| |||||||||||||||||||||||||||||||||||');
546
last if( $count++ > @valid );
549
# TODO: Flesh this test out!
550
$searchio = new Bio::SearchIO ('-format' => 'psiblast',
696
ok(sprintf("%.2f",$hsp->get_aln->overall_percentage_identity), 96.67);
697
ok(sprintf("%.2f",$hsp->get_aln->percentage_identity), 98.31);
700
last if( $count++ > @valid );
705
$searchio = new Bio::SearchIO('-format' => 'blast',
706
'-file' => Bio::Root::IO->catfile('t','data','dnaEbsub_ecoli.wublastx'));
708
$result = $searchio->next_result;
709
ok($result->database_name, 'ecoli.aa');
710
ok($result->database_letters, 1358990);
711
ok($result->database_entries, 4289);
712
ok($result->algorithm, 'BLASTX');
713
ok($result->algorithm_version, qr/^2\.0MP\-WashU/);
714
ok($result->query_name, 'gi|142864|gb|M10040.1|BACDNAE');
715
ok($result->query_description, 'B.subtilis dnaE gene encoding DNA primase, complete cds');
716
ok($result->query_accession, 'M10040.1');
717
ok($result->query_length, 2001);
718
ok($result->get_parameter('matrix'), 'blosum62');
720
ok($result->get_statistic('lambda'), 0.318);
721
ok($result->get_statistic('kappa'), 0.135);
722
ok($result->get_statistic('entropy'),0.401 );
724
ok($result->get_statistic('dbentries'), 4289);
726
@valid = ( [ 'gi|1789447|gb|AAC76102.1|', 581, 'AAC76102', '1.1e-74', 671]);
729
while( my $hit = $result->next_hit ) {
730
my $d = shift @valid;
731
ok($hit->name, shift @$d);
732
ok($hit->length, shift @$d);
733
ok($hit->accession, shift @$d);
734
ok(sprintf("%g",$hit->significance), sprintf("%g",shift @$d) );
735
ok($hit->raw_score, shift @$d );
738
while( my $hsp = $hit->next_hsp ) {
739
ok($hsp->query->start, 21);
740
ok($hsp->query->end, 1265);
741
ok($hsp->query->strand, 1);
742
ok($hsp->hit->start, 1);
743
ok($hsp->hit->end, 413);
744
ok($hsp->hit->strand, 0);
745
ok($hsp->length('hsp'), 421);
746
ok(sprintf("%g",$hsp->evalue), sprintf("%g",'1.1e-74'));
747
ok(sprintf("%g",$hsp->pvalue), sprintf("%g",'1.1e-74'));
749
ok($hsp->bits,265.8);
750
ok(sprintf("%.2f",$hsp->percent_identity), 35.87);
751
ok(sprintf("%.4f",$hsp->frac_identical('query')), 0.1213);
752
ok(sprintf("%.4f",$hsp->frac_identical('hit')), 0.3656);
753
ok(sprintf("%.4f",$hsp->frac_conserved('hsp')), 0.5297);
754
ok($hsp->query->frame(), 2);
755
ok($hsp->hit->frame(), 0);
756
ok($hsp->gaps('query'), 6);
757
ok($hsp->gaps('hit'), 8);
759
ok($hsp->query_string, 'MGNRIPDEIVDQVQKSADIVEVIGDYVQLKKQGRNYFGLCPFHGESTPSFSVSPDKQIFHCFGCGAGGNVFSFLRQMEGYSFAESVSHLADKYQIDFPDDITVHSGARP---ESSGEQKMAEAHELLKKFYHHLLINTKEGQEALDYLLSRGFTKELINEFQIGYALDSWDFITKFLVKRGFSEAQMEKAGLLIRREDGSGYFDRFRNRVMFPIHDHHGAVVAFSGRALGSQQPKYMNSPETPLFHKSKLLYNFYKARLHIRKQERAVLFEGFADVYTAVSSDVKESIATMGTSLTDDHVKILRRNVEEIILCYDSDKAGYEATLKASELL---QKKGCKVRVAMIPDGLDPDDYIKKFGGEKFKNDIIDASVTVMAFKMQYFRKGKNLSDEGDRLAYIKDVLKEISTLSGSLEQEVYVKQ');
760
ok($hsp->hit_string, 'MAGRIPRVFINDLLARTDIVDLIDARVKLKKQGKNFHACCPFHNEKTPSFTVNGEKQFYHCFGCGAHGNAIDFLMNYDKLEFVETVEELAAMHNLEVPFE----AGSGPSQIERHQRQTLYQLMDGLNTFYQQSL-QQPVATSARQYLEKRGLSHEVIARFAIGFAPPGWDNVLKRFGGNPENRQSLIDAGMLVTNDQGRSY-DRFRERVMFPIRDKRGRVIGFGGRVLGNDTPKYLNSPETDIFHKGRQLYGLYEAQQDNAEPNRLLVVEGYMDVVALAQYGINYAVASLGTSTTADHIQLLFRATNNVICCYDGDRAGRDAAWRALETALPYMTDGRQLRFMFLPDGEDPDTLVRKEGKEAFEARM-EQAMPLSAFLFNSLMPQVDLSTPDGRARLSTLALPLISQVPGETLR-IYLRQ');
761
ok($hsp->homology_string, 'M RIP ++ + DIV++I V+LKKQG+N+ CPFH E TPSF+V+ +KQ +HCFGCGA GN FL + F E+V LA + ++ P + +G+ P E Q + + + L FY L A YL RG + E+I F IG+A WD + K + + AG+L+ + G Y DRFR RVMFPI D G V+ F GR LG+ PKY+NSPET +FHK + LY Y+A+ + R ++ EG+ DV + ++A++GTS T DH+++L R +I CYD D+AG +A +A E G ++R +PDG DPD ++K G E F+ + + ++ + AF +LS R L IS + G + +Y++Q');
764
last if( $count++ > @valid );
769
$searchio = new Bio::SearchIO('-format' => 'blast',
770
'-file' => Bio::Root::IO->catfile('t','data','dnaEbsub_ecoli.wutblastn'));
772
$result = $searchio->next_result;
773
ok($result->database_name, 'ecoli.nt');
774
ok($result->database_letters, 4662239);
775
ok($result->database_entries, 400);
776
ok($result->algorithm, 'TBLASTN');
777
ok($result->algorithm_version, qr/^2\.0MP\-WashU/);
778
ok($result->query_name, 'gi|142865|gb|AAA22406.1|');
779
ok($result->query_description, 'DNA primase');
780
ok($result->query_accession, 'AAA22406.1');
781
ok($result->query_length, 603);
782
ok($result->get_parameter('matrix'), 'blosum62');
784
ok($result->get_statistic('lambda'), '0.320');
785
ok($result->get_statistic('kappa'), 0.136);
786
ok($result->get_statistic('entropy'),0.387 );
788
ok($result->get_statistic('dbentries'), 400);
790
@valid = ( [ 'gi|1789441|gb|AE000388.1|AE000388', 10334, 'AE000388', '1.4e-73', 671]);
793
while( my $hit = $result->next_hit ) {
794
my $d = shift @valid;
795
ok($hit->name, shift @$d);
796
ok($hit->length, shift @$d);
797
ok($hit->accession, shift @$d);
798
ok(sprintf("%g",$hit->significance), sprintf("%g",shift @$d) );
799
ok($hit->raw_score, shift @$d );
802
while( my $hsp = $hit->next_hsp ) {
803
ok($hsp->query->start, 1);
804
ok($hsp->query->end, 415);
805
ok($hsp->query->strand, 0);
806
ok($hsp->hit->start, 4778);
807
ok($hsp->hit->end, 6016);
808
ok($hsp->hit->strand, 1);
809
ok($hsp->length('hsp'), 421);
810
ok(sprintf("%g",$hsp->evalue), sprintf("%g",'1.4e-73'));
811
ok(sprintf("%g",$hsp->pvalue), sprintf("%g",'1.4e-73'));
813
ok($hsp->bits,265.8);
814
ok(sprintf("%.2f",$hsp->percent_identity), 35.87);
815
ok(sprintf("%.4f",$hsp->frac_identical('hit')), 0.1219);
816
ok(sprintf("%.4f",$hsp->frac_identical('query')), 0.3639);
817
ok(sprintf("%.4f",$hsp->frac_conserved('hsp')), 0.5297);
818
ok($hsp->query->frame(), 0);
819
ok($hsp->hit->frame(), 1);
820
ok($hsp->gaps('query'), 6);
821
ok($hsp->gaps('hit'), 8);
823
ok($hsp->query_string, 'MGNRIPDEIVDQVQKSADIVEVIGDYVQLKKQGRNYFGLCPFHGESTPSFSVSPDKQIFHCFGCGAGGNVFSFLRQMEGYSFAESVSHLADKYQIDFPDDITVHSGARP---ESSGEQKMAEAHELLKKFYHHLLINTKEGQEALDYLLSRGFTKELINEFQIGYALDSWDFITKFLVKRGFSEAQMEKAGLLIRREDGSGYFDRFRNRVMFPIHDHHGAVVAFSGRALGSQQPKYMNSPETPLFHKSKLLYNFYKARLHIRKQERAVLFEGFADVYTAVSSDVKESIATMGTSLTDDHVKILRRNVEEIILCYDSDKAGYEATLKASELL---QKKGCKVRVAMIPDGLDPDDYIKKFGGEKFKNDIIDASVTVMAFKMQYFRKGKNLSDEGDRLAYIKDVLKEISTLSGSLEQEVYVKQ');
824
ok($hsp->hit_string, 'MAGRIPRVFINDLLARTDIVDLIDARVKLKKQGKNFHACCPFHNEKTPSFTVNGEKQFYHCFGCGAHGNAIDFLMNYDKLEFVETVEELAAMHNLEVPFE----AGSGPSQIERHQRQTLYQLMDGLNTFYQQSL-QQPVATSARQYLEKRGLSHEVIARFAIGFAPPGWDNVLKRFGGNPENRQSLIDAGMLVTNDQGRSY-DRFRERVMFPIRDKRGRVIGFGGRVLGNDTPKYLNSPETDIFHKGRQLYGLYEAQQDNAEPNRLLVVEGYMDVVALAQYGINYAVASLGTSTTADHIQLLFRATNNVICCYDGDRAGRDAAWRALETALPYMTDGRQLRFMFLPDGEDPDTLVRKEGKEAFEARM-EQAMPLSAFLFNSLMPQVDLSTPDGRARLSTLALPLISQVPGETLR-IYLRQ');
825
ok($hsp->homology_string, 'M RIP ++ + DIV++I V+LKKQG+N+ CPFH E TPSF+V+ +KQ +HCFGCGA GN FL + F E+V LA + ++ P + +G+ P E Q + + + L FY L A YL RG + E+I F IG+A WD + K + + AG+L+ + G Y DRFR RVMFPI D G V+ F GR LG+ PKY+NSPET +FHK + LY Y+A+ + R ++ EG+ DV + ++A++GTS T DH+++L R +I CYD D+AG +A +A E G ++R +PDG DPD ++K G E F+ + + ++ + AF +LS R L IS + G + +Y++Q');
828
last if( $count++ > @valid );
832
$searchio = new Bio::SearchIO('-format' => 'blast',
833
'-file' => Bio::Root::IO->catfile('t','data','dnaEbsub_ecoli.wutblastx'));
835
$result = $searchio->next_result;
836
ok($result->database_name, 'ecoli.nt');
837
ok($result->database_letters, 4662239);
838
ok($result->database_entries, 400);
839
ok($result->algorithm, 'TBLASTX');
840
ok($result->algorithm_version, qr/^2\.0MP\-WashU/);
841
ok($result->query_name, 'gi|142864|gb|M10040.1|BACDNAE');
842
ok($result->query_description, 'B.subtilis dnaE gene encoding DNA primase, complete cds');
843
ok($result->query_accession, 'M10040.1');
844
ok($result->query_length, 2001);
845
ok($result->get_parameter('matrix'), 'blosum62');
847
ok($result->get_statistic('lambda'), 0.318);
848
ok($result->get_statistic('kappa'), 0.135);
849
ok($result->get_statistic('entropy'),0.401 );
850
ok($result->get_statistic('dbentries'), 400);
852
@valid = ( [ 'gi|1789441|gb|AE000388.1|AE000388', 10334, 'AE000388', '6.4e-70', 318],
853
[ 'gi|2367383|gb|AE000509.1|AE000509', 10589, 'AE000509', '0.9992', 59]
857
while( my $hit = $result->next_hit ) {
858
my $d = shift @valid;
859
ok($hit->name, shift @$d);
860
ok($hit->length, shift @$d);
861
ok($hit->accession, shift @$d);
862
# using e here to deal with 0.9992 coming out right here as well
863
ok(sprintf("%g",$hit->significance), sprintf("%g",shift @$d) );
864
ok($hit->raw_score, shift @$d );
868
while( my $hsp = $hit->next_hsp ) {
870
if( $hspcounter == 3 ) {
871
# let's actually look at the 3rd HSP
872
ok($hsp->query->start, 441);
873
ok($hsp->query->end, 617);
874
ok($hsp->query->strand, 1);
875
ok($hsp->hit->start, 5192);
876
ok($hsp->hit->end, 5368);
877
ok($hsp->hit->strand, 1);
878
ok($hsp->length('hsp'), 59);
879
ok(sprintf("%g",$hsp->evalue), sprintf("%g",'6.4e-70'));
880
ok(sprintf("%g",$hsp->pvalue), sprintf("%g",'6.4e-70'));
883
ok(sprintf("%.2f",$hsp->percent_identity), '32.20');
884
ok(sprintf("%.4f",$hsp->frac_identical('hit')), 0.1073);
885
ok(sprintf("%.4f",$hsp->frac_identical('query')), 0.1073);
886
ok(sprintf("%.4f",$hsp->frac_conserved('hsp')), 0.4746);
887
ok($hsp->query->frame(), 2);
888
ok($hsp->hit->frame(), 1);
889
ok($hsp->gaps('query'), 0);
890
ok($hsp->gaps('hit'), 0);
892
ok($hsp->query_string, 'ALDYLLSRGFTKELINEFQIGYALDSWDFITKFLVKRGFSEAQMEKAGLLIRREDGSGY');
893
ok($hsp->hit_string, 'ARQYLEKRGLSHEVIARFAIGFAPPGWDNVLKRFGGNPENRQSLIDAGMLVTNDQGRSY');
894
ok($hsp->homology_string, 'A YL RG + E+I F IG+A WD + K + + AG+L+ + G Y');
897
} elsif( $count == 1 ) {
898
while( my $hsp = $hit->next_hsp ) {
899
ok($hsp->query->start, 587);
900
ok($hsp->query->end, 706);
901
ok($hsp->query->strand, -1);
902
ok($hsp->hit->start, 4108);
903
ok($hsp->hit->end, 4227);
904
ok($hsp->hit->strand, -1);
905
ok($hsp->length('hsp'), 40);
906
ok($hsp->evalue == '7.1');
907
ok($hsp->pvalue == '1.00');
910
ok(sprintf("%.2f",$hsp->percent_identity), '37.50');
911
ok(sprintf("%.4f",$hsp->frac_identical('hit')), '0.1250');
912
ok(sprintf("%.4f",$hsp->frac_identical('query')), '0.1250');
913
ok(sprintf("%.4f",$hsp->frac_conserved('hsp')), '0.4750');
914
ok($hsp->query->frame(), 2);
915
ok($hsp->hit->frame(), 2);
916
ok($hsp->gaps('query'), 0);
917
ok($hsp->gaps('hit'), 0);
919
ok($hsp->query_string, 'WLPRALPEKATTAP**SWIGNMTRFLKRSKYPLPSSRLIR');
920
ok($hsp->hit_string, 'WLSRTTVGSSTVSPRTFWITRMKVKLSSSKVTLPSTKSTR');
921
ok($hsp->homology_string, 'WL R +T +P WI M L SK LPS++ R');
925
last if( $count++ > @valid );
928
# Do a multiblast report test
929
$searchio = new Bio::SearchIO ('-format' => 'blast',
930
'-file' => Bio::Root::IO->catfile('t','data','multi_blast.bls'));
932
my @expected = qw(CATH_RAT CATL_HUMAN CATL_RAT PAPA_CARPA);
933
while( my $result = $searchio->next_result ) {
934
ok($result->query_name, shift @expected, "Multiblast query test");
938
# Test GCGBlast parsing
940
$searchio = new Bio::SearchIO('-format' => 'blast',
941
'-file' => Bio::Root::IO->catfile('t','data', 'test.gcgblast'));
942
$result = $searchio->next_result();
944
ok($result->query_name, '/v0/people/staji002/test.gcg');
945
ok($result->algorithm, 'BLASTP');
946
ok($result->algorithm_version, '2.2.1 [Apr-13-2001]');
947
ok($result->database_name, 'pir');
948
ok($result->database_entries, 274514);
949
ok($result->database_letters, 93460074);
951
$hit = $result->next_hit;
952
ok($hit->name, 'PIR2:S44629');
953
ok($hit->length, 628);
954
ok($hit->accession, 'PIR2:S44629');
955
skip('Significance parsing broken for GCG-BLAST Hits -- see HSP',$hit->significance, '2e-08' );
956
skip('Raw score parsing broken for GCG-BLAST Hits -- see HSP',$hit->raw_score, 57 );
958
$hsp = $hit->next_hsp;
959
ok(sprintf("%g",$hsp->evalue), sprintf("%g",'2e-08'));
960
ok($hsp->bits, '57.0');
961
ok($hsp->score, 136);
962
ok(int($hsp->percent_identity), 28);
963
ok(sprintf("%.2f",$hsp->frac_identical('query')), 0.29);
964
ok($hsp->frac_conserved('total'), 69/135);
965
ok($hsp->gaps('total'), 8);
966
ok($hsp->gaps('hit'), 6);
967
ok($hsp->gaps('query'), 2);
969
ok($hsp->hit->start, 342);
970
ok($hsp->hit->end, 470);
971
ok($hsp->query->start, 3);
972
ok($hsp->query->end, 135);
974
ok($hsp->query_string, 'CAAEFDFMEKETPLRYTKTXXXXXXXXXXXXXXRKIISDMWGVLAKQQTHVRKHQFDHGELVYHALQLLAYTALGILIMRLKLFLTPYMCVMASLICSRQLFGW--LFCKVHPGAIVFVILAAMSIQGSANLQTQ');
975
ok($hsp->hit_string, 'CSAEFDFIQYSTIEKLCGTLLIPLALISLVTFVFNFVKNT-NLLWRNSEEIG----ENGEILYNVVQLCCSTVMAFLIMRLKLFMTPHLCIVAALFANSKLLGGDRISKTIRVSALVGVI-AILFYRGIPNIRQQ');
976
ok($hsp->homology_string, 'C+AEFDF++ T + T + + +L + + ++GE++Y+ +QL T + LIMRLKLF+TP++C++A+L + +L G + + A+V VI A + +G N++ Q');
979
$searchio = new Bio::SearchIO ('-format' => 'blast',
551
980
'-file' => Bio::Root::IO->catfile('t','data','HUMBETGLOA.tblastx'));
553
982
$result = $searchio->next_result;
556
985
$hit = $result->next_hit;
986
ok($hit->accession, 'AE000479');
987
ok($hit->bits, 33.6);
557
988
$hsp = $hit->next_hsp;
989
ok($hit->hsp->bits,$hsp->bits);
558
991
ok($hsp->get_aln->isa('Bio::Align::AlignI'));
559
992
my $writer = Bio::SearchIO::Writer::HitTableWriter->new(
565
1000
frac_identical_query
569
1004
my $out = new Bio::SearchIO(-writer => $writer,
570
-file => ">searchio.out");
1005
-file => ">searchio.out");
571
1006
$out->write_result($result, 1);
572
1007
ok(-e 'searchio.out');
573
1008
my $writerhtml = new Bio::SearchIO::Writer::HTMLResultWriter();
574
1009
my $outhtml = new Bio::SearchIO(-writer => $writerhtml,
575
1010
-file => ">searchio.html");
1011
$outhtml->write_result($result, 1);
576
1012
ok(-e "searchio.html");
579
unlink 'searchio.out';
580
unlink 'searchio.html';
1014
unlink 'searchio.out';
1015
unlink 'searchio.html';
1017
#test all the database accession number formats
1018
$searchio = new Bio::SearchIO(-format => 'blast',
1019
-file => 't/data/testdbaccnums.out');
1020
$result = $searchio->next_result;
1022
@valid = ( ['pir||T14789','T14789','T14789','CAB53709','AAH01726'],
1023
['gb|NP_065733.1|CYT19', 'NP_065733','CYT19'],
1024
['emb|XP_053690.4|Cyt19','XP_053690'],
1025
['dbj|NP_056277.2|DKFZP586L0724','NP_056277'],
1026
['prf||XP_064862.2','XP_064862'],
1027
['pdb|BAB13968.1|1','BAB13968'],
1028
['sp|Q16478|GLK5_HUMAN','Q16478'],
1029
['pat|US|NP_002079.2','NP_002079'],
1030
['bbs|NP_079463.2|','NP_079463'],
1031
['gnl|db1|NP_002444.1','NP_002444'],
1032
['ref|XP_051877.1|','XP_051877'],
1033
['lcl|AAH16829.1|','AAH16829'],
1034
['gi|1|gb|NP_065733.1|CYT19','NP_065733'],
1035
['gi|2|emb|XP_053690.4|Cyt19','XP_053690'],
1036
['gi|3|dbj|NP_056277.2|DKFZP586L0724','NP_056277'],
1037
['gi|4|pir||T14789','T14789'],
1038
['gi|5|prf||XP_064862.2','XP_064862'],
1039
['gi|6|pdb|BAB13968.1|1','BAB13968'],
1040
['gi|7|sp|Q16478|GLK5_HUMAN','Q16478'],
1041
['gi|8|pat|US|NP_002079.2','NP_002079'],
1042
['gi|9|bbs|NP_079463.2|','NP_079463'],
1043
['gi|10|gnl|db1|NP_002444.1','NP_002444'],
1044
['gi|11|ref|XP_051877.1|','XP_051877'],
1045
['gi|12|lcl|AAH16829.1|','AAH16829'],
1046
['MY_test_ID','MY_test_ID']
1049
$hit = $result->next_hit;
1050
my $d = shift @valid;
1051
ok($hit->name, shift @$d);
1052
ok($hit->accession, shift @$d);
1053
my @accnums = $hit->each_accession_number;
1054
foreach my $a (@accnums) {
1058
$hit = $result->next_hit;
1059
ok($hit->name, shift @$d);
1060
ok($hit->accession, shift @$d);
1061
ok($hit->locus, shift @$d);
1063
while( $hit = $result->next_hit ) {
1064
my $d = shift @valid;
1065
ok($hit->name, shift @$d);
1066
ok($hit->accession, shift @$d);
1071
# parse the BLAST-like output
1072
my $infile = Bio::Root::IO->catfile(qw(t data 503384.MEGABLAST.2));
1073
my $in = new Bio::SearchIO(-file => $infile,
1074
-format => 'blast'); # this is megablast
1076
my $r = $in->next_result;
1077
my @dcompare = ( ['Contig3700', 5631, 785, '0.0', 785, '0.0', 396, 639, 12,
1078
8723,9434, 1, 4083, 4794, -1],
1079
['Contig3997', 12734, 664, '0.0', 664, '0.0', 335, 401, 0,
1080
1282, 1704, 1, 1546, 1968,-1 ],
1081
['Contig634', 858, 486, '1e-136', 486, '1e-136', 245, 304, 3,
1082
7620, 7941, 1, 1, 321, -1],
1083
['Contig1853', 2314, 339, '1e-91',339, '1e-91', 171, 204, 0,
1084
6406, 6620, 1, 1691, 1905, 1]
1087
ok($r->query_name, '503384');
1088
ok($r->query_description, '11337 bp 2 contigs');
1089
ok($r->query_length, 11337);
1090
ok($r->database_name, 'cneoA.nt ');
1091
ok($r->database_letters, 17206226);
1092
ok($r->database_entries, 4935);
1094
while( my $hit = $r->next_hit ) {
1095
my $d = shift @dcompare;
1096
ok($hit->name, shift @$d);
1097
ok($hit->length, shift @$d);
1098
ok($hit->raw_score, shift @$d);
1099
ok($hit->significance, shift @$d);
1101
my $hsp = $hit->next_hsp;
1102
ok($hsp->bits, shift @$d);
1103
ok($hsp->evalue, shift @$d);
1104
ok($hsp->score, shift @$d);
1105
ok($hsp->num_identical, shift @$d);
1106
ok($hsp->gaps('total'), shift @$d);
1107
ok($hsp->query->start, shift @$d);
1108
ok($hsp->query->end, shift @$d);
1109
ok($hsp->query->strand, shift @$d);
1110
ok($hsp->hit->start, shift @$d);
1111
ok($hsp->hit->end, shift @$d);
1112
ok($hsp->hit->strand, shift @$d);
1116
# parse the another megablast format
1118
$infile = Bio::Root::IO->catfile(qw(t data 503384.MEGABLAST.0));
1120
# this is megablast output type 0
1121
$in = new Bio::SearchIO(-file => $infile,
1122
-report_format => 0,
1123
-format => 'megablast');
1124
$r = $in->next_result;
1126
['Contig634', 7620, 7941, 1, 1, 321, -1],
1127
['Contig1853', 6406, 6620, 1, 1691, 1905, 1],
1128
['Contig3700', 8723,9434, 1, 4083, 4794, -1],
1129
['Contig3997', 1282, 1704, 1, 1546, 1968,-1 ],
1132
ok($r->query_name, '503384');
1134
while( my $hit = $r->next_hit ) {
1135
my $d = shift @dcompare;
1136
ok($hit->name, shift @$d);
1137
my $hsp = $hit->next_hsp;
1138
ok($hsp->query->start, shift @$d);
1139
ok($hsp->query->end, shift @$d);
1140
ok($hsp->query->strand, shift @$d);
1141
ok($hsp->hit->start, shift @$d);
1142
ok($hsp->hit->end, shift @$d);
1143
ok($hsp->hit->strand, shift @$d);
1148
# Let's test RPS-BLAST
1150
my $parser = new Bio::SearchIO(-format => 'blast',
1151
-file => Bio::Root::IO->catfile(qw(t data ecoli_domains.rpsblast)));
1153
$r = $parser->next_result;
1154
ok($r->query_name, 'gi|1786183|gb|AAC73113.1|');
1155
ok($r->num_hits, 7);
1156
$hit = $r->next_hit;
1157
ok($hit->name, 'gnl|CDD|3919');
1158
ok($hit->significance, 0.064);
1159
ok($hit->raw_score, 28);
1160
$hsp = $hit->next_hsp;
1161
ok($hsp->query->start, 599);
1162
ok($hsp->query->end,655);
1163
ok($hsp->hit->start,23);
1164
ok($hsp->hit->end,76);
1167
# Test PSI-BLAST parsing
1169
$searchio = new Bio::SearchIO ('-format' => 'blast',
1170
'-file' => Bio::Root::IO->catfile('t','data','psiblastreport.out'));
1172
$result = $searchio->next_result;
1174
ok($result->database_name, '/home/peter/blast/data/swissprot.pr');
1175
ok($result->database_entries, 88780);
1176
ok($result->database_letters, 31984247);
1178
ok($result->algorithm, 'BLASTP');
1179
ok($result->algorithm_version, qr/^2\.0\.14/);
1180
ok($result->query_name, 'CYS1_DICDI');
1181
ok($result->query_length, 343);
1182
ok($result->get_statistic('kappa') == 0.0491);
1183
ok($result->get_statistic('lambda') == 0.270);
1184
ok($result->get_statistic('entropy') == 0.230);
1185
ok($result->get_statistic('dbletters'), 31984247);
1186
ok($result->get_statistic('dbentries'), 88780);
1187
ok($result->get_statistic('effective_hsplength'), 49);
1188
ok($result->get_statistic('effectivespace'), 8124403938);
1189
ok($result->get_parameter('matrix'), 'BLOSUM62');
1190
ok($result->get_parameter('gapopen'), 11);
1191
ok($result->get_parameter('gapext'), 1);
1193
my @valid_hit_data = ( [ 'sp|P04988|CYS1_DICDI', 343, 'P04988', '0', 721],
1194
[ 'sp|P43295|A494_ARATH', 313, 'P43295', '1e-75', 281],
1195
[ 'sp|P25804|CYSP_PEA', 363, 'P25804', '1e-74', 278]);
1196
my @valid_iter_data = ( [ 127, 127, 0, 109, 18, 0, 0, 0, 0],
1197
[ 157, 40, 117, 2, 38, 0, 109, 3, 5]);
1199
while( $iter = $result->next_iteration ) {
1201
my $di = shift @valid_iter_data;
1202
ok($iter->number, $iter_count);
1204
ok($iter->num_hits, shift @$di);
1205
ok($iter->num_hits_new, shift @$di);
1206
ok($iter->num_hits_old, shift @$di);
1207
ok(scalar($iter->newhits_below_threshold), shift @$di);
1208
ok(scalar($iter->newhits_not_below_threshold), shift @$di);
1209
ok(scalar($iter->newhits_unclassified), shift @$di);
1210
ok(scalar($iter->oldhits_below_threshold), shift @$di);
1211
ok(scalar($iter->oldhits_newly_below_threshold), shift @$di);
1212
ok(scalar($iter->oldhits_not_below_threshold), shift @$di);
1215
if ($iter_count == 1) {
1216
while( $hit = $result->next_hit ) {
1217
my $d = shift @valid_hit_data;
1219
ok($hit->name, shift @$d);
1220
ok($hit->length, shift @$d);
1221
ok($hit->accession, shift @$d);
1222
ok(sprintf("%g",$hit->significance), sprintf("%g",shift @$d) );
1223
ok($hit->bits, shift @$d );
1225
if( $hit_count == 1 ) {
1226
while( my $hsp = $hit->next_hsp ){
1227
ok($hsp->query->start, 32);
1228
ok($hsp->query->end, 340);
1229
ok($hsp->hit->start, 3);
1230
ok($hsp->hit->end, 307);
1231
ok($hsp->length('hsp'), 316);
1232
ok($hsp->start('hit'), $hsp->hit->start);
1233
ok($hsp->end('query'), $hsp->query->end);
1234
ok($hsp->strand('sbjct'), $hsp->subject->strand);# alias for hit
1235
ok($hsp->evalue == 1e-75);
1236
ok($hsp->score, 712);
1237
ok($hsp->bits, 281);
1238
ok(sprintf("%.1f",$hsp->percent_identity), 46.5);
1239
ok(sprintf("%.4f",$hsp->frac_identical('query')), 0.4757);
1240
ok(sprintf("%.3f",$hsp->frac_identical('hit')), 0.482);
1244
last if( $hit_count++ > @valid_hit_data );
1251
$searchio = new Bio::SearchIO ( '-format' => 'blast',
1252
'-file' => Bio::Root::IO->catfile('t','data','ecolitst.bls'),
1253
'-signif' => 1e-100);
1255
@valid = qw(gb|AAC73113.1|);
1256
$r = $searchio->next_result;
1258
while( my $hit = $r->next_hit ) {
1259
ok($hit->name, shift @valid);
1262
$searchio = new Bio::SearchIO ( '-format' => 'blast',
1263
'-file' => Bio::Root::IO->catfile('t','data','ecolitst.bls'),
1266
@valid = qw(gb|AAC73113.1| gb|AAC76922.1| gb|AAC76994.1|);
1267
$r = $searchio->next_result;
1269
while( my $hit = $r->next_hit ) {
1270
ok($hit->name, shift @valid);
1273
$searchio = new Bio::SearchIO ( '-format' => 'blast',
1274
'-file' => Bio::Root::IO->catfile('t','data','ecolitst.bls'),
1277
@valid = qw(gb|AAC73113.1| gb|AAC76922.1|);
1278
$r = $searchio->next_result;
1280
while( my $hit = $r->next_hit ) {
1281
ok($hit->name, shift @valid);
1285
my $filt_func = sub{ my $hit=shift;
1286
$hit->frac_identical('query') >= 0.31
1289
$searchio = new Bio::SearchIO ( '-format' => 'blast',
1290
'-file' => Bio::Root::IO->catfile('t','data','ecolitst.bls'),
1291
'-hit_filter' => $filt_func);
1293
@valid = qw(gb|AAC73113.1| gb|AAC76994.1|);
1294
$r = $searchio->next_result;
1296
while( my $hit = $r->next_hit ) {
1297
ok($hit->name, shift @valid);
1303
# bl2seq parsing testing
1305
# this is blastp bl2seq
1306
$searchio = new Bio::SearchIO(-format => 'blast',
1307
-file => Bio::Root::IO->catfile(qw(t data
1309
$result = $searchio->next_result;
1311
ok($result->query_name, '');
1312
ok($result->algorithm, 'BLASTP');
1313
$hit = $result->next_hit;
1314
ok($hit->name, 'ALEU_HORVU');
1315
ok($hit->length, 362);
1316
$hsp = $hit->next_hsp;
1317
ok($hsp->score, 481);
1318
ok($hsp->bits, 191);
1319
ok(int $hsp->percent_identity, 34);
1320
ok($hsp->evalue, '2e-53');
1321
ok(int($hsp->frac_conserved*$hsp->length), 167);
1322
ok($hsp->query->start, 28);
1323
ok($hsp->query->end, 343);
1324
ok($hsp->hit->start, 60);
1325
ok($hsp->hit->end,360);
1328
# this is blastn bl2seq
1329
$searchio = new Bio::SearchIO(-format => 'blast',
1330
-file => Bio::Root::IO->catfile
1331
(qw(t data bl2seq.blastn.rev)));
1332
$result = $searchio->next_result;
1334
ok($result->query_name, '');
1335
ok($result->algorithm, 'BLASTN');
1336
ok($result->query_length, 180);
1337
$hit = $result->next_hit;
1338
ok($hit->length, 179);
1339
ok($hit->name, 'human');
1340
$hsp = $hit->next_hsp;
1341
ok($hsp->score, 27);
1342
ok($hsp->bits, '54.0');
1343
ok(int $hsp->percent_identity, 88);
1344
ok($hsp->evalue, '2e-12');
1345
ok(int($hsp->frac_conserved*$hsp->length), 83);
1346
ok($hsp->query->start, 94);
1347
ok($hsp->query->end, 180);
1348
ok($hsp->query->strand, 1);
1349
ok($hsp->hit->strand, -1);
1350
ok($hsp->hit->start, 1);
1351
ok($hsp->hit->end,94);
1354
# this is blastn bl2seq
1355
$searchio = new Bio::SearchIO(-format => 'blast',
1356
-file => Bio::Root::IO->catfile
1357
(qw(t data bl2seq.blastn)));
1358
$result = $searchio->next_result;
1360
ok($result->query_name, '');
1361
ok($result->query_length, 180);
1362
ok($result->algorithm, 'BLASTN');
1363
$hit = $result->next_hit;
1364
ok($hit->name, 'human');
1365
ok($hit->length, 179);
1366
$hsp = $hit->next_hsp;
1367
ok($hsp->score, 27);
1368
ok($hsp->bits, '54.0');
1369
ok(int $hsp->percent_identity, 88);
1370
ok($hsp->evalue, '2e-12');
1371
ok(int($hsp->frac_conserved*$hsp->length), 83);
1372
ok($hsp->query->start, 94);
1373
ok($hsp->query->end, 180);
1374
ok($hsp->query->strand, 1);
1375
ok($hsp->hit->strand, 1);
1376
ok($hsp->hit->start, 86);
1377
ok($hsp->hit->end,179);
1380
# this is blastp bl2seq
1381
$searchio = new Bio::SearchIO(-format => 'blast',
1382
-file => Bio::Root::IO->catfile
1383
(qw(t data bl2seq.bug940.out)));
1384
$result = $searchio->next_result;
1386
ok($result->query_name, 'zinc');
1387
ok($result->algorithm, 'BLASTP');
1388
ok($result->query_description, 'finger protein 135 (clone pHZ-17) [Homo sapiens]. neo_id RS.ctg14243-000000.6.0');
1389
ok($result->query_length, 469);
1390
$hit = $result->next_hit;
1391
ok($hit->name, 'gi|4507985|');
1392
ok($hit->description,'zinc finger protein 135 (clone pHZ-17) [Homo sapiens]. neo_id RS.ctg14243-000000.6.0');
1393
ok($hit->length, 469);
1394
$hsp = $hit->next_hsp;
1395
ok($hsp->score, 1626);
1396
ok($hsp->bits, 637);
1397
ok(int $hsp->percent_identity, 66);
1398
ok($hsp->evalue, '0.0');
1399
ok(int($hsp->frac_conserved*$hsp->length), 330);
1400
ok($hsp->query->start, 121);
1401
ok($hsp->query->end, 469);
1402
ok($hsp->hit->start, 1);
1403
ok($hsp->hit->end,469);
1404
ok($hsp->gaps, 120);
1405
ok($hit->next_hsp); # there is more than one HSP here,
1406
# make sure it is parsed at least
1408
# cannot distinguish between blastx and tblastn reports
1409
# so we're only testing a blastx report for now
1411
# this is blastx bl2seq
1412
$searchio = new Bio::SearchIO(-format => 'blast',
1413
-file => Bio::Root::IO->catfile
1414
(qw(t data bl2seq.blastx.out)));
1415
$result = $searchio->next_result;
1417
ok($result->query_name, 'AE000111.1');
1418
ok($result->query_description, 'Escherichia coli K-12 MG1655 section 1 of 400 of the complete genome');
1419
ok($result->algorithm, 'BLASTX');
1420
ok($result->query_length, 720);
1421
$hit = $result->next_hit;
1422
ok($hit->name, 'AK1H_ECOLI');
1423
ok($hit->description,'P00561 Bifunctional aspartokinase/homoserine dehydrogenase I (AKI-HDI) [Includes: Aspartokinase I ; Homoserine dehydrogenase I ]');
1424
ok($hit->length, 820);
1425
$hsp = $hit->next_hsp;
1426
ok($hsp->score, 634);
1427
ok($hsp->bits, 248);
1428
ok(int $hsp->percent_identity, 100);
1429
ok($hsp->evalue, '2e-70');
1430
ok(int($hsp->frac_conserved*$hsp->length), 128);
1431
ok($hsp->query->start, 1);
1432
ok($hsp->query->end, 384);
1433
ok($hsp->hit->start, 1);
1434
ok($hsp->hit->end,128);
1436
ok($hsp->query->frame,0);
1437
ok($hsp->hit->frame,0);
1438
ok($hsp->query->strand,-1);
1439
ok($hsp->hit->strand,0);
1441
# this is tblastx bl2seq (self against self)
1442
$searchio = new Bio::SearchIO(-format => 'blast',
1443
-file => Bio::Root::IO->catfile
1444
(qw(t data bl2seq.tblastx.out)));
1445
$result = $searchio->next_result;
1447
ok($result->query_name, 'Escherichia');
1448
ok($result->algorithm, 'TBLASTX');
1449
ok($result->query_description, 'coli K-12 MG1655 section 1 of 400 of the complete genome');
1450
ok($result->query_length, 720);
1451
$hit = $result->next_hit;
1452
ok($hit->name, 'gi|1786181|gb|AE000111.1|AE000111');
1454
ok($hit->description,'Escherichia coli K-12 MG1655 section 1 of 400 of the complete genome');
1455
ok($hit->length, 720);
1456
$hsp = $hit->next_hsp;
1457
ok($hsp->score, 1118);
1458
ok($hsp->bits, 515);
1459
ok(int $hsp->percent_identity, 95);
1460
ok($hsp->evalue, '1e-151');
1461
ok(int($hsp->frac_conserved*$hsp->length), 229);
1462
ok($hsp->query->start, 1);
1463
ok($hsp->query->end, 720);
1464
ok($hsp->hit->start, 1);
1465
ok($hsp->hit->end,720);
1467
ok($hsp->query->frame,0);
1468
ok($hsp->hit->frame,0);
1469
ok($hsp->query->strand,1);
1470
ok($hsp->hit->strand,1);
1472
# test blasttable output
1475
c200-vs-yeast.BLASTN.m9);
1476
$searchio = new Bio::SearchIO(-file => Bio::Root::IO->catfile
1477
(qw(t data c200-vs-yeast.BLASTN)),
1478
-format => 'blast');
1479
$result = $searchio->next_result;
1481
my %ref = &result2hash($result);
1482
ok( scalar keys %ref, 67);
1483
$searchio = new Bio::SearchIO(-file => Bio::Root::IO->catfile
1484
(qw(t data c200-vs-yeast.BLASTN.m8)),
1485
-program_name => 'BLASTN',
1486
-format => 'blasttable');
1487
$result = $searchio->next_result;
1488
my %tester = &result2hash($result);
1489
foreach my $key ( sort keys %ref ) {
1490
ok($tester{$key}, $ref{$key},$key);
1496
# Test Blast parsing with B=0 (WU-BLAST)
1497
$searchio = new Bio::SearchIO(-file => Bio::Root::IO->catfile
1498
(qw(t data no_hsps.blastp)),
1499
-format => 'blast');
1500
$result = $searchio->next_result;
1501
ok($result->query_name, 'mgri:MG00189.3');
1502
$hit = $result->next_hit;
1503
ok($hit->name, 'mgri:MG00189.3');
1504
ok($hit->description, 'hypothetical protein 6892 8867 +');
1505
ok($hit->score, 3098);
1506
ok($hit->significance, '0.');
1508
$hit = $result->next_hit;
1509
ok($hit->name, 'fgram:FG01141.1');
1510
ok($hit->description, 'hypothetical protein 47007 48803 -');
1511
ok($hit->score, 2182);
1512
ok($hit->significance, '4.2e-226');
1513
ok($result->num_hits, 415);
1514
# Let's now test if _guess_format is doing its job correctly
1515
my %pair = ( 'filename.blast' => 'blast',
1516
'filename.bls' => 'blast',
1518
'f.tblx' => 'blast',
1519
'fast.bls' => 'blast',
1520
'f.fasta' => 'fasta',
1524
'f.ssearch' => 'fasta',
1525
'f.SSEARCH.m9' => 'fasta',
1527
'f.psearch' => 'fasta',
1528
'f.osearch' => 'fasta',
1529
'f.exon' => 'exonerate',
1530
'f.exonerate' => 'exonerate',
1531
'f.blastxml' => 'blastxml',
1532
'f.xml' => 'blastxml');
1533
while( my ($file,$expformat) = each %pair ) {
1534
ok(Bio::SearchIO->_guess_format($file),$expformat, "$expformat for $file");
1538
# Test Wes Barris's reported bug when parsing blastcl3 output which
1539
# has integer overflow
1541
$searchio = new Bio::SearchIO(-file => Bio::Root::IO->catfile
1542
(qw(t data hsinsulin.blastcl3.blastn)),
1543
-format => 'blast');
1544
$result = $searchio->next_result;
1545
ok($result->query_name, 'human');
1546
ok($result->database_letters(), '-24016349');
1547
# this is of course not the right length, but is the what blastcl3
1548
# reports, the correct value is
1549
ok($result->get_statistic('dbletters'),'192913178');
1550
ok($result->get_statistic('dbentries'),'1867771');
1554
# a utility function for comparing result objects
1558
$hash{'query_name'} = $result->query_name;
1561
foreach my $hit ( $result->hits ) {
1562
$hash{"hit$hitcount\_name"} = $hit->name;
1563
# only going to test order of magnitude
1564
# too hard as these don't always match
1565
# $hash{"hit$hitcount\_signif"} =
1566
# ( sprintf("%.0e",$hit->significance) =~ /e\-?(\d+)/ );
1567
$hash{"hit$hitcount\_bits"} = sprintf("%d",$hit->bits);
1569
foreach my $hsp ( $hit->hsps ) {
1570
$hash{"hsp$hspcount\_bits"} = sprintf("%d",$hsp->bits);
1571
# only going to test order of magnitude
1572
# too hard as these don't always match
1573
# $hash{"hsp$hspcount\_evalue"} =
1574
# ( sprintf("%.0e",$hsp->evalue) =~ /e\-?(\d+)/ );
1575
$hash{"hsp$hspcount\_qs"} = $hsp->query->start;
1576
$hash{"hsp$hspcount\_qe"} = $hsp->query->end;
1577
$hash{"hsp$hspcount\_qstr"} = $hsp->query->strand;
1578
$hash{"hsp$hspcount\_hs"} = $hsp->hit->start;
1579
$hash{"hsp$hspcount\_he"} = $hsp->hit->end;
1580
$hash{"hsp$hspcount\_hstr"} = $hsp->hit->strand;
1582
#$hash{"hsp$hspcount\_pid"} = sprintf("%d",$hsp->percent_identity);
1583
#$hash{"hsp$hspcount\_fid"} = sprintf("%.2f",$hsp->frac_identical);
1584
$hash{"hsp$hspcount\_gaps"} = $hsp->gaps('total');
1595
Useful for debugging:
1597
if ($iter_count == 3) {
1599
foreach ($iter->newhits) {
1600
print " " . $_->name . "\n";
1602
print "\nOLDHITS:\n";
1603
foreach ($iter->oldhits) {
1604
print " " . $_->name . "\n";
1606
print "\nNEWHITS BELOW:\n";
1607
foreach ($iter->newhits_below_threshold) {
1608
print " " . $_->name . "\n";
1610
print "\nNEWHITS NOT BELOW:\n";
1611
foreach ($iter->newhits_not_below_threshold) {
1612
print " " . $_->name . "\n";
1614
print "\nNEWHITS UNCLASSIFIED:\n";
1615
foreach ($iter->newhits_unclassified) {
1616
print " " . $_->name . "\n";
1618
print "\nOLDHITS BELOW:\n";
1619
foreach ($iter->oldhits_below_threshold) {
1620
print " " . $_->name . "\n";
1622
print "\nOLDHITS NEWLY BELOW:\n";
1623
foreach ($iter->oldhits_newly_below_threshold) {
1624
print " " . $_->name . "\n";
1626
print "\nOLDHITS NOT BELOW:\n";
1627
foreach ($iter->oldhits_not_below_threshold) {
1628
print " " . $_->name . "\n";