175
179
'Hsp_hit-frame' => 'HSP-hit_frame',
176
180
'Hsp_links' => 'HSP-links',
177
181
'Hsp_group' => 'HSP-hsp_group',
182
'Hsp_features' => 'HSP-hit_features',
179
184
'Hit_id' => 'HIT-name',
180
185
'Hit_len' => 'HIT-length',
181
186
'Hit_accession' => 'HIT-accession',
182
187
'Hit_def' => 'HIT-description',
183
188
'Hit_signif' => 'HIT-significance',
185
189
# For NCBI blast, the description line contains bits.
186
190
# For WU-blast, the description line contains score.
187
191
'Hit_score' => 'HIT-score',
370
375
my ( $self, @args ) = @_;
371
376
$self->SUPER::_initialize(@args);
373
# Blast reports require a specialized version of the SREB due to the
374
# possibility of iterations (PSI-BLAST). Forwarding all arguments to it.
375
# An issue here is that we want to set new default object factories if none are
378
# Blast reports require a specialized version of the SREB due to the
379
# possibility of iterations (PSI-BLAST). Forwarding all arguments to it. An
380
# issue here is that we want to set new default object factories if none are
378
my $handler = new Bio::SearchIO::IteratedSearchResultEventBuilder(@args);
383
my $handler = Bio::SearchIO::IteratedSearchResultEventBuilder->new(@args);
379
384
$self->attach_EventHandler($handler);
381
# 2006-04-26 move this to the attach_handler function in this
382
# module so we can really reset the handler
386
# 2006-04-26 move this to the attach_handler function in this module so we
387
# can really reset the handler
383
388
# Optimization: caching
384
389
# the EventHandler since it is used a lot during the parse.
385
391
# $self->{'_handler_cache'} = $handler;
387
393
my ( $min_qlen, $check_all, $overlap, $best, $rpttype ) = $self->_rearrange(
436
443
my $gapped_stats = 0; # for switching between gapped/ungapped
438
445
local $_ = "\n"; #consistency
439
447
while ( defined( $_ = $self->_readline ) ) {
440
448
next if (/^\s+$/); # skip empty lines
441
449
next if (/CPU time:/);
442
450
next if (/^>\s*$/);
444
/^([T]?BLAST[NPX])\s*(.+)$/i # NCBI BLAST
445
|| /^(PSITBLASTN)\s+(.+)$/i # PSIBLAST
446
|| /^(RPS-BLAST)\s*(.+)$/i # RPSBLAST
447
|| /^(MEGABLAST)\s*(.+)$/i # MEGABLAST
452
/^((?:\S+?)?BLAST[NPX]?)\s+(.+)$/i # NCBI BLAST, PSIBLAST
453
# RPSBLAST, MEGABLAST
448
454
|| /^(P?GENEWISE|HFRAME|SWN|TSWN)\s+(.+)/i #Paracel BTK
451
$self->debug("blast.pm: Start of new report: $1 $2\n");
457
($reporttype, my $reportversion) = ($1, $2);
458
# need to keep track of whether this is WU-BLAST
459
if ($reportversion && $reportversion =~ m{WashU$}) {
460
$self->{'_wublast'}++;
462
$self->debug("blast.pm: Start of new report: $reporttype, $reportversion\n");
452
463
if ( $self->{'_seentop'} ) {
453
464
# This handles multi-result input streams
454
465
$self->_pushback($_);
455
$self->in_element('hsp')
456
&& $self->end_element( { 'Name' => 'Hsp' } );
457
$self->in_element('hit')
458
&& $self->end_element( { 'Name' => 'Hit' } );
459
$self->within_element('iteration')
460
&& $self->end_element( { 'Name' => 'Iteration' } );
461
$self->end_element( { 'Name' => 'BlastOutput' } );
462
return $self->end_document();
464
468
$self->_start_blastoutput;
466
469
if ($reporttype =~ /RPS-BLAST/) {
467
470
$reporttype .= '(BLASTP)'; # default RPS-BLAST type
499
502
if ( defined $seeniteration ) {
500
503
$self->within_element('iteration')
501
504
&& $self->end_element( { 'Name' => 'Iteration' } );
502
$self->_start_iteration;
505
$self->start_element( { 'Name' => 'Iteration' } );
505
$self->_start_iteration;
508
$self->start_element( { 'Name' => 'Iteration' } );
507
510
$seeniteration = 1;
509
512
elsif (/^Query=\s*(.*)$/) {
510
514
$self->debug("blast.pm: Query= found...$_\n");
513
516
if ( defined $seenquery ) {
514
517
$self->_pushback($reportline) if $reportline;
515
518
$self->_pushback($_);
516
$self->in_element('hsp')
517
&& $self->end_element( { 'Name' => 'Hsp' } );
518
$self->in_element('hit')
519
&& $self->end_element( { 'Name' => 'Hit' } );
520
$self->within_element('iteration')
521
&& $self->end_element( { 'Name' => 'Iteration' } );
525
'Name' => 'BlastOutput_program',
526
'Data' => $reporttype
530
$self->end_element( { 'Name' => 'BlastOutput' } );
531
return $self->end_document();
534
522
if ( !defined $reporttype ) {
591
my ( $acc, $version ) = &_get_accession_version($nm);
579
my ( $gi, $acc, $version ) = $self->_get_seq_identifiers($nm);
592
580
$version = defined($version) && length($version) ? ".$version" : "";
593
$acc = '' unless defined($acc);
596
583
'Name' => 'BlastOutput_query-acc',
597
584
'Data' => "$acc$version"
588
# added check for WU-BLAST -echofilter option (bug 2388)
589
elsif (/^>Unfiltered[+-]1$/) {
590
# skip all of the lines of unfiltered sequence
591
while($_ !~ /^Database:/) {
592
$self->debug("Bypassing features line: $_");
593
$_ = $self->_readline;
595
$self->_pushback($_);
601
597
elsif (/Sequences producing significant alignments:/) {
602
598
$self->debug("blast.pm: Processing NCBI-BLAST descriptions\n");
603
599
$flavor = 'ncbi';
611
607
# for a line with a leading >. Blank lines occur with this section
613
609
if ( !$self->in_element('iteration') ) {
614
$self->_start_iteration;
610
$self->start_element( { 'Name' => 'Iteration' } );
617
while ( defined( $_ = $self->_readline() ) ) {
619
|| /^\s+Database:\s+?/
625
$self->_pushback($_); # Catch leading > (end of section)
628
elsif (/([\d\.\+\-eE]+)\s+([\d\.\+\-eE]+)(\s+\d+)?\s*$/) {
630
# the last match is for gapped BLAST output
631
# which will report the number of HSPs for the Hit
632
my ( $score, $evalue ) = ( $1, $2 );
612
# these elements are dropped with some multiquery reports; add
616
'Name' => 'BlastOutput_db-len',
617
'Data' => $self->{'_blsdb_length'}
619
) if $self->{'_blsdb_length'};
622
'Name' => 'BlastOutput_db-let',
623
'Data' => $self->{'_blsdb_letters'}
625
) if $self->{'_blsdb_letters'};
628
'Name' => 'BlastOutput_db',
629
'Data' => $self->{'_blsdb'}
631
) if $self->{'_blsdb_letters'};
633
# changed 8/28/2008 to exit hit table if blank line is found after an
638
while ( defined( my $descline = $self->_readline() ) ) {
639
if ($descline =~ m{^\s*$}) {
640
last DESCLINE if $seen_block;
643
# any text match is part of block...
645
# GCG multiline oddness...
646
if ($descline =~ /^(\S+)\s+Begin:\s\d+\s+End:\s+\d+/xms) {
647
my ($id, $nextline) = ($1, $self->_readline);
648
$nextline =~ s{^!}{};
649
$descline = "$id $nextline";
651
# NCBI style hit table (no N)
652
if ($descline =~ /(?<!cor) # negative lookahead
653
(\d*\.?(?:[\+\-eE]+)?\d+) # number (float or scientific notation)
655
(\d*\.?(?:[\+\-eE]+)?\d+) # number (float or scientific notation)
658
my ( $score, $evalue ) = ($1, $2);
634
660
# Some data clean-up so e-value will appear numeric to perl
635
661
$evalue =~ s/^e/1e/i;
637
663
# This to handle no-HSP case
664
my @line = split ' ',$descline;
640
666
# we want to throw away the score, evalue
641
667
pop @line, pop @line;
647
673
# add the last 2 entries s.t. we can reconstruct
648
674
# a minimal Hit object at the end of the day
650
[ $evalue, $score, shift @line, join( ' ', @line ) ];
653
elsif (/^CONVERGED/i) {
675
push @hit_signifs, [ $evalue, $score, shift @line, join( ' ', @line ) ];
676
} elsif ($descline =~ /^CONVERGED/i) {
656
679
'Name' => 'Iteration_converged',
684
$self->_pushback($descline); # Catch leading > (end of section)
661
@hit_signifs = sort {$a->[0] <=> $b->[0]} @hit_signifs;
664
689
elsif (/Sequences producing High-scoring Segment Pairs:/) {
770
my ( $acc, $version ) = &_get_accession_version($id);
804
my ($gi, $acc, $version ) = $self->_get_seq_identifiers($id);
773
807
'Name' => 'Hit_accession',
778
811
# add hit significance (from the hit table)
779
# this is where Bug 1986 goes awry
781
my $v = shift @hit_signifs;
785
'Name' => 'Hit_signif',
791
'Name' => 'Hit_score',
812
# this is where Bug 1986 went awry
814
# Changed for Bug2409; hit->significance and hit->score/bits derived
815
# from HSPs, not hit table unless necessary
818
while (my $v = shift @hit_signifs) {
819
my $tableid = $v->[2];
820
if ($tableid !~ m{\Q$id\E}) {
821
$self->debug("Hit table ID $tableid doesn't match current hit id $id, checking next hit table entry...\n");
796
827
while ( defined( $_ = $self->_readline() ) ) {
797
828
next if (/^\s+$/);
2339
2312
$self->{'_check_all'};
2342
=head2 _get_accession_version
2344
Title : _get_accession_version
2345
Usage : my ($acc,$ver) = &_get_accession_version($id)
2346
Function:Private function to get an accession,version pair
2347
for an ID (if it is in NCBI format)
2348
Returns : 2-pule of accession, version
2349
Args : ID string to process
2354
sub _get_accession_version {
2357
# handle case when this is accidently called as a class method
2358
if ( ref($id) && $id->isa('Bio::SearchIO') ) {
2361
return unless defined $id;
2362
my ( $acc, $version );
2363
if ( $id =~ /(gb|emb|dbj|sp|pdb|bbs|ref|lcl)\|(.*)\|(.*)/ ) {
2364
( $acc, $version ) = split /\./, $2;
2366
elsif ( $id =~ /(pir|prf|pat|gnl)\|(.*)\|(.*)/ ) {
2367
( $acc, $version ) = split /\./, $3;
2371
#punt, not matching the db's at ftp://ftp.ncbi.nih.gov/blast/db/README
2372
#Database Name Identifier Syntax
2373
#============================ ========================
2374
#GenBank gb|accession|locus
2375
#EMBL Data Library emb|accession|locus
2376
#DDBJ, DNA Database of Japan dbj|accession|locus
2377
#NBRF PIR pir||entry
2378
#Protein Research Foundation prf||name
2379
#SWISS-PROT sp|accession|entry name
2380
#Brookhaven Protein Data Bank pdb|entry|chain
2381
#Patents pat|country|number
2382
#GenInfo Backbone Id bbs|number
2383
#General database identifier gnl|database|identifier
2384
#NCBI Reference Sequence ref|accession|locus
2385
#Local Sequence identifier lcl|identifier
2388
return ( $acc, $version );
2315
# commented out, using common base class util method
2316
#=head2 _get_accession_version
2318
# Title : _get_accession_version
2319
# Usage : my ($acc,$ver) = &_get_accession_version($id)
2320
# Function:Private function to get an accession,version pair
2321
# for an ID (if it is in NCBI format)
2322
# Returns : 2-pule of accession, version
2323
# Args : ID string to process
2328
#sub _get_accession_version {
2331
# # handle case when this is accidently called as a class method
2332
# if ( ref($id) && $id->isa('Bio::SearchIO') ) {
2335
# return unless defined $id;
2336
# my ( $acc, $version );
2337
# if ( $id =~ /(gb|emb|dbj|sp|pdb|bbs|ref|lcl)\|(.*)\|(.*)/ ) {
2338
# ( $acc, $version ) = split /\./, $2;
2340
# elsif ( $id =~ /(pir|prf|pat|gnl)\|(.*)\|(.*)/ ) {
2341
# ( $acc, $version ) = split /\./, $3;
2345
# #punt, not matching the db's at ftp://ftp.ncbi.nih.gov/blast/db/README
2346
# #Database Name Identifier Syntax
2347
# #============================ ========================
2348
# #GenBank gb|accession|locus
2349
# #EMBL Data Library emb|accession|locus
2350
# #DDBJ, DNA Database of Japan dbj|accession|locus
2351
# #NBRF PIR pir||entry
2352
# #Protein Research Foundation prf||name
2353
# #SWISS-PROT sp|accession|entry name
2354
# #Brookhaven Protein Data Bank pdb|entry|chain
2355
# #Patents pat|country|number
2356
# #GenInfo Backbone Id bbs|number
2357
# #General database identifier gnl|database|identifier
2358
# #NCBI Reference Sequence ref|accession|locus
2359
# #Local Sequence identifier lcl|identifier
2362
# return ( $acc, $version );
2365
# general private method used to make minimal hits from leftover
2366
# data in the hit table
2369
my ($self, $hits) = @_;
2370
while ( my $v = shift @{ $hits }) {
2371
next unless defined $v;
2372
$self->start_element( { 'Name' => 'Hit' } );
2381
my ($gi, $acc, $version ) = $self->_get_seq_identifiers($id);
2384
'Name' => 'Hit_accession',
2391
'Name' => 'Hit_signif',
2395
if (exists $self->{'_wublast'}) {
2398
'Name' => 'Hit_score',
2405
'Name' => 'Hit_bits',
2414
'Name' => 'Hit_def',
2418
$self->end_element( { 'Name' => 'Hit' } );