1
# $Id: BPlite.pm,v 1.42.4.1 2006/10/02 23:10:31 sendu Exp $
2
##############################################################################
3
# Bioperl module Bio::Tools::BPlite
4
##############################################################################
6
# The original BPlite.pm module has been written by Ian Korf !
7
# see http://sapiens.wustl.edu/~ikorf
9
# You may distribute this module under the same terms as perl itself
13
Bio::Tools::BPlite - Lightweight BLAST parser
17
use Bio::Tools::BPlite;
18
my $report = new Bio::Tools::BPlite(-fh=>\*STDIN);
23
while(my $sbjct = $report->nextSbjct) {
25
while (my $hsp = $sbjct->nextHSP) {
42
$hsp->hit->overlaps($exon);
46
# the following line takes you to the next report in the stream/file
47
# it will return 0 if that report is empty,
48
# but that is valid for an empty blast report.
51
last if ($report->_parseHeader == -1);
58
BPlite is a package for parsing BLAST reports. The BLAST programs are a family
59
of widely used algorithms for sequence database searches. The reports are
60
non-trivial to parse, and there are differences in the formats of the various
61
flavors of BLAST. BPlite parses BLASTN, BLASTP, BLASTX, TBLASTN, and TBLASTX
62
reports from both the high performance WU-BLAST, and the more generic
65
Many people have developed BLAST parsers (I myself have made at least three).
66
BPlite is for those people who would rather not have a giant object
67
specification, but rather a simple handle to a BLAST report that works well
72
BPlite has three kinds of objects, the report, the subject, and the HSP. To
73
create a new report, you pass a filehandle reference to the BPlite constructor.
75
my $report = new Bio::Tools::BPlite(-fh=>\*STDIN); # or any other filehandle
77
The report has two attributes (query and database), and one method (nextSbjct).
79
$report->query; # access to the query name
80
$report->database; # access to the database name
81
$report->nextSbjct; # gets the next subject
82
while(my $sbjct = $report->nextSbjct) {
83
# canonical form of use is in a while loop
86
A subject is a BLAST hit, which should not be confused with an HSP (below). A
87
BLAST hit may have several alignments associated with it. A useful way of
88
thinking about it is that a subject is a gene and HSPs are the exons. Subjects
89
have one attribute (name) and one method (nextHSP).
91
$sbjct->name; # access to the subject name
92
$sbjct->nextHSP; # gets the next HSP from the sbjct
93
while(my $hsp = $sbjct->nextHSP) {
94
# canonical form is again a while loop
97
An HSP is a high scoring pair, or simply an alignment. HSP objects
98
inherit all the useful methods from RangeI/SeqFeatureI/FeaturePair,
99
but provide an additional set of attributes (score, bits, percent, P,
100
match, EXP, positive, length, querySeq, sbjctSeq, homologySeq) that
101
should be familiar to anyone who has seen a blast report.
103
For lazy/efficient coders, two-letter abbreviations are available for the
104
attributes with long names (qs, ss, hs). Ranges of the aligned sequences in
105
query/subject and other information (like seqname) are stored
106
in SeqFeature objects (i.e.: $hsp-E<gt>query, $hsp-E<gt>subject which is equal to
107
$hsp-E<gt>feature1, $hsp-E<gt>feature2). querySeq, sbjctSeq and homologySeq do only
108
contain the alignment sequences from the blast report.
117
$hsp->querySeq; $hsp->qs;
118
$hsp->sbjctSeq; $hsp->ss;
119
$hsp->homologySeq; $hsp->hs;
123
$hsp->hit->primary_tag; # "similarity"
124
$hsp->hit->source_tag; # "BLAST"
129
So a very simple look into a BLAST report might look like this.
131
my $report = new Bio::Tools::BPlite(-fh=>\*STDIN);
132
while(my $sbjct = $report->nextSbjct) {
133
print ">",$sbjct->name,"\n";
134
while(my $hsp = $sbjct->nextHSP) {
135
print "\t",$hsp->start,"..",$hsp->end," ",$hsp->bits,"\n";
139
The output of such code might look like this:
151
Ian Korf (ikorf@sapiens.wustl.edu, http://sapiens.wustl.edu/~ikorf),
152
Lorenz Pollak (lorenz@ist.org, bioperl port)
154
=head1 ACKNOWLEDGEMENTS
156
This software was developed at the Genome Sequencing Center at Washington
157
Univeristy, St. Louis, MO.
161
Jason Stajich, jason@cgt.mc.duke.edu
165
Copyright (C) 1999 Ian Korf. All Rights Reserved.
169
This software is provided "as is" without warranty of any kind.
173
package Bio::Tools::BPlite;
177
use Bio::Tools::BPlite::Sbjct; # we want to use Sbjct
180
use base qw(Bio::Root::Root Bio::SeqAnalysisParserI Bio::Root::IO);
182
# new comes from a RootI now
187
Function: Create a new Bio::Tools::BPlite object
188
Returns : Bio::Tools::BPlite
189
Args : -file input file (alternative to -fh)
190
-fh input stream (alternative to -file)
195
my ($class, @args) = @_;
196
my $self = $class->SUPER::new(@args);
197
$self->warn("Use of Bio::Tools::BPlite is deprecated".
198
"Use Bio::SearchIO classes instead");
200
$self->_initialize_io(@args);
202
$self->{'QPATLOCATION'} = []; # Anonymous array of query pattern locations for PHIBLAST
204
if ($self->_parseHeader) {$self->{'REPORT_DONE'} = 0} # there are alignments
205
else {$self->{'REPORT_DONE'} = 1} # empty report
207
return $self; # success - we hope!
210
# for SeqAnalysisParserI compliance
215
Usage : while( my $feat = $res->next_feature ) { # do something }
216
Function: SeqAnalysisParserI implementing function. This implementation
217
iterates over all HSPs. If the HSPs of the current subject match
218
are exhausted, it will automatically call nextSbjct().
220
Returns : A Bio::SeqFeatureI compliant object, in this case a
221
Bio::Tools::BPlite::HSP object, and FALSE if there are no more
230
$sbjct = $self->{'_current_sbjct'};
231
unless( defined $sbjct ) {
232
$sbjct = $self->{'_current_sbjct'} = $self->nextSbjct;
233
return unless defined $sbjct;
235
$hsp = $sbjct->nextHSP;
236
unless( defined $hsp ) {
237
$self->{'_current_sbjct'} = undef;
238
return $self->next_feature;
240
return $hsp || undef;
246
Usage : $query = $obj->query();
247
Function : returns the query object
249
Returns : query object
254
sub query {shift->{'QUERY'}}
259
Usage : $len = $obj->qlength();
260
Function : returns the length of the query
262
Returns : length of query
267
sub qlength {shift->{'LENGTH'}}
272
Usage : $pattern = $obj->pattern();
273
Function : returns the pattern used in a PHIBLAST search
277
sub pattern {shift->{'PATTERN'}}
279
=head2 query_pattern_location
281
Title : query_pattern_location
282
Usage : $qpl = $obj->query_pattern_location();
283
Function : returns reference to array of locations in the query sequence
284
of pattern used in a PHIBLAST search
288
sub query_pattern_location {shift->{'QPATLOCATION'}}
293
Usage : $db = $obj->database();
294
Function : returns the database used in this search
296
Returns : database used for search
301
sub database {shift->{'DATABASE'}}
306
Usage : $sbjct = $obj->nextSbjct();
307
Function : Method of iterating through all the Sbjct retrieved
308
from parsing the report
309
Example : while ( my $sbjct = $obj->nextSbjct ) {}
310
Returns : next Sbjct object or null if finished
318
$self->_fastForward or return;
320
#######################
321
# get all sbjct lines #
322
#######################
323
my $def = $self->_readline();
324
while(defined ($_ = $self->_readline() ) ) {
326
elsif (/Strand HSP/o) {next} # WU-BLAST non-data
327
elsif (/^\s{0,2}Score/o) {$self->_pushback($_); last}
328
elsif (/^Histogram|^Searching|^Parameters|
330
^\s+Posted date:/ox) {
331
$self->_pushback($_);
344
if( $def =~ s/Length = ([\d,]+)$//g ) {
347
return unless $def =~ /^>/;
353
my $sbjct = new Bio::Tools::BPlite::Sbjct('-name'=>$def,
359
# begin private routines
364
# normally, _parseHeader will break out of the parse as soon as it
365
# reaches a new Subject (i.e. the first one after the header) if you
366
# call _parseHeader twice in a row, with nothing in between, all you
367
# accomplish is a ->nextSubject call.. so we need a flag to
368
# indicate that we have *entered* a header, before we are allowed to
371
my $header_flag = 0; # here is the flag/ It is "false" at first, and
372
# is set to "true" when any valid header element
375
$self->{'REPORT_DONE'} = 0; # reset this bit for a new report
376
while(defined($_ = $self->_readline() ) ) {
378
if (/^Query=(?:\s+(.+))?/s) {
379
$header_flag = 1; # valid header element found
381
while( defined($_ = $self->_readline() ) ) {
382
# Continue reading query name until encountering either
383
# a line that starts with "Database" or a blank line.
384
# The latter condition is needed in order to be able to
385
# parse megablast output correctly, since Database comes
386
# before (not after) the query.
387
if( ($_ =~ /^Database/) || ($_ =~ /^$/) ) {
388
$self->_pushback($_); last;
397
if( $query =~ /\(([\d,]+)\s+\S+\)\s*$/ ) {
401
$self->debug("length is 0 for '$query'\n");
403
$self->{'QUERY'} = $query;
404
$self->{'LENGTH'} = $length;
406
elsif (/^(<b>)?(T?BLAST[NPX])\s+([\w\.-]+)\s+(\[[\w-]*\])/o) {
407
$self->{'BLAST_TYPE'} = $2;
408
$self->{'BLAST_VERSION'} = $3;
409
} # BLAST report type - not a valid header element # JB949
411
# Support Paracel BTK output
412
elsif ( $_ =~ /(^[A-Z0-9_]+)\s+BTK\s+/ ) {
413
$self->{'BLAST_TYPE'} = $1;
416
elsif ($_ =~ /^Database:\s+(.+)/) {$header_flag = 1;$self->{'DATABASE'} = $1} # valid header element found
417
elsif ($_ =~ /^\s*pattern\s+(\S+).*position\s+(\d+)\D/) {
418
# For PHIBLAST reports
419
$header_flag = 1; # valid header element found
420
$self->{'PATTERN'} = $1;
421
push (@{$self->{'QPATLOCATION'}}, $2);
423
elsif (($_ =~ /^>/) && ($header_flag==1)) {$self->_pushback($_); return 1} # only leave if we have actually parsed a valid header!
424
elsif (($_ =~ /^Parameters|^\s+Database:/) && ($header_flag==1)) {
425
# if we entered a header, and saw nothing before the stats at the end,
427
$self->_pushback($_);
428
return 0; # there's nothing in the report
429
} elsif( /Reference:\s+Aaron E\. Darling/ ) {
432
# bug fix suggested by MI Sadowski via Martin Lomas
433
# see bug report #1118
434
if( ref($self->_fh()) !~ /GLOB/ &&
435
$self->_fh()->can('EOF') && eof($self->_fh()) ) {
436
$self->warn("unexpected EOF in file\n");
445
return 0 if $self->{'REPORT_DONE'}; # empty report
447
while(defined( $_ = $self->_readline() ) ) {
448
if (/^Histogram|^Searching|^Parameters|^\s+Database:|
449
^\s+Posted date:/xo) {
451
} elsif( $self->{'BTK'} && /^BLAST/o ) {
454
$self->_pushback($_);
458
unless( $self->{'BTK'} ) { # Paracel BTK reports have no footer
459
$self->warn("Possible error (1) while parsing BLAST report!");