1
# $Id: bioperl.lisp 15559 2009-02-23 12:11:20Z maj $
3
# BioPerl module for Bio::SearchIO::hmmer3
5
# Please direct questions and support issues to <bioperl-l@bioperl.org>
7
# Cared for by Thomas Sharpton <thomas.sharpton@gmail.com>
9
# Copyright Thomas Sharpton
11
# You may distribute this module under the same terms as perl itself
13
# POD documentation - main docs before the code
17
Bio::SearchIO::hmmer3 - DESCRIPTION of Object
21
Give standard usage here
25
Describe the object here
31
User feedback is an integral part of the evolution of this and other
32
Bioperl modules. Send your comments and suggestions preferably to
33
the Bioperl mailing list. Your participation is much appreciated.
35
bioperl-l@bioperl.org - General discussion
36
http://bioperl.org/wiki/Mailing_lists - About the mailing lists
40
Please direct usage questions or support issues to the mailing list:
42
L<bioperl-l@bioperl.org>
44
rather than to the module maintainer directly. Many experienced and
45
reponsive experts will be able look at the problem and quickly
46
address it. Please include a thorough description of the problem
47
with code and data examples if at all possible.
51
Report bugs to the Bioperl bug tracking system to help us keep track
52
of the bugs and their resolution. Bug reports can be submitted via
55
https://redmine.open-bio.org/projects/bioperl/
57
=head1 AUTHOR - Thomas Sharpton
59
Email thomas.sharpton@gmail.com
61
Describe contact details here
65
Additional contributors names and emails here
69
The rest of the documentation details each of the object methods.
70
Internal methods are usually preceded with a _
75
# Let the code begin...
77
package Bio::SearchIO::hmmer3;
81
use Bio::Factory::ObjectFactory;
82
use vars qw(%MAPPING %MODEMAP);
83
use base qw(Bio::SearchIO::hmmer);
87
# mapping of HMMER items to Bioperl hash keys
89
'HMMER_Output' => 'result',
95
'Hsp_bit-score' => 'HSP-bits',
96
'Hsp_score' => 'HSP-score',
97
'Hsp_evalue' => 'HSP-evalue',
98
'Hsp_query-from' => 'HSP-query_start',
99
'Hsp_query-to' => 'HSP-query_end',
100
'Hsp_hit-from' => 'HSP-hit_start',
101
'Hsp_hit-to' => 'HSP-hit_end',
102
'Hsp_positive' => 'HSP-conserved',
103
'Hsp_identity' => 'HSP-identical',
104
'Hsp_gaps' => 'HSP-hsp_gaps',
105
'Hsp_hitgaps' => 'HSP-hit_gaps',
106
'Hsp_querygaps' => 'HSP-query_gaps',
107
'Hsp_qseq' => 'HSP-query_seq',
108
'Hsp_hseq' => 'HSP-hit_seq',
109
'Hsp_midline' => 'HSP-homology_seq',
110
'Hsp_align-len' => 'HSP-hsp_length',
111
'Hsp_query-frame' => 'HSP-query_frame',
112
'Hsp_hit-frame' => 'HSP-hit_frame',
114
'Hit_id' => 'HIT-name',
115
'Hit_len' => 'HIT-length',
116
'Hit_accession' => 'HIT-accession',
117
'Hit_desc' => 'HIT-description',
118
'Hit_signif' => 'HIT-significance',
119
'Hit_score' => 'HIT-score',
121
'HMMER_program' => 'RESULT-algorithm_name',
122
'HMMER_version' => 'RESULT-algorithm_version',
123
'HMMER_query-def' => 'RESULT-query_name',
124
'HMMER_query-len' => 'RESULT-query_length',
125
'HMMER_query-acc' => 'RESULT-query_accession',
126
'HMMER_querydesc' => 'RESULT-query_description',
127
'HMMER_hmm' => 'RESULT-hmm_name',
128
'HMMER_seqfile' => 'RESULT-sequence_file',
129
'HMMER_db' => 'RESULT-database_name',
136
Usage : my $obj = new Bio::SearchIO::Hmmer3->new();
137
Function: Builds a new Bio::SearchIO::Hmmer3 object
138
Returns : an instance of Bio::SearchIO::Hmmer3
139
Args : -fh/-file => HMMER filename
145
my( $self,@args ) = @_;
146
$self->SUPER::_initialize(@args);
147
$self->{'_hmmidline'} = 'HMMER 3.0b placeholder';
148
$self->{'_alnreport'} = 1; #does report include alignments
154
Usage : my $hit = $searchio->next_result;
155
Function: Returns the next Result from a search
156
Returns : Bio::Search::Result::ResultI object
163
my $seentop = 0; #Placeholder for when we deal with multi-query reports
165
my ( $last, @hit_list, @hsp_list, %hspinfo, %hitinfo, %domaincounter );
169
my $verbose = $self->verbose; # cache for speed? JES's idea in hmmer.pm
170
$self->start_document();
172
#This is here to ensure that next_result doesn't produce infinite loop
173
if(!defined( $_ = $self->_readline) ) {
177
$self->_pushback($_);
179
#Regex goes here for HMMER3
180
#Start with hmmsearch processing
181
while ( defined( $_ = $self->_readline ) ) {
184
#Grab the program name.
185
if ( $_ =~ m/^\#\s(\S+)\s\:\:\s/ ){
187
#TO DO LATER: customize the above regex to adapt to other
188
#program types!!! (hmmscan, etc)
189
$self->start_element( { 'Name' => 'HMMER_Output' } );
190
$self->{'_result_count'}++; #Might need to move to another block
193
'Name' => 'HMMER_program',
198
#Get the HMMER package version and release date
199
elsif ( $_ =~ m/^\#\sHMMER\s+(\S+)\s+\((.+)\)/ ) {
201
my $versiondate = $2;
202
$self->{'_hmmidline'} = $_;
205
'Name' => 'HMMER_version',
211
elsif( $_ =~ /^\#\squery \w+ file\:\s+(\S+)/ ){
212
if( $self->{'_reporttype'} eq 'HMMSEARCH') {
213
$self->{'_hmmfileline'} = $lineorig;
216
'Name' => 'HMMER_hmm',
221
elsif( $self->{'_reporttype'} eq 'HMMSCAN' ) {
222
$self->{'_hmmseqline'} = $lineorig;
225
'Name' => 'HMMER_seqfile',
231
#If this is a report without alignments
232
elsif( $_ =~ m/^\#\sshow\salignments\sin\soutput/ ){
233
$self->{'_alnreport'} = 0;
235
#Get the database info
236
elsif( $_ =~ m/^\#\starget\s\S+\sdatabase\:\s+(\S+)/ ){
237
# $self->{'_hmmseqline'} = $lineorig;
240
# 'Name' => 'HMMER_seqfile',
244
if( $self->{'_reporttype'} eq 'HMMSEARCH') {
245
$self->{'_hmmseqline'} = $lineorig;
248
'Name' => 'HMMER_seqfile',
253
elsif( $self->{'_reporttype'} eq 'HMMSCAN' ) {
254
$self->{'_hmmfileline'} = $lineorig;
257
'Name' => 'HMMER_hmm',
264
elsif( $_ =~ s/^Query:\s+// ) {
265
#TODO Code to deal with multi query report
266
unless( s/\s+\[[L|M]\=(\d+)\]$// ){
267
warn "Error parsing length for query, offending line $_\n";
273
'Name' => 'HMMER_query-len',
279
'Name' => 'HMMER_query-def',
285
elsif( $_ =~ s/^Accession:\s+// ){
289
'Name' => 'HMMER_query-acc',
294
#Get description data
295
elsif( $_ =~ s/^Description:\s+// ){
299
'Name' => 'HMMER_querydesc',
304
#PROCESS HMMSEARCH AND HMMSCAN RESULTS SPECIFIC FORMATTING HERE
305
elsif( defined $self->{'_reporttype'} &&
307
$self->{'_reporttype'} eq 'HMMSEARCH' ||
308
$self->{'_reporttype'} eq 'HMMSCAN'
311
#Complete sequence table data above inclusion threshold
312
if( $_ =~ m/Scores for complete sequence/){
313
while (defined( $_ = $self->_readline ) ) {
314
if ($_ =~ m/inclusion threshold/ || m/Domain( and alignment)? annotation for each/ ||
315
m/\[No hits detected/ || m!^//! ){
316
$self->_pushback($_);
320
next if ( m/\-\-\-/ || m/^\s+E\-value\s+score/ || m/^$/);
322
$eval_full, $score_full, $bias_full,
323
$eval_best, $score_best, $bias_best,
324
$exp, $n, $hitid, $desc, @hitline
326
@hitline = split(" ", $_);
327
$eval_full = shift @hitline;
328
$score_full = shift @hitline;
329
$bias_full = shift @hitline;
330
$eval_best = shift @hitline;
331
$score_best = shift @hitline;
332
$bias_best = shift @hitline;
333
$exp = shift @hitline;
335
$hitid = shift @hitline;
336
$desc = join " ", @hitline;
338
if( !defined( $desc ) ){
341
push @hit_list, [ $hitid, $desc, $eval_full, $score_full ];
342
$hitinfo{$hitid}= $#hit_list;
345
#Complete sequence table data below inclusion threshold
346
#not currently fully implemented
347
elsif( $_ =~ m/inclusion threshold/ ){
348
while( defined( $_ = $self->_readline ) ) {
349
if( $_ =~ m/Domain( and alignment)? annotation for each/ ||
350
m/Internal pipeline statistics summary/ ){
351
$self->_pushback($_);
354
next if( $_ =~ m/^$/ );
356
$eval_full, $score_full, $bias_full,
357
$eval_best, $score_best, $bias_best,
358
$exp, $n, $hitid, $desc, @hitline
360
@hitline = split(" ", $_);
361
$eval_full = shift @hitline;
362
$score_full = shift @hitline;
363
$bias_full = shift @hitline;
364
$eval_best = shift @hitline;
365
$score_best = shift @hitline;
366
$bias_best = shift @hitline;
367
$exp = shift @hitline;
369
$hitid = shift @hitline;
370
$desc = join " ", @hitline;
372
$hitinfo{$hitid} = "below_inclusion";
375
#Domain annotation for each sequence table data
376
elsif( $_ =~ m/Domain( and alignment)? annotation for each/){
377
@hsp_list = (); #here for multi-query reports
380
while( defined( $_ = $self->_readline ) ) {
381
if ($_ =~ m/Internal pipeline statistics/ || m/\[No targets detected/ ){
382
$self->_pushback($_);
385
if( $_ =~ m/^\>\>\s(.*?)\s+/ ) {
387
#skip hits below inclusion threshold
388
next if( $hitinfo{$name} eq "below_inclusion");
389
$domaincounter{$name} = 0;
391
while( defined( $_ = $self->_readline ) ) {
392
#grab table data for sequence
393
if ($_ =~ m/Internal pipeline statistics/ ||
395
$self->_pushback($_);
398
if ( $_ =~ m/Alignments for each domain/ ) {
399
$self->_pushback($_);
402
if ( $_ =~ m/^\s+\#\s+score/ ||
403
$_ =~ m/^\s\-\-\-\s+/ ||
409
# grab hsp data from table, push into @hsp;
411
my ($domain_num, $score, $bias, $ceval,
412
$ieval, $hmmstart, $hmmstop,
413
$qalistart, $qalistop, $envstart,
414
$envstop, $envbound, $acc) =
415
m!^\s+(\d+)\s\!*\?*\s+ #domain num
416
(\S+)\s+(\S+)\s+ #score, bias
417
(\S+)\s+(\S+)\s+ #c-eval, i-eval
418
(\d+)\s+(\d+).+? #hmm start, stop
419
(\d+)\s+(\d+).+? #query start, stop
420
(\d+)\s+(\d+).+? #env start, stop
424
#keeping simple for now. let's customize later
425
my @vals = ($hmmstart, $hmmstop, $qalistart, $qalistop, $score, $ceval, '', '', '');
426
my $info = $hit_list[ $hitinfo{$name} ];
427
if( !defined $info ){
429
"Incomplete sequence information; can't find $name, hitinfo says $hitinfo{$name}\n"
433
$domaincounter{$name}++;
434
my $hsp_key = $name . "_" . $domaincounter{$name};
435
push @hsp_list, [ $name, @vals ];
436
$hspinfo{$hsp_key} = $#hsp_list;
439
print "missed this line: $_\n";
443
elsif ($_ =~ m/Alignments for each domain/ ) {
444
my $domain_count = 0;
447
# There's an optional block, so we sometimes need to
448
# count to 3, and sometimes to 4.
452
my ($hline, $midline, $qline);
454
while( defined( $_ = $self->_readline ) ) {
455
if( $_ =~ m/^\>\>/ ||
456
$_ =~ m/Internal pipeline statistics/){
457
$self->_pushback($_);
460
elsif( $hitinfo{$name} eq "below_inclusion" ||
464
elsif( $_ =~ /\s\s\=\=\sdomain\s(\d+)\s+/){
467
my $key = $name . "_" . $domainnum;
468
$hsp = $hsp_list[ $hspinfo{$key} ];
470
$midline = $$hsp[-2];
474
# model data track, some reports don't have
475
elsif( $_ =~ m/\s+\S+\sCS$/ ){
481
elsif( $count == $max_count - 3 ){
483
my @data = split(" ", $_);
489
elsif( $count == $max_count - 2 ){
491
#storage isn't quite right - need to remove
492
#leading/lagging whitespace while preserving
493
#gap data (latter isn't done, former is)
500
elsif( $count == $max_count - 1 ){
502
my @data = split(" ", $_);
508
elsif( $count == $max_count ){
514
$$hsp[-2] = $midline;
525
elsif( m/Internal pipeline statistics/ || m!^//! ){
526
# if within hit, hsp close;
527
if ( $self->within_element('hit') ) {
528
if ( $self->within_element('hsp') ) {
529
$self->end_element( { 'Name' => 'Hsp' } );
531
$self->end_element( { 'Name' => 'Hit' } );
533
#grab summary statistics of run
534
while( defined( $_ = $self->_readline ) ) {
535
last if ( $_ =~ m/^\/\/$/ );
538
#Jason does a lot of processing of hits/hsps here;
539
while( my $hit = shift @hit_list ) {
540
my $hit_name = shift @$hit;
541
my $hit_desc = shift @$hit;
542
my $hit_signif = shift @$hit;
543
my $hit_score = shift @$hit;
544
my $num_domains = $domaincounter{$hit_name} || 0;
546
$self->start_element( { 'Name' => 'Hit' } );
555
'Name' => 'Hit_desc',
561
'Name' => 'Hit_signif',
562
'Data' => $hit_signif
567
'Name' => 'Hit_score',
571
for my $i (1..$num_domains) {
572
my $key = $hit_name . "_" . $i;
573
my $hsp = $hsp_list[ $hspinfo{$key} ];
575
my $hsp_name = shift @$hsp;
576
$self->start_element( { 'Name' => 'Hsp' } );
578
'Name' => 'Hsp_identity',
582
'Name' => 'Hsp_positive',
586
'Name' => 'Hsp_hit-from',
587
'Data' => shift @$hsp
590
'Name' => 'Hsp_hit-to',
591
'Data' => shift @$hsp
594
'Name' => 'Hsp_query-from',
595
'Data' => shift @$hsp
598
'Name' => 'Hsp_query-to',
599
'Data' => shift @$hsp
602
'Name' => 'Hsp_score',
603
'Data' => shift @$hsp
606
'Name' => 'Hsp_evalue',
607
'Data' => shift @$hsp
610
'Name' => 'Hsp_hseq',
611
'Data' => shift @$hsp
614
'Name' => 'Hsp_midline',
615
'Data' => shift @$hsp
618
'Name' => 'Hsp_qseq',
619
'Data' => shift @$hsp
621
$self->end_element( { 'Name' => 'Hsp' } );
624
$self->end_element( { 'Name' => 'Hit' } );
632
print "missed: $_\n";
637
$self->end_element( { 'Name' => 'HMMER_Output' } );
638
my $result = $self->end_document();
646
Title : start_element
647
Usage : $eventgenerator->start_element
648
Function: Handles a start event
650
Args : hashref with at least 2 keys 'Data' and 'Name'
656
my ( $self, $data ) = @_;
658
# we currently don't care about attributes
659
my $nm = $data->{'Name'};
660
my $type = $MODEMAP{$nm};
662
if ( $self->_eventHandler->will_handle($type) ) {
663
my $func = sprintf( "start_%s", lc $type );
664
$self->_eventHandler->$func( $data->{'Attributes'} );
666
unshift @{ $self->{'_elements'} }, $type;
669
&& $type eq 'result' )
671
$self->{'_values'} = {};
672
$self->{'_result'} = undef;
679
Usage : $eventgeneartor->end_element
680
Function: Handles and end element event
682
Args : hashref with at least 2 keys 'Data' and 'Name'
688
my ( $self, $data ) = @_;
689
my $nm = $data->{'Name'};
690
my $type = $MODEMAP{$nm};
693
if ( $nm eq 'HMMER_program' ) {
694
if ( $self->{'_last_data'} =~ /(HMM\S+)/i ) {
695
$self->{'_reporttype'} = uc $1;
699
# Hsp are sort of weird, in that they end when another
700
# object begins so have to detect this in end_element for now
701
if ( $nm eq 'Hsp' ) {
702
foreach (qw(Hsp_qseq Hsp_midline Hsp_hseq)) {
703
my $data = $self->{'_last_hspdata'}->{$_};
704
if ($data && $_ eq 'Hsp_hseq') {
705
# replace hmm '.' gap symbol by '-'
715
$self->{'_last_hspdata'} = {};
718
if ( $self->_eventHandler->will_handle($type) ) {
719
my $func = sprintf( "end_%s", lc $type );
720
$rc = $self->_eventHandler->$func( $self->{'_reporttype'},
721
$self->{'_values'} );
723
my $lastelem = shift @{ $self->{'_elements'} };
725
elsif ( $MAPPING{$nm} ) {
726
if ( ref( $MAPPING{$nm} ) =~ /hash/i ) {
727
my $key = ( keys %{ $MAPPING{$nm} } )[0];
728
$self->{'_values'}->{$key}->{ $MAPPING{$nm}->{$key} } =
729
$self->{'_last_data'};
732
$self->{'_values'}->{ $MAPPING{$nm} } = $self->{'_last_data'};
733
# print "lastdata is " . $self->{'_last_data'} . "\n";
737
$self->debug("unknown nm $nm, ignoring\n");
739
$self->{'_last_data'} = ''; # remove read data if we are at
741
$self->{'_result'} = $rc if ( defined $type && $type eq 'result' );
748
Usage : $eventhandler->element({'Name' => $name, 'Data' => $str});
749
Function: Convienence method that calls start_element, characters, end_element
751
Args : Hash ref with the keys 'Name' and 'Data'
756
my ( $self, $data ) = @_;
757
$self->start_element($data);
758
$self->characters($data);
759
$self->end_element($data);
765
Usage : $eventgenerator->characters($str)
766
Function: Send a character events
773
my ( $self, $data ) = @_;
775
if ( $self->in_element('hsp')
776
&& $data->{'Name'} =~ /Hsp\_(qseq|hseq|midline)/o
777
&& defined $data->{'Data'} )
779
$self->{'_last_hspdata'}->{ $data->{'Name'} } .= $data->{'Data'};
781
return unless ( defined $data->{'Data'} && $data->{'Data'} !~ /^\s+$/o );
783
$self->{'_last_data'} = $data->{'Data'};
786
=head2 within_element
788
Title : within_element
789
Usage : if( $eventgenerator->within_element( $element ) ) {}
790
Function: Test if we are within a particular element
791
This is different than 'in' because within can be tested for
794
Args : string element name
799
my ( $self, $name ) = @_;
802
|| !defined $self->{'_elements'}
803
|| scalar @{ $self->{'_elements'} } == 0 );
804
foreach ( @{ $self->{'_elements'} } ) {
805
return 1 if ( $_ eq $name );
813
Usage : if( $eventgenerator->in_element( $element ) ) {}
814
Function: Test if we are in a particular element
815
This is different than 'within' because 'in' only
816
tests its immediate parent
818
Args : string element name
823
my ( $self, $name ) = @_;
824
return 0 if !defined $self->{'_elements'}->[0];
825
return ( $self->{'_elements'}->[0] eq $name );
828
=head2 start_document
830
Title : start_document
831
Usage : $eventgenerator->start_document
832
Function: Handle a start document event
840
$self->{'_lasttype'} = '';
841
$self->{'_values'} = {};
842
$self->{'_result'} = undef;
843
$self->{'_elements'} = [];
849
Usage : $eventgenerator->end_document
850
Function: Handles an end document event
851
Returns : Bio::Search::Result::ResultI object
858
return $self->{'_result'};
864
Usage : my $count = $searchio->result_count
865
Function: Returns the number of results processed
873
return $self->{'_result_count'};