530
Usage : $hsp->frame($queryframe,$subjectframe)
542
Usage : my ($qframe, $hframe) = $hsp->frame('list',$queryframe,$subjectframe)
531
543
Function: Set the Frame for both query and subject and insure that
533
545
This overrides the frame() method implementation in
535
Returns : array of query and subjects if return type wants an array
536
or query frame if defined or subject frame
547
Returns : array of query and subject frame if return type wants an array
548
or query frame if defined or subject frame if not defined
549
Args : 'hit' or 'subject' or 'sbjct' to retrieve the frame of the subject (default)
550
'query' to retrieve the query frame
551
'list' or 'array' to retrieve both query and hit frames together
538
552
Note : Frames are stored in the GFF way (0-2) not 1-3
539
553
as they are in BLAST (negative frames are deduced by checking
540
554
the strand of the query or hit)
558
# Note: changed 4/19/08 - bug 2485
560
# frame() is supposed to be a getter/setter (as is implied by the Function desc
561
# above; this is also consistent with Bio::SeqFeature::SimilarityPair). Also,
562
# the API is not consistent with the other HSP/SimilarityPair methods such as
563
# strand(), start(), end(), etc.
565
# In order to make this consistent with other methods all work is now done
566
# when the features are instantiated and not delayed. We compromise by
567
# defaulting back 'to hit' w/o passed args. Setting is now allowed.
545
570
my $self = shift;
547
unless (defined $self->{_did_preframe}) {
550
my $qframe = $self->{QUERY_FRAME};
551
my $sframe = $self->{HIT_FRAME};
553
if( defined $qframe ) {
556
} elsif( $qframe !~ /^([+-])?([1-3])/ ) {
557
$self->warn("Specifying an invalid query frame ($qframe)");
561
$dir = '+' unless defined $dir;
562
if( ($dir eq '-' && $self->query->strand >= 0) ||
563
($dir eq '+' && $self->query->strand <= 0) ) {
564
$self->warn("Query frame ($qframe) did not match strand of query (". $self->query->strand() . ")");
566
# Set frame to GFF [0-2] -
567
# what if someone tries to put in a GFF frame!
570
$self->query->frame($qframe);
572
if( defined $sframe ) {
575
} elsif( $sframe !~ /^([+-])?([1-3])/ ) {
576
$self->warn("Specifying an invalid subject frame ($sframe)");
580
$dir = '+' unless defined $dir;
581
if( ($dir eq '-' && $self->hit->strand >= 0) ||
582
($dir eq '+' && $self->hit->strand <= 0) )
584
$self->warn("Subject frame ($sframe) did not match strand of subject (". $self->hit->strand() . ")");
587
# Set frame to GFF [0-2]
590
$self->hit->frame($sframe);
592
if (wantarray() && $self->algorithm =~ /^T(BLAST|FAST)(X|Y|XY)/oi)
594
return ($self->query->frame(), $self->hit->frame());
595
} elsif (wantarray()) {
596
($self->query->frame() &&
597
return ($self->query->frame(), undef)) ||
598
($self->hit->frame() &&
599
return (undef, $self->hit->frame()));
601
($self->query->frame() &&
602
return $self->query->frame()) ||
603
($self->hit->frame() &&
604
return $self->hit->frame());
573
# unfortunately, w/o args we need to warn about API changes
574
$self->warn("API for frame() has changed.\n".
575
"Please refer to documentation for Bio::Search::HSP::GenericHSP;\n".
576
"returning query frame");
581
if( $val =~ /^q/i ) {
582
return $self->query->frame(@_);
583
} elsif( $val =~ /^hi|^s/i ) {
584
return $self->hit->frame(@_);
585
} elsif ( $val =~ /^list|array/i ) {
586
return ($self->query->frame($_[0]),
587
$self->hit->frame($_[1]) );
588
} elsif ( $val =~ /^\d+$/) {
589
# old API i.e. frame($query_frame, $hit_frame). This catches all but one
590
# case, where no arg is passed (so the hit is wanted).
591
$self->warn("API for frame() has changed.\n".
592
"Please refer to documentation for Bio::Search::HSP::GenericHSP");
594
return ($self->query->frame($val),
595
$self->hit->frame(@_) ) :
596
return $self->hit->frame($val,@_);
598
$self->warn("unrecognized component '$val' requested\n");
731
733
: May include ranges if collapse is true.
732
734
Argument : seq_type = 'query' or 'hit' or 'sbjct' (default = query)
733
735
: ('sbjct' is synonymous with 'hit')
734
: class = 'identical' or 'conserved' or 'nomatch' or 'gap'
735
: (default = identical)
736
: (can be shortened to 'id' or 'cons')
737
: or 'conserved-not-identical'
736
: class = 'identical' - identical positions
737
: 'conserved' - conserved positions
738
: 'nomatch' - mismatched residue or gap positions
739
: 'mismatch' - mismatched residue positions (no gaps)
740
: 'gap' - gap positions only
741
: 'frameshift'- frameshift positions only
742
: 'conserved-not-identical' - conserved positions w/o
744
: The name can be shortened to 'id' or 'cons' unless
745
: the name is ambiguous. The default value is
738
748
: collapse = boolean, if true, consecutive positions are merged
739
749
: using a range notation, e.g., "1 2 3 4 5 7 9 10 11"
740
750
: collapses to "1-5 7 9-11". This is useful for
741
751
: consolidating long lists. Default = no collapse.
754
Comments : For HSPs partially or completely derived from translated sequences
755
: (TBLASTN, BLASTX, TBLASTX, or similar), some positional data
756
: cannot easily be attributed to a single position (i.e. the
757
: positional data is ambiguous). In these cases all three codon
758
: positions are reported. Under these conditions you can check
759
: ambiguous_seq_inds() to determine whether the query, subject,
760
: or both are ambiguous.
745
762
See Also : L<Bio::Search::SearchUtils::collapse_nums()|Bio::Search::SearchUtils>,
746
763
L<Bio::Search::Hit::HitI::seq_inds()|Bio::Search::Hit::HitI>
794
816
if( $class eq '_gap' ) {
795
# this means that we are remapping the gap length that is stored
796
# in the hash (for example $self->{'_gapRes_query'} )
797
# so we'll return an array which has the values of the position of the
798
# of the gap (the key in the hash) + the gap length (value in the
799
# hash for this key - 1.
801
@ary = map { $_ > 1 ?
802
$_..($_ + $self->{"${class}Res$seqType"}->{$_} - 1) :
804
sort { $a <=> $b } keys %{ $self->{"${class}Res$seqType"}};
817
# this means that we are remapping the gap length that is stored
818
# in the hash (for example $self->{'_gapRes_query'} )
819
# so we'll return an array which has the values of the position of the
820
# of the gap (the key in the hash) + the gap length (value in the
821
# hash for this key - 1.
823
# changing this; since the index is the position prior to the insertion,
824
# repeat the position based on the number of gaps inserted
825
@ary = map { my @tmp;
826
# position holds number of gaps inserted
827
for my $g (1..$self->{seqinds}{"${class}Res$seqType"}->{$_}) {
831
sort { $a <=> $b } keys %{ $self->{seqinds}{"${class}Res$seqType"}};
805
832
} elsif( $class eq '_conservedall' ) {
806
833
@ary = sort { $a <=> $b }
807
keys %{ $self->{"_conservedRes$seqType"}},
808
keys %{ $self->{"_identicalRes$seqType"}},
834
keys %{ $self->{seqinds}{"_conservedRes$seqType"}},
835
keys %{ $self->{seqinds}{"_identicalRes$seqType"}},
810
@ary = sort { $a <=> $b } keys %{ $self->{"${class}Res$seqType"}};
837
@ary = sort { $a <=> $b } keys %{ $self->{seqinds}{"${class}Res$seqType"}};
812
839
require Bio::Search::BlastUtils if $collapse;
814
841
return $collapse ? &Bio::Search::SearchUtils::collapse_nums(@ary) : @ary;
844
=head2 ambiguous_seq_inds
846
Title : ambiguous_seq_inds
847
Purpose : returns a string denoting whether sequence indices for query,
848
: subject, or both are ambiguous
849
Returns : String; 'query', 'subject', 'query/subject', or empty string ''
851
Comments : For HSPs partially or completely derived from translated sequences
852
: (TBLASTN, BLASTX, TBLASTX, or similar), some positional data
853
: cannot easily be attributed to a single position (i.e. the
854
: positional data is ambiguous). In these cases all three codon
855
: positions are reported when using seq_inds(). Under these
856
: conditions you can check ambiguous_seq_inds() to determine whether
857
: the query, subject, or both are ambiguous.
858
See Also : L<Bio::Search::Hit::HSPI::seq_inds()>
862
sub ambiguous_seq_inds {
864
$self->_calculate_seq_positions();
865
return $self->{seqinds}{'_warnRes'} if exists $self->{seqinds}{'_warnRes'};
866
return $self->{seqinds}{'_warnRes'} = '';
818
869
=head2 Inherited from L<Bio::SeqFeature::SimilarityPair>
941
1004
my ($self,@args) = @_;
942
1005
return unless ( $self->{'_sequenceschanged'} );
943
1006
$self->{'_sequenceschanged'} = 0;
944
my ($mchar, $schar, $qchar);
945
1007
my ($seqString, $qseq,$sseq) = ( $self->homology_string(),
946
1008
$self->query_string(),
947
1009
$self->hit_string() );
949
# Using hashes to avoid saving duplicate residue numbers.
950
my %identicalList_query = ();
951
my %identicalList_sbjct = ();
952
my %conservedList_query = ();
953
my %conservedList_sbjct = ();
955
my %gapList_query = ();
956
my %gapList_sbjct = ();
957
my %nomatchList_query = ();
958
my %nomatchList_sbjct = ();
1010
my ($mlen, $qlen, $slen) = (CORE::length($seqString), CORE::length($qseq), CORE::length($sseq));
960
1011
my $qdir = $self->query->strand || 1;
961
1012
my $sdir = $self->hit->strand || 1;
962
my $resCount_query = ($qdir >=0) ? $self->query->end : $self->query->start;
963
my $resCount_sbjct = ($sdir >=0) ? $self->hit->end : $self->hit->start;
1013
my ($resCount_query, $endpoint_query) = ($qdir <=0) ? ($self->query->end, $self->query->start)
1014
: ($self->query->start, $self->query->end);
1015
my ($resCount_sbjct, $endpoint_sbjct) = ($sdir <=0) ? ($self->hit->end, $self->hit->start)
1016
: ($self->hit->start, $self->hit->end);
965
1018
my $prog = $self->algorithm;
966
1020
if( $prog =~ /FAST|SSEARCH|SMITH-WATERMAN/i ) {
1022
# we infer the end of the regional sequence where the first and last
1023
# non spaces are in the homology string
1024
# then we use the HSP->length to tell us how far to read
1025
# to cut off the end of the sequence
1027
my ($start, $rest) = (0,0);
1028
if( $seqString =~ /^(\s+)?(.*?)\s*$/ ) {
1029
($start, $rest) = ($1 ? CORE::length($1) : 0, CORE::length($2));
1032
$seqString = substr($seqString, $start, $rest);
1033
$qseq = substr($qseq, $start, $rest);
1034
$sseq = substr($sseq, $start, $rest);
1036
# commented out 10/29/08
1037
# removing frameshift symbols doesn't take into account the following:
1038
# 1) does not remove the same point in the homology string (get
1039
# positional errors)
1040
# 2) adjustments to the overall position in the string due to the
1041
# frameshift must be taken into consideration (get balancing errors)
1043
# Frameshifts will be handled directly in the main loop.
967
1046
# fasta reports some extra 'regional' sequence information
968
1047
# we need to clear out first
969
1048
# this seemed a bit insane to me at first, but it appears to
972
# we infer the end of the regional sequence where the first
973
# non space is in the homology string
974
# then we use the HSP->length to tell us how far to read
975
# to cut off the end of the sequence
977
# one possible problem is the sequence which
980
if( $seqString =~ /^(\s+)/ ) {
981
$start = CORE::length($1);
1051
#$qseq =~ s![\\\/]!!g;
1052
#$sseq =~ s![\\\/]!!g;
1055
$self->{seqinds}{'_warnRes'} = '';
1056
($self->{_sbjct_offset}, $self->{_query_offset}) = (1,1);
1057
if($prog =~ /^(?:PSI)?T(BLAST|FAST)(N|X|Y)/oi ) {
1058
$self->{_sbjct_offset} = 3;
1059
if ($1 eq 'BLAST' && $2 eq 'X') { #TBLASTX
1060
$self->{_query_offset} = 3;
1061
$self->{seqinds}{'_warnRes'} = 'query/subject';
1063
$self->{seqinds}{'_warnRes'} = 'subject';
984
$seqString = substr($seqString, $start,$self->length('total'));
985
$qseq = substr($qseq, $start,$self->length('total'));
986
$sseq = substr($sseq, $start,$self->length('total'));
988
$qseq =~ s![\\\/]!!g;
989
$sseq =~ s![\\\/]!!g;
992
if($prog =~ /^(PSI)?T(BLAST|FAST)N/oi ) {
993
$resCount_sbjct = int($resCount_sbjct / 3);
994
1065
} elsif($prog =~ /^(BLAST|FAST)(X|Y|XY)/oi ) {
995
$resCount_query = int($resCount_query / 3);
996
} elsif($prog =~ /^T(BLAST|FAST)(X|Y|XY)/oi ) {
997
$resCount_query = int($resCount_query / 3);
998
$resCount_sbjct = int($resCount_sbjct / 3);
1000
while( $mchar = chop($seqString) ) {
1001
($qchar, $schar) = (chop($qseq), chop($sseq));
1002
if( $mchar eq '+' || $mchar eq '.' || $mchar eq ':' ) {
1003
$conservedList_query{ $resCount_query } = 1;
1004
$conservedList_sbjct{ $resCount_sbjct } = 1;
1005
} elsif( $mchar ne ' ' ) {
1006
$identicalList_query{ $resCount_query } = 1;
1007
$identicalList_sbjct{ $resCount_sbjct } = 1;
1008
} elsif( $mchar eq ' ') {
1009
$nomatchList_query{ $resCount_query } = 1;
1010
$nomatchList_sbjct{ $resCount_sbjct } = 1;
1012
if( $qchar eq $GAP_SYMBOL ) {
1013
$gapList_query{ $resCount_query } ++;
1015
$resCount_query -= $qdir;
1017
if( $schar eq $GAP_SYMBOL ) {
1018
$gapList_sbjct{ $resCount_query } ++;
1020
$resCount_sbjct -=$sdir;
1023
$self->{'_identicalRes_query'} = \%identicalList_query;
1024
$self->{'_conservedRes_query'} = \%conservedList_query;
1025
$self->{'_nomatchRes_query'} = \%nomatchList_query;
1026
$self->{'_gapRes_query'} = \%gapList_query;
1028
$self->{'_identicalRes_sbjct'} = \%identicalList_sbjct;
1029
$self->{'_conservedRes_sbjct'} = \%conservedList_sbjct;
1030
$self->{'_nomatchRes_sbjct'} = \%nomatchList_sbjct;
1031
$self->{'_gapRes_sbjct'} = \%gapList_sbjct;
1066
$self->{_query_offset} = 3;
1067
$self->{seqinds}{'_warnRes'} = 'query';
1069
my ($qfs, $sfs) = (0,0);
1071
for my $pos (0..CORE::length($seqString)-1) {
1072
my @qrange = (0 - $qfs)..($self->{_query_offset} - 1);
1073
my @srange = (0 - $sfs)..($self->{_sbjct_offset} - 1);
1074
# $self->debug("QRange:@qrange SRange:@srange\n") if ($qfs || $sfs);
1075
($qfs, $sfs) = (0,0);
1076
my ($mchar, $qchar, $schar) = (
1077
unpack("x$pos A1",$seqString) || ' ',
1078
$pos < CORE::length($qseq) ? unpack("x$pos A1",$qseq) : '-',
1079
$pos < CORE::length($sseq) ? unpack("x$pos A1",$sseq) : '-'
1082
my ($qgap, $sgap) = (0,0);
1083
if( $mchar eq '+' || $mchar eq '.') { # conserved
1084
$self->{seqinds}{_conservedRes_query}{ $resCount_query + ($_ * $qdir) } = 1 for @qrange;
1085
$self->{seqinds}{_conservedRes_sbjct}{ $resCount_sbjct + ($_ * $sdir) } = 1 for @srange;
1086
$matchtype = 'conserved';
1087
} elsif( $mchar eq ':' || $mchar ne ' ' ) { # identical
1088
$self->{seqinds}{_identicalRes_query}{ $resCount_query + ($_ * $qdir) } = 1 for @qrange;
1089
$self->{seqinds}{_identicalRes_sbjct}{ $resCount_sbjct + ($_ * $sdir) } = 1 for @srange;
1090
$matchtype = 'identical';
1091
} elsif( $mchar eq ' ' ) { # possible mismatch/nomatch/frameshift
1092
$qfs = $qchar eq '/' ? -1 : # base inserted to match frame
1093
$qchar eq '\\' ? 1 : # base deleted to match frame
1095
$sfs = $schar eq '/' ? -1 :
1096
$schar eq '\\' ? 1 :
1099
# Frameshifts are tricky.
1101
# '/' indicates that the next residue is shifted back one
1102
# (-1) frame position and is a deletion of one base (so to
1103
# correctly align, a base is inserted). That residue should only
1104
# occupy two sequence positions instead of three.
1106
# '\' indicates that the next residue is shifted forward
1107
# one (+1) frame position and is an insertion of one base (so to
1108
# correctly align, a base is removed). That residue should
1109
# occupy four sequence positions instead of three.
1111
# Note that gaps are not counted across from frameshifts.
1112
# Frameshift indices are a range of positions starting in the
1113
# previous sequence position in which the frameshift occurs;
1114
# they are ambiguous by nature.
1115
$self->{seqinds}{_frameshiftRes_query}{ $resCount_query - ($_ * $qdir * $qfs) } = $qfs for @qrange;
1116
$matchtype = "$qfs frameshift-query";
1120
$self->{seqinds}{_frameshiftRes_sbjct}{ $resCount_sbjct - ($_ * $sdir * $sfs) } = $sfs for @srange;
1121
$matchtype = "$sfs frameshift-sbcjt";
1124
elsif ($qchar eq $self->{GAP_SYMBOL}) {
1125
# gap are counted as being in the immediately preceeding residue
1126
# position; for translated sequences this is not the start of
1127
# the previous codon, but the third codon position
1128
$self->{seqinds}{_gapRes_query}{ $resCount_query - $qdir }++ for @qrange;
1129
$matchtype = 'gap-query';
1132
elsif ($schar eq $self->{GAP_SYMBOL}) {
1133
$self->{seqinds}{_gapRes_sbjct}{ $resCount_sbjct - $sdir }++ for @srange;
1134
$matchtype = 'gap-sbjct';
1138
# if not a gap or frameshift in either seq, must be mismatch
1139
$self->{seqinds}{_mismatchRes_query}{ $resCount_query + ($_ * $qdir) } = 1 for @qrange;
1140
$self->{seqinds}{_mismatchRes_sbjct}{ $resCount_sbjct + ($_ * $sdir) } = 1 for @srange;
1141
$matchtype = 'mismatch';
1143
# always add a nomatch unless the current position in the seq is a gap
1145
$self->{seqinds}{_nomatchRes_sbjct}{ $resCount_sbjct + ($_ * $sdir) } = 1 for @srange;
1148
$self->{seqinds}{_nomatchRes_query}{ $resCount_query + ($_ * $qdir) } = 1 for @qrange;
1151
$self->warn("Unknown midline character: [$mchar]");
1153
# leave in and uncomment for future debugging
1154
#$self->debug(sprintf("%7d %1s[%1s]%1s %-7d Type: %-20s QOff:%-2d SOff:%-2d\n",
1161
# ($self->{_query_offset} * $qdir),
1162
# ($self->{_sbjct_offset} * $sdir)));
1163
$resCount_query += ($qdir * (scalar(@qrange) + $qfs)) if !$qgap;
1164
$resCount_sbjct += ($sdir * (scalar(@srange) + $sfs)) if !$sgap;