1
# $Id: ProteinStatistics.pm,v 1.7.4.1 2006/10/02 23:10:12 sendu Exp $
3
# BioPerl module for Bio::Align::ProteinStatistics
5
# Cared for by Jason Stajich <jason-at-bioperl.org>
7
# Copyright Jason Stajich
9
# You may distribute this module under the same terms as perl itself
11
# POD documentation - main docs before the code
15
Bio::Align::ProteinStatistics - Calculate Protein Alignment statistics (mostly distances)
19
use Bio::Align::ProteinStatistics;
21
my $in = new Bio::AlignIO(-format => 'fasta',
22
-file => 'pep-104.fasaln');
23
my $aln = $in->next_aln;
25
my $pepstats = Bio::Align::ProteinStatistics->new();
26
$kimura = $protstats->distance(-align => $aln,
28
print $kimura->print_matrix;
33
This object is for generating various statistics from a protein
34
alignment. Mostly it is where pairwise protein distances can be
39
D_Kimura - Kimura, M. 1983. The Neutral Theory of Molecular Evolution. CUP,
46
User feedback is an integral part of the evolution of this and other
47
Bioperl modules. Send your comments and suggestions preferably to
48
the Bioperl mailing list. Your participation is much appreciated.
50
bioperl-l@bioperl.org - General discussion
51
http://bioperl.org/wiki/Mailing_lists - About the mailing lists
55
Report bugs to the Bioperl bug tracking system to help us keep track
56
of the bugs and their resolution. Bug reports can be submitted via the
59
http://bugzilla.open-bio.org/
61
=head1 AUTHOR - Jason Stajich
63
Email jason-at-bioperl.org
67
The rest of the documentation details each of the object methods.
68
Internal methods are usually preceded with a _
73
# Let the code begin...
76
package Bio::Align::ProteinStatistics;
77
use vars qw(%DistanceMethods $Precision $DefaultGapPenalty);
80
use Bio::Align::PairwiseStatistics;
81
use Bio::Matrix::PhylipDist;
83
%DistanceMethods = ('kimura|k' => 'Kimura',
86
$DefaultGapPenalty = 0;
88
use base qw(Bio::Root::Root Bio::Align::StatisticsI);
93
Usage : my $obj = new Bio::Align::ProteinStatistics();
94
Function: Builds a new Bio::Align::ProteinStatistics object
95
Returns : an instance of Bio::Align::ProteinStatistics
102
my($class,@args) = @_;
104
my $self = $class->SUPER::new(@args);
105
$self->pairwise_stats( new Bio::Align::PairwiseStatistics());
113
Usage : my $distance_mat = $stats->distance(-align => $aln,
115
Function: Calculates a distance matrix for all pairwise distances of
116
sequences in an alignment.
117
Returns : L<Bio::Matrix::PhylipDist> object
118
Args : -align => Bio::Align::AlignI object
119
-method => String specifying specific distance method
120
(implementing class may assume a default)
125
my ($self,@args) = @_;
126
my ($aln,$method) = $self->_rearrange([qw(ALIGN METHOD)],@args);
127
if( ! defined $aln || ! ref ($aln) || ! $aln->isa('Bio::Align::AlignI') ) {
128
$self->throw("Must supply a valid Bio::Align::AlignI for the -align parameter in distance");
130
$method ||= 'Kimura';
131
foreach my $m ( keys %DistanceMethods ) {
132
if(defined $m && $method =~ /$m/i ) {
133
my $mtd = "D_$DistanceMethods{$m}";
134
return $self->$mtd($aln);
137
$self->warn("Unrecognized distance method $method must be one of [".
138
join(',',$self->available_distance_methods())."]");
142
=head2 available_distance_methods
144
Title : available_distance_methods
145
Usage : my @methods = $stats->available_distance_methods();
146
Function: Enumerates the possible distance methods
147
Returns : Array of strings
153
sub available_distance_methods{
154
my ($self,@args) = @_;
155
return values %DistanceMethods;
158
=head2 D - distance methods
167
Usage : my $matrix = $pepstats->D_Kimura($aln);
168
Function: Calculate Kimura protein distance (Kimura 1983) which
169
approximates PAM distance
170
D = -ln ( 1 - p - 0.2 * p^2 )
171
Returns : L<Bio::Matrix::PhylipDist>
172
Args : L<Bio::Align::AlignI>
177
# Kimura, M. 1983. The Neutral Theory of Molecular Evolution. CUP, Cambridge.
180
my ($self,$aln) = @_;
181
return 0 unless $self->_check_arg($aln);
182
# ambiguities ignored at this point
183
my (@seqs,@names,@values,%dist);
185
foreach my $seq ( $aln->each_seq) {
186
push @names, $seq->display_id;
187
push @seqs, uc $seq->seq();
190
my $len = $aln->length;
191
my $precisionstr = "%.$Precision"."f";
193
for( my $i = 0; $i < $seqct-1; $i++ ) {
194
# (diagonals) distance is 0 for same sequence
195
$dist{$names[$i]}->{$names[$i]} = [$i,$i];
196
$values[$i][$i] = sprintf($precisionstr,0);
197
for( my $j = $i+1; $j < $seqct; $j++ ) {
198
my ($scored,$match) = (0,0);
199
for( my $k=0; $k < $len; $k++ ) {
200
my $m1 = substr($seqs[$i],$k,1);
201
my $m2 = substr($seqs[$j],$k,1);
202
if( $m1 ne '-' && $m2 ne '-' ) {
203
# score is number of scored bases (alignable bases)
204
# it could have also come from
205
# my $L = $self->pairwise_stats->number_of_comparable_bases($pairwise);
206
# match is number of matches weighting ambiguity bases
208
$match += _check_ambiguity_protein($m1,$m2);
212
# From Felsenstein's PHYLIP documentation:
213
# This is very quick to do but has some obvious
214
# limitations. It does not take into account which amino
215
# acids differ or to what amino acids they change, so some
216
# information is lost. The units of the distance measure
217
# are fraction of amino acids differing, as also in the
218
# case of the PAM distance. If the fraction of amino acids
219
# differing gets larger than 0.8541 the distance becomes
222
my $D = 1 - ( $match / $scored );
224
$D = - log ( 1 - $D - (0.2 * ($D ** 2)));
225
$values[$j][$i] = $values[$i][$j] = sprintf($precisionstr,$D);
227
$values[$j][$i] = $values[$i][$j] = ' NaN';
230
$dist{$names[$i]}->{$names[$j]} = [$i,$j];
231
$dist{$names[$j]}->{$names[$i]} = [$i,$j];
233
# (diagonals) distance is 0 for same sequence
234
$dist{$names[$j]}->{$names[$j]} = [$j,$j];
235
$values[$j][$j] = sprintf($precisionstr,0);
239
return Bio::Matrix::PhylipDist->new(-program => 'bioperl_PEPstats',
242
-values => \@values);
246
# some methods from EMBOSS distmat
247
sub _check_ambiguity_protein
252
if( $t1 ne 'X' && $t1 eq $t2 ) {
254
} elsif( ((($t1 eq 'B' && $t2 eq 'DN') ||
255
($t2 eq 'B' && $t2 eq 'DN'))) ||
257
( ($t1 eq 'Z' && $t2 eq 'EQ') ||
258
($t2 eq 'Z' && $t1 eq 'EQ'))) {
260
} elsif ( $t1 eq 'X' && $t2 eq 'X' ) {
262
} elsif( $t1 eq 'X' || $t2 eq 'X' ) {
272
=head2 pairwise_stats
274
Title : pairwise_stats
275
Usage : $obj->pairwise_stats($newval)
277
Returns : value of pairwise_stats
278
Args : newvalue (optional)
284
my ($self,$value) = @_;
285
if( defined $value) {
286
$self->{'_pairwise_stats'} = $value;
288
return $self->{'_pairwise_stats'};
293
my($self,$aln ) = @_;
294
if( ! defined $aln || ! $aln->isa('Bio::Align::AlignI') ) {
295
$self->warn("Must provide a Bio::Align::AlignI compliant object to Bio::Align::DNAStatistics");
297
} elsif( $aln->get_seq_by_pos(1)->alphabet ne 'protein' ) {
298
$self->warn("Must provide a protein alignment to Bio::Align::ProteinStatistics, you provided a " . $aln->get_seq_by_pos(1)->alphabet);