1
# $Id: Statistics.pm,v 1.16 2003/12/06 18:08:46 jason Exp $
3
# BioPerl module for Bio::PopGen::Statistics
5
# Cared for by Jason Stajich <jason-at-bioperl-dot-org>
7
# Copyright Jason Stajich
9
# You may distribute this module under the same terms as perl itself
11
# POD documentation - main docs before the code
15
Bio::PopGen::Statistics - Population Genetics statistical tests
19
use Bio::PopGen::Statistics;
22
use Bio::PopGen::Simulation::Coalescent;
24
my $sim = new Bio::PopGen::Simulation::Coalescent( -sample_size => 12);
26
my $tree = $sim->next_tree;
28
$factory->add_Mutations($tree,20);
30
my $stats = new Bio::PopGen::Statistics();
31
my $individuals = [ $tree->get_leaf_nodes];
32
my $pi = $stats->pi($individuals);
33
my $D = $stats->tajima_d($individuals);
35
# Alternatively to do this on input data from
36
# See the tests in t/PopGen.t for more examples
37
my $parser = new Bio::PopGen::IO(-format => 'prettybase',
38
-file => 't/data/popstats.prettybase');
39
my $pop = $parser->next_population;
40
# Note that you can also call the stats as a class method if you like
41
# the only reason to instantiate it (as above) is if you want
42
# to set the verbosity for debugging
43
$pi = Bio::PopGen::Statistics->pi($pop);
44
$theta = Bio::PopGen::Statistics->theta($pop);
46
# Pi and Theta also take additional arguments,
47
# see the documentation for more information
50
# To come -- examples for creating pops/individuals from
51
# Aligned sequence data
55
This object is intended to provide implementations some standard
56
population genetics statistics about alleles in populations.
58
This module was previously named Bio::Tree::Statistics.
60
This object is a place to accumulate routines for calculating various
61
statistics from the coalescent simulation, marker/allele, or from
62
aligned sequence data given that you can calculate alleles, number of
65
Currently implemented:
66
Fu and Li's D (fu_and_li_D)
67
Fu and Li's D* (fu_and_li_D_star)
68
Fu and Li's F (fu_and_li_F)
71
pi (pi) - number of pairwise differences
72
composite_LD (composite_LD)
74
In all cases where a the method expects an arrayref of
75
L<Bio::PopGen::IndividualI> objects and L<Bio::PopGen::PopulationI>
76
object will also work.
80
Fu Y.X and Li W.H. (1993) "Statistical Tests of Neutrality of
81
Mutations." Genetics 133:693-709.
83
Fu Y.X. (1996) "New Statistical Tests of Neutrality for DNA samples
84
from a Population." Genetics 143:557-570.
86
Tajima F. (1989) "Statistical method for testing the neutral mutation
87
hypothesis by DNA polymorphism." Genetics 123:585-595.
94
User feedback is an integral part of the evolution of this and other
95
Bioperl modules. Send your comments and suggestions preferably to
96
the Bioperl mailing list. Your participation is much appreciated.
98
bioperl-l@bioperl.org - General discussion
99
http://bioperl.org/MailList.shtml - About the mailing lists
101
=head2 Reporting Bugs
103
Report bugs to the Bioperl bug tracking system to help us keep track
104
of the bugs and their resolution. Bug reports can be submitted via
107
http://bugzilla.bioperl.org/
109
=head1 AUTHOR - Jason Stajich, Matthew Hahn
111
Email jason-at-bioperl-dot-org
112
Matt Hahn E<lt>matthew.hahn-at-duke.dukeE<gt>
116
Additional contributors names and emails here
120
The rest of the documentation details each of the object methods.
121
Internal methods are usually preceded with a _
126
# Let the code begin...
129
package Bio::PopGen::Statistics;
135
@ISA = qw(Bio::Root::Root );
140
Usage : my $obj = new Bio::PopGen::Statistics();
141
Function: Builds a new Bio::PopGen::Statistics object
142
Returns : an instance of Bio::PopGen::Statistics
152
Usage : my $D = $statistics->fu_an_li_D(\@ingroup,$extmutations);
153
Function: Fu and Li D statistic for a list of individuals
154
given an outgroup and the number of external mutations
155
(either provided or calculated from list of outgroup individuals)
157
Args : $individuals - array refence which contains ingroup individuals
158
(L<Bio::PopGen::Individual> or derived classes)
159
$extmutations - number of external mutations OR
160
arrayref of outgroup individuals
165
my ($self,$ingroup,$outgroup) = @_;
167
my ($seg_sites,$pi,$sample_size,$ancestral,$derived);
168
if( ref($ingroup) =~ /ARRAY/i ) {
169
$sample_size = scalar @$ingroup;
170
# pi - all pairwise differences
171
$pi = $self->pi($ingroup);
172
$seg_sites = $self->segregating_sites_count($ingroup);
173
} elsif( ref($ingroup) &&
174
$ingroup->isa('Bio::PopGen::PopulationI')) {
175
$sample_size = $ingroup->get_number_individuals;
176
$pi = $self->pi($ingroup);
177
$seg_sites = $self->segregating_sites_count($ingroup);
179
$self->throw("expected an array reference of a list of Bio::PopGen::IndividualI OR a Bio::PopGen::PopulationI object to fu_and_li_D");
183
if( $seg_sites <= 0 ) {
184
$self->warn("mutation total was not > 0, cannot calculate a Fu and Li D");
188
if( ! defined $outgroup ) {
189
$self->warn("Need to provide either an array ref to the outgroup individuals or the number of external mutations");
191
} elsif( ref($outgroup) ) {
192
($ancestral,$derived) = $self->derived_mutations($ingroup,$outgroup);
194
$ancestral = $outgroup;
197
for(my $k= 1; $k < $sample_size; $k++ ) {
202
for(my $k= 1; $k < $sample_size; $k++ ) {
206
my $c = 2 * ( ( ( $sample_size * $a ) - (2 * ( $sample_size -1 ))) /
207
( ( $sample_size - 1) * ( $sample_size - 2 ) ) );
209
my $v = 1 + ( ( $a**2 / ( $b + $a**2 ) ) * ( $c - ( ( $sample_size + 1) /
210
( $sample_size - 1) ) ));
213
my $D = ( $seg_sites - ( $a * $ancestral) ) /
214
( sqrt ( ($u * $seg_sites ) + ( $v * $seg_sites **2) ) );
219
=head2 fu_and_li_D_star
221
Title : fu_and_li_D_star
222
Usage : my $D = $statistics->fu_an_li_D_star(\@individuals);
223
Function: Fu and Li's D* statistic for a set of samples
225
Returns : decimal number
226
Args : array ref of L<Bio::PopGen::IndividualI> objects
228
L<Bio::PopGen::PopulationI> object
235
sub fu_and_li_D_star {
236
my ($self,$individuals) = @_;
238
my ($seg_sites,$pi,$sample_size,$singletons);
239
if( ref($individuals) =~ /ARRAY/i ) {
240
$sample_size = scalar @$individuals;
241
# pi - all pairwise differences
242
$pi = $self->pi($individuals);
243
$seg_sites = $self->segregating_sites_count($individuals);
244
$singletons = $self->singleton_count($individuals);
245
} elsif( ref($individuals) &&
246
$individuals->isa('Bio::PopGen::PopulationI')) {
247
my $pop = $individuals;
248
$sample_size = $pop->get_number_individuals;
249
$pi = $self->pi($pop);
250
$seg_sites = $self->segregating_sites_count($pop);
251
$singletons = $self->singleton_count($pop);
253
$self->throw("expected an array reference of a list of Bio::PopGen::IndividualI OR a Bio::PopGen::PopulationI object to tajima_D");
258
for(my $k= 1; $k < $sample_size; $k++ ) {
263
for(my $k= 1; $k <= $sample_size; $k++ ) {
268
for(my $k= 1; $k < $sample_size; $k++ ) {
272
my $c = 2 * ( ( ( $sample_size * $a ) - (2 * ( $sample_size -1 ))) /
273
( ( $sample_size - 1) * ( $sample_size - 2 ) ) );
275
my $d = $c + ( ($sample_size -2) / ($sample_size - 1)**2 ) +
276
( 2 / ($sample_size -1) *
277
( (3/2) - ( (2*$a1 - 3) / ($sample_size -2) ) -
280
my $v_star = ( ( ($sample_size/($sample_size-1) )**2)*$b + (($a**2)*$d) -
281
(2*( ($sample_size*$a*($a+1)) )/(($sample_size-1)**2)) ) /
284
my $u_star = ( ($sample_size/($sample_size-1))*
286
($sample_size-1)))) - $v_star;
288
my $D_star = ( (($sample_size/($sample_size-1))*$seg_sites) -
290
( sqrt( ($u_star*$seg_sites) + ($v_star*($seg_sites**2)) ));
297
Usage : my $D = Bio::PopGen::Statistics->fu_and_li_F(\@ingroup,$ext_muts);
298
Function: Calculate Fu and Li's F on an ingroup with either the set of
299
outgroup individuals, or the number of external mutations
300
Returns : decimal number
301
Args : array ref of L<Bio::PopGen::IndividualI> objects for the ingroup
302
OR a L<Bio::PopGen::PopulationI> object
303
number of external mutations OR list of individuals for the outgroup
310
my ($self,$ingroup,$outgroup) = @_;
311
my ($seg_sites,$pi,$sample_size,$external,$internal);
312
if( ref($ingroup) =~ /ARRAY/i ) {
313
$sample_size = scalar @$ingroup;
314
# pi - all pairwise differences
315
$pi = $self->pi($ingroup);
316
$seg_sites = $self->segregating_sites_count($ingroup);
317
} elsif( ref($ingroup) &&
318
$ingroup->isa('Bio::PopGen::PopulationI')) {
319
$sample_size = $ingroup->get_number_individuals;
320
$pi = $self->pi($ingroup);
321
$seg_sites = $self->segregating_sites_count($ingroup);
323
$self->throw("expected an array reference of a list of Bio::PopGen::IndividualI OR a Bio::PopGen::PopulationI object to tajima_D");
327
if( ! defined $outgroup ) {
328
$self->warn("Need to provide either an array ref to the outgroup individuals or the number of external mutations");
330
} elsif( ref($outgroup) ) {
331
($external,$internal) = $self->derived_mutations($ingroup,$outgroup);
333
$external = $outgroup;
337
for(my $k= 1; $k < $sample_size; $k++ ) {
342
for(my $k= 1; $k <= $sample_size; $k++ ) {
347
for(my $k= 1; $k < $sample_size; $k++ ) {
351
my $c = 2 * ( ( ( $sample_size * $a ) - (2 * ( $sample_size -1 ))) /
352
( ( $sample_size - 1) * ( $sample_size - 2 ) ) );
354
my $v_F = ( $c + ( (2*(($sample_size**2)+$sample_size+3)) /
355
( (9*$sample_size)*($sample_size-1) ) ) -
356
(2/($sample_size-1)) ) / ( ($a**2)+$b );
358
my $u_F = ( 1 + ( ($sample_size+1)/(3*($sample_size-1)) )-
359
( 4*( ($sample_size+1)/(($sample_size-1)**2) ))*
360
($a1 - ((2*$sample_size)/($sample_size+1))) ) /
363
my $F = ($pi - $external) / ( sqrt( ($u_F*$seg_sites) +
364
($v_F*($seg_sites**2)) ) );
369
=head2 fu_and_li_F_star
371
Title : fu_and_li_F_star
372
Usage : my $D = Bio::PopGen::Statistics->fu_and_li_F_star(\@ingroup);
373
Function: Calculate Fu and Li's F* on an ingroup without an outgroup
374
It uses count of singleton alleles instead
375
Returns : decimal number
376
Args : array ref of L<Bio::PopGen::IndividualI> objects for the ingroup
378
L<Bio::PopGen::PopulationI> object
382
#' keep my emacs happy
384
sub fu_and_li_F_star {
385
my ($self,$individuals) = @_;
387
my ($seg_sites,$pi,$sample_size,$singletons);
388
if( ref($individuals) =~ /ARRAY/i ) {
389
$sample_size = scalar @$individuals;
390
# pi - all pairwise differences
391
$pi = $self->pi($individuals);
392
$seg_sites = $self->segregating_sites_count($individuals);
393
$singletons = $self->singleton_count($individuals);
394
} elsif( ref($individuals) &&
395
$individuals->isa('Bio::PopGen::PopulationI')) {
396
my $pop = $individuals;
397
$sample_size = $pop->get_number_individuals;
398
$pi = $self->pi($pop);
399
$seg_sites = $self->segregating_sites_count($pop);
400
$singletons = $self->singleton_count($pop);
402
$self->throw("expected an array reference of a list of Bio::PopGen::IndividualI OR a Bio::PopGen::PopulationI object to tajima_D");
406
for(my $k= 1; $k < $sample_size; $k++ ) {
411
for(my $k= 1; $k <= $sample_size; $k++ ) {
416
for(my $k= 1; $k < $sample_size; $k++ ) {
420
my $c = 2 * ( (($sample_size * $a) - (2 * ( $sample_size -1 ))) /
421
(( $sample_size - 1) * ($sample_size - 2)) );
423
my $d = $c + ( ($sample_size -2)/ (($sample_size - 1)**2)) +
424
((2/($sample_size -1))*
425
((3/2) - ((2*$a1 - 3)/($sample_size -2)) -
428
my $v_F_star = ( $d + ( 2*($sample_size**2+$sample_size+3) /
429
(9*$sample_size*($sample_size-1))) -
430
( (2/($sample_size-1))*
431
(4*$b - 6 + (8/$sample_size))) )/
434
my $u_F_star = ( ($sample_size / ($sample_size-1)) +
435
(($sample_size+1)/(3*($sample_size-1))) -
436
( 2 * (2 / ($sample_size * ($sample_size-1)))) +
437
(2*( ($sample_size+1)/($sample_size-1)**2)*
438
($a1 - ((2*$sample_size)/($sample_size+1))) )) /
441
my $F_star = ( $pi - (( ($sample_size-1)/ $sample_size)*$singletons)) /
442
sqrt ( ($u_F_star*$seg_sites) + ($v_F_star*($seg_sites**2)));
449
Usage : my $D = Bio::PopGen::Statistics->tajima_D(\@samples);
450
Function: Calculate Tajima's D on a set of samples
451
Returns : decimal number
452
Args : array ref of L<Bio::PopGen::IndividualI> objects
454
L<Bio::PopGen::PopulationI> object
462
my ($self,$individuals) = @_;
463
my ($seg_sites,$pi,$sample_size);
465
if( ref($individuals) =~ /ARRAY/i ) {
466
$sample_size = scalar @$individuals;
467
# pi - all pairwise differences
468
$pi = $self->pi($individuals);
469
$seg_sites = $self->segregating_sites_count($individuals);
471
} elsif( ref($individuals) &&
472
$individuals->isa('Bio::PopGen::PopulationI')) {
473
my $pop = $individuals;
474
$sample_size = $pop->get_number_individuals;
475
$pi = $self->pi($pop);
476
$seg_sites = $self->segregating_sites_count($pop);
478
$self->throw("expected an array reference of a list of Bio::PopGen::IndividualI OR a Bio::PopGen::PopulationI object to tajima_D");
482
for(my $k= 1; $k < $sample_size; $k++ ) {
487
for(my $k= 1; $k < $sample_size; $k++ ) {
488
$a2 += ( 1 / $k**2 );
492
my $b1 = ( $sample_size + 1 ) / ( 3* ( $sample_size - 1) );
493
my $b2 = ( 2 * ( $sample_size ** 2 + $sample_size + 3) ) /
494
( ( 9 * $sample_size) * ( $sample_size - 1) );
495
my $c1 = $b1 - ( 1 / $a1 );
496
my $c2 = $b2 - ( ( $sample_size + 2 ) /
497
( $a1 * $sample_size))+( $a2 / $a1 ** 2);
499
my $e2 = $c2 / ( $a1**2 + $a2 );
501
my $D = ( $pi - ( $seg_sites / $a1 ) ) /
502
sqrt ( ($e1 * $seg_sites) + (( $e2 * $seg_sites) * ( $seg_sites - 1)));
510
Usage : my $pi = Bio::PopGen::Statistics->pi(\@inds)
511
Function: Calculate pi (...explain here...) given a list of individuals
512
which have the same number of markers/sites/mutation as
513
available from the get_Genotypes() call in
514
L<Bio::PopGen::IndividualI>
515
Returns : decimal number
516
Args : Arg1= array ref of L<Bio::PopGen::IndividualI> objects
517
which have markers/mutations. We expect all individuals to
518
have a marker - we will deal with missing data as a special case.
520
Arg1= L<Bio::PopGen::PopulationI> object. In the event that
521
only allele frequency data is available, storing it in
522
Population object will make this available.
523
num sites [optional], an optional second argument (integer)
524
which is the number of sites, then pi returned is pi/site.
529
my ($self,$individuals,$numsites) = @_;
530
my (%data,@marker_names,$sample_size);
532
if( ref($individuals) =~ /ARRAY/i ) {
533
# one possible argument is an arrayref of Bio::PopGen::IndividualI objs
534
@marker_names = $individuals->[0]->get_marker_names;
535
$sample_size = scalar @$individuals;
537
# Here we're calculating the allele frequencies
539
foreach my $ind ( @$individuals ) {
540
if( ! $ind->isa('Bio::PopGen::IndividualI') ) {
541
$self->warn("Expected an arrayref of Bio::PopGen::IndividualI objects, this is a ".ref($ind)."\n");
544
foreach my $m ( @marker_names ) {
545
foreach my $allele (map { $_->get_Alleles}
546
$ind->get_Genotypes($m) ) {
547
$data{$m}->{$allele}++;
552
while( my ($marker,$count) = each %marker_total ) {
553
foreach my $c ( values %{$data{$marker}} ) {
557
# %data will contain allele frequencies for each marker, allele
558
} elsif( ref($individuals) &&
559
$individuals->isa('Bio::PopGen::PopulationI') ) {
560
my $pop = $individuals;
561
$sample_size = $pop->get_number_individuals;
562
foreach my $marker( $pop->get_Markers ) {
563
push @marker_names, $marker->name;
564
$data{$marker->name} = {$marker->get_Allele_Frequencies};
567
$self->throw("expected an array reference of a list of Bio::PopGen::IndividualI to pi");
569
# doing all pairwise combinations
571
# For now we assume that all individuals have the same markers
572
my ($diffcount,$totalcompare) = (0,0);
574
foreach my $markerdat ( values %data ) {
575
my $totalalleles; # this will only be different among markers
576
# when there is missing data
577
my @alleles = keys %$markerdat;
578
foreach my $al ( @alleles ) { $totalalleles += $markerdat->{$al} }
579
for( my $i =0; $i < scalar @alleles -1; $i++ ) {
580
my ($a1,$a2) = ( $alleles[$i], $alleles[$i+1]);
581
$pi += $self->heterozygosity($sample_size,
582
$markerdat->{$a1} / $totalalleles,
583
$markerdat->{$a2} / $totalalleles);
586
$self->debug( "pi=$pi\n");
588
return $pi / $numsites;
597
Usage : my $theta = Bio::PopGen::Statistics->theta($sampsize,$segsites);
598
Function: Calculates theta (...explanation here... ) from the sample size
599
and the number of segregating sites.
600
Providing the third parameter, total number of sites will
601
return theta per site
602
Returns : decimal number
603
Args : sample size (integer),
604
num segregating sites (integer)
605
total sites (integer) [optional] (to calculate theta per site)
607
provide an arrayref of the L<Bio::PopGen::IndividualI> objects
608
total sites (integer) [optional] (to calculate theta per site)
610
provide an L<Bio::PopGen::PopulationI> object
611
total sites (integer)[optional]
617
my ( $sample_size, $seg_sites,$totalsites) = @_;
618
if( ref($sample_size) =~ /ARRAY/i ) {
619
my $samps = $sample_size;
620
$totalsites = $seg_sites; # only 2 arguments if one is an array
622
my @marker_names = $samps->[0]->get_marker_names;
623
# we need to calculate number of polymorphic sites
624
$seg_sites = $self->segregating_sites_count($samps);
625
$sample_size = scalar @$samps;
627
} elsif(ref($sample_size) &&
628
$sample_size->isa('Bio::PopGen::PopulationI') ) {
629
# This will handle the case when we pass in a PopulationI object
630
my $pop = $sample_size;
631
$totalsites = $seg_sites; # shift the arguments over by one
632
$sample_size = $pop->get_number_individuals;
633
$seg_sites = $self->segregating_sites_count($pop);
636
for(my $k= 1; $k < $sample_size; $k++ ) {
639
if( $totalsites ) { # 0 and undef are the same can't divide by them
640
$seg_sites /= $totalsites;
642
return $seg_sites / $a1;
645
=head2 singleton_count
647
Title : singleton_count
648
Usage : my ($singletons) = Bio::PopGen::Statistics->singleton_count(\@inds)
649
Function: Calculate the number of mutations/alleles which only occur once in
650
a list of individuals for all sites/markers
651
Returns : (integer) number of alleles which only occur once (integer)
652
Args : arrayref of L<Bio::PopGen::IndividualI> objects
654
L<Bio::PopGen::PopulationI> object
658
sub singleton_count {
659
my ($self,$individuals) = @_;
662
if( ref($individuals) =~ /ARRAY/ ) {
663
@inds = @$individuals;
664
} elsif( ref($individuals) &&
665
$individuals->isa('Bio::PopGen::PopulationI') ) {
666
my $pop = $individuals;
667
@inds = $pop->get_Individuals();
669
$self->warn("Need to provide a population which has individuals loaded, not just a population with allele frequencies");
673
$self->warn("Expected either a PopulationI object or an arrayref of IndividualI objects");
676
# find number of sites where a particular allele is only seen once
678
my ($singleton_allele_ct,%sites) = (0);
679
# first collect all the alleles into a hash structure
681
foreach my $n ( @inds ) {
682
if( ! $n->isa('Bio::PopGen::IndividualI') ) {
683
$self->warn("Expected an arrayref of Bio::PopGen::IndividualI objects, this is a ".ref($n)."\n");
686
foreach my $g ( $n->get_Genotypes ) {
687
my ($nm,@alleles) = ($g->marker_name, $g->get_Alleles);
688
foreach my $allele (@alleles ) {
689
$sites{$nm}->{$allele}++;
693
foreach my $site ( values %sites ) { # don't really care what the name is
694
foreach my $allelect ( values %$site ) { #
695
# find the sites which have an allele with only 1 copy
696
$singleton_allele_ct++ if( $allelect == 1 );
699
return $singleton_allele_ct;
702
# Yes I know that singleton_count and segregating_sites_count are
703
# basically processing the same data so calling them both is
704
# redundant, something I want to fix later but want to make things
705
# correct and simple first
707
=head2 segregating_sites_count
709
Title : segregating_sites_count
710
Usage : my $segsites = Bio::PopGen::Statistics->segregating_sites_count
711
Function: Gets the number of segregating sites (number of polymorphic sites)
712
Returns : (integer) number of segregating sites
713
Args : arrayref of L<Bio::PopGen::IndividualI> objects
715
L<Bio::PopGen::PopulationI> object
719
# perhaps we'll change this in the future
720
# to return the actual segregating sites
721
# so one can use this to pull in the names of those sites.
722
# Would be trivial if it is useful.
724
sub segregating_sites_count{
725
my ($self,$individuals) = @_;
726
my $type = ref($individuals);
728
if( $type =~ /ARRAY/i ) {
730
foreach my $n ( @$individuals ) {
731
if( ! $n->isa('Bio::PopGen::IndividualI') ) {
732
$self->warn("Expected an arrayref of Bio::PopGen::IndividualI objects, this is a ".ref($n)."\n");
735
foreach my $g ( $n->get_Genotypes ) {
736
my ($nm,@alleles) = ($g->marker_name, $g->get_Alleles);
737
foreach my $allele (@alleles ) {
738
$sites{$nm}->{$allele}++;
742
foreach my $site ( values %sites ) { # use values b/c we don't
743
# really care what the name is
744
# find the sites which >1 allele
745
$seg_sites++ if( keys %$site > 1 );
747
} elsif( $type && $individuals->isa('Bio::PopGen::PopulationI') ) {
748
foreach my $marker ( $individuals->get_Markers ) {
749
my @alleles = $marker->get_Alleles;
750
$seg_sites++ if ( scalar @alleles > 1 );
753
$self->warn("segregating_sites_count expects either a PopulationI object or a list of IndividualI objects");
760
=head2 heterozygosity
762
Title : heterozygosity
763
Usage : my $het = Bio::PopGen::Statistics->heterozygosity($sampsize,$freq1);
764
Function: Calculate the heterozgosity for a sample set for a set of alleles
765
Returns : decimal number
766
Args : sample size (integer)
767
frequency of one allele (fraction - must be less than 1)
768
[optional] frequency of another allele - this is only needed
769
in a non-binary allele system
771
Note : p^2 + 2pq + q^2
777
my ($self,$samp_size, $freq1,$freq2) = @_;
778
if( ! $freq2 ) { $freq2 = 1 - $freq1 }
779
if( $freq1 > 1 || $freq2 > 1 ) {
780
$self->warn("heterozygosity expects frequencies to be less than 1");
782
my $sum = ($freq1**2) + (($freq2)**2);
783
my $h = ( $samp_size*(1- $sum) ) / ($samp_size - 1) ;
787
=head2 derived_mutations
789
Title : derived_mutations
790
Usage : my $ext = Bio::PopGen::Statistics->derived_mutations($ingroup,$outgroup);
791
Function: Calculate the number of alleles or (mutations) which are ancestral
792
and the number which are derived (occurred only on the tips)
793
Returns : array of 2 items - number of external and internal derived
795
Args : ingroup - L<Bio::PopGen::IndividualI>s arrayref OR
796
L<Bio::PopGen::PopulationI>
797
outgroup- L<Bio::PopGen::IndividualI>s arrayref OR
798
L<Bio::PopGen::PopulationI> OR
799
a single L<Bio::PopGen::IndividualI>
803
sub derived_mutations{
804
my ($self,$ingroup,$outgroup) = @_;
805
my (%indata,%outdata,@marker_names);
807
# basically we have to do some type checking
808
# if that perl were typed...
809
my ($itype,$otype) = (ref($ingroup),ref($outgroup));
811
return $outgroup unless( $otype ); # we expect arrayrefs or objects, nums
812
# are already the value we
814
# pick apart the ingroup
816
if( ref($ingroup) =~ /ARRAY/i ) {
817
if( ! ref($ingroup->[0]) ||
818
! $ingroup->[0]->isa('Bio::PopGen::IndividualI') ) {
819
$self->warn("Expected an arrayref of Bio::PopGen::IndividualI objects or a Population for ingroup in external_mutations");
822
# we assume that all individuals have the same markers
823
# i.e. that they are aligned
824
@marker_names = $ingroup->[0]->get_marker_names;
825
for my $ind ( @$ingroup ) {
826
for my $m ( @marker_names ) {
827
for my $allele ( map { $_->get_Alleles }
828
$ind->get_Genotypes($m) ) {
829
$indata{$m}->{$allele}++;
833
} elsif( ref($ingroup) && $ingroup->isa('Bio::PopGen::PopulationI') ) {
834
@marker_names = $ingroup->get_marker_names;
835
for my $ind ( $ingroup->get_Individuals() ) {
836
for my $m ( @marker_names ) {
837
for my $allele ( map { $_->get_Alleles}
838
$ind->get_Genotypes($m) ) {
839
$indata{$m}->{$allele}++;
844
$self->warn("Need an arrayref of Bio::PopGen::IndividualI objs or a Bio::PopGen::Population for ingroup in external_mutations");
848
if( $otype =~ /ARRAY/i ) {
849
if( ! ref($outgroup->[0]) ||
850
! $outgroup->[0]->isa('Bio::PopGen::IndividualI') ) {
851
$self->warn("Expected an arrayref of Bio::PopGen::IndividualI objects or a Population for outgroup in external_mutations");
854
for my $ind ( @$outgroup ) {
855
for my $m ( @marker_names ) {
856
for my $allele ( map { $_->get_Alleles }
857
$ind->get_Genotypes($m) ) {
858
$outdata{$m}->{$allele}++;
863
} elsif( $otype->isa('Bio::PopGen::PopulationI') ) {
864
for my $ind ( $outgroup->get_Individuals() ) {
865
for my $m ( @marker_names ) {
866
for my $allele ( map { $_->get_Alleles}
867
$ind->get_Genotypes($m) ) {
868
$outdata{$m}->{$allele}++;
872
} elsif( $otype->isa('Bio::PopGen::PopulationI') ) {
873
$self->warn("Need an arrayref of Bio::PopGen::IndividualI objs or a Bio::PopGen::Population for outgroup in external_mutations");
877
# derived mutations are defined as
881
# derived mutations are G and T, A is the external mutation
885
# derived mutations A,T no external/ancestral mutations
891
my ($internal,$external);
892
foreach my $marker ( @marker_names ) {
893
my @outalleles = keys %{$outdata{$marker}};
894
my @in_alleles = keys %{$indata{$marker}};
895
next if( @outalleles > 1 || @in_alleles == 1);
896
for my $allele ( @in_alleles ) {
897
if( ! exists $outdata{$marker}->{$allele} ) {
898
if( $indata{$marker}->{$allele} == 1 ) {
906
return ($external, $internal);
913
Usage : %matrix = Bio::PopGen::Statistics->composite_LD($population);
914
Function: Calculate the Linkage Disequilibrium
915
This is for calculating LD for unphased data.
916
Other methods will be appropriate for phased haplotype data.
918
Returns : Hash of Hashes - first key is site 1,second key is site 2
919
and value is LD for those two sites.
920
my $LDarrayref = $matrix{$site1}->{$site2};
921
my ($ldval, $chisquared) = @$LDarrayref;
922
Args : L<Bio::PopGen::PopulationI> or arrayref of
923
L<Bio::PopGen::IndividualI>s
924
Reference: Weir B.S. (1996) "Genetic Data Analysis II",
925
Sinauer, Sunderlanm MA.
930
my ($self,$pop) = @_;
931
if( ref($pop) =~ /ARRAY/i ) {
932
if( ref($pop->[0]) && $pop->[0]->isa('Bio::PopGen::IndividualI') ) {
933
$pop = new Bio::PopGen::Population(-individuals => @$pop);
935
$self->warn("composite_LD expects a Bio::PopGen::PopulationI or an arrayref of Bio::PopGen::IndividualI objects");
938
} elsif( ! ref($pop) || ! $pop->isa('Bio::PopGen::PopulationI') ) {
939
$self->warn("composite_LD expects a Bio::PopGen::PopulationI or an arrayref of Bio::PopGen::IndividualI objects");
943
my @marker_names = $pop->get_marker_names;
944
my @inds = $pop->get_Individuals;
945
my $num_inds = scalar @inds;
947
# calculate allele frequencies for each marker from the population
948
# use the built-in get_Marker to get the allele freqs
949
# we still need to calculate the genotype frequencies
950
foreach my $marker_name ( @marker_names ) {
953
foreach my $ind ( @inds ) {
954
my ($genotype) = $ind->get_Genotypes(-marker => $marker_name);
955
my @alleles = sort $genotype->get_Alleles;
956
next if( scalar @alleles != 2);
957
my $genostr = join(',', @alleles);
958
$allelef{$alleles[0]}++;
959
$allelef{$alleles[1]}++;
962
# we should check for cases where there > 2 alleles or
963
# only 1 allele and throw out those markers.
964
my @alleles = sort keys %allelef;
965
my $allele_count = scalar @alleles;
966
# test if site is polymorphic
967
if( $allele_count != 2) {
968
# only really warn if we're seeing multi-allele
969
$self->warn("Skipping $marker_name because it has $allele_count alleles (".join(',',@alleles)."), \ncomposite_LD will currently only work for biallelic markers") if $allele_count > 2;
970
next; # skip this marker
973
# Need to do something here to detect alleles which aren't
975
if( length($alleles[0]) != 1 ||
976
length($alleles[1]) != 1 ) {
977
$self->warn("An individual has an allele which is not a single base, this is currently not supported in composite_LD - consider recoding the allele as a single character");
981
# fix the call for allele 1 (A or B) and
982
# allele 2 (a or b) in terms of how we'll do the
983
# N square from Weir p.126
984
$self->debug( "$alleles[0] is 1, $alleles[1] is 2 for $marker_name\n");
985
$lookup{$marker_name}->{'1'} = $alleles[0];
986
$lookup{$marker_name}->{'2'} = $alleles[1];
989
@marker_names = sort keys %lookup;
990
my $site_count = scalar @marker_names;
991
# where the final data will be stored
994
# standard way of generating pairwise combos
995
# LD is done by comparing all the pairwise site (marker)
996
# combinations and keeping track of the genotype and
997
# pairwise genotype (ie genotypes of the 2 sites) frequencies
998
for( my $i = 0; $i < $site_count - 1; $i++ ) {
999
my $site1 = $marker_names[$i];
1000
my (%genotypes, %total_genotype_count,
1001
%total_pairwisegeno_count,%pairwise_genotypes);
1002
for( my $j = $i+1; $j < $site_count ; $j++) {
1004
my (%genotypes, %total_genotype_count,
1005
%total_pairwisegeno_count,%pairwise_genotypes);
1007
my $site2 = $marker_names[$j];
1008
my (%allele_count,%allele_freqs) = (0,0);
1009
foreach my $ind ( @inds ) {
1010
# build string of genotype at site 1
1011
my ($genotype1) = $ind->get_Genotypes(-marker => $site1);
1012
my @alleles1 = sort $genotype1->get_Alleles;
1014
# if an individual has only one available allele
1015
# (has a blank or N for one of the chromosomes)
1016
# we don't want to use it in our calculation
1018
next unless( scalar @alleles1 == 2);
1019
my $genostr1 = join(',', @alleles1);
1021
# build string of genotype at site 2
1022
my ($genotype2) = $ind->get_Genotypes(-marker => $site2);
1023
my @alleles2 = sort $genotype2->get_Alleles;
1024
my $genostr2 = join(',', @alleles2);
1026
next unless( scalar @alleles2 == 2);
1028
$allele_count{$site1}++;
1029
$allele_freqs{$site1}->{$_}++;
1031
$genotypes{$site1}->{$genostr1}++;
1032
$total_genotype_count{$site1}++;
1035
$allele_count{$site2}++;
1036
$allele_freqs{$site2}->{$_}++;
1038
$genotypes{$site2}->{$genostr2}++;
1039
$total_genotype_count{$site2}++;
1041
# We are using the $site1,$site2 to signify
1043
$pairwise_genotypes{"$site1,$site2"}->{"$genostr1,$genostr2"}++;
1045
$total_pairwisegeno_count{"$site1,$site2"}++;
1047
for my $site ( %allele_freqs ) {
1048
for my $al ( keys %{ $allele_freqs{$site} } ) {
1049
$allele_freqs{$site}->{$al} /= $allele_count{$site};
1052
my $n = $total_pairwisegeno_count{"$site1,$site2"}; # number of inds
1053
# 'A' and 'B' are two loci or in our case site1 and site2
1054
my $allele1_site1 = $lookup{$site1}->{'1'}; # this is the BigA allele
1055
my $allele1_site2 = $lookup{$site2}->{'1'}; # this is the BigB allele
1056
my $allele2_site1 = $lookup{$site1}->{'2'}; # this is the LittleA allele
1057
my $allele2_site2 = $lookup{$site2}->{'2'}; # this is the LittleB allele
1059
my $N1genostr = join(",",( $allele1_site1, $allele1_site1,
1060
$allele1_site2, $allele1_site2));
1061
$self->debug(" [$site1,$site2](AABB) N1genostr=$N1genostr\n");
1063
my $N2genostr = join(",",( $allele1_site1, $allele1_site1,
1064
$allele1_site2, $allele2_site2));
1065
$self->debug(" [$site1,$site2](AABb) N2genostr=$N2genostr\n");
1067
my $N4genostr = join(",",( $allele1_site1, $allele2_site1,
1068
$allele1_site2, $allele1_site2));
1069
$self->debug(" [$site1,$site2](AaBB) N4genostr=$N4genostr\n");
1071
my $N5genostr = join(",",( $allele1_site1, $allele2_site1,
1072
$allele1_site2, $allele2_site2));
1073
$self->debug(" [$site1,$site2](AaBb) N5genostr=$N5genostr\n");
1075
my $n1 = $pairwise_genotypes{"$site1,$site2"}->{$N1genostr} || 0;
1077
my $n2 = $pairwise_genotypes{"$site1,$site2"}->{$N2genostr} || 0;
1079
my $n4 = $pairwise_genotypes{"$site1,$site2"}->{$N4genostr} || 0;
1081
my $n5 = $pairwise_genotypes{"$site1,$site2"}->{$N5genostr} || 0;
1083
my $homozA_site1 = join(",", ($allele1_site1,$allele1_site1));
1084
my $homozB_site2 = join(",", ($allele1_site2,$allele1_site2));
1085
my $p_AA = ($genotypes{$site1}->{$homozA_site1} || 0) / $n;
1086
my $p_BB = ($genotypes{$site2}->{$homozB_site2} || 0) / $n;
1087
my $p_A = $allele_freqs{$site1}->{$allele1_site1} || 0; # an individual allele freq
1090
my $p_B = $allele_freqs{$site2}->{$allele1_site2} || 0; # an individual allele freq
1093
# variance of allele frequencies
1094
my $pi_A = $p_A * $p_a;
1095
my $pi_B = $p_B * $p_b;
1098
my $D_A = $p_AA - $p_A**2;
1099
my $D_B = $p_BB - $p_B**2;
1100
my $n_AB = 2*$n1 + $n2 + $n4 + 0.5 * $n5;
1101
$self->debug("n_AB=$n_AB -- n1=$n1, n2=$n2 n4=$n4 n5=$n5\n");
1103
my $delta_AB = (1 / $n ) * ( $n_AB ) - ( 2 * $p_A * $p_B );
1104
$self->debug("delta_AB=$delta_AB -- n=$n, n_AB=$n_AB p_A=$p_A, p_B=$p_B\n");
1105
$self->debug(sprintf(" (%d * %.4f) / ( %.2f + %.2f) * ( %.2f + %.2f) \n",
1106
$n,$delta_AB**2, $pi_A, $D_A, $pi_B, $D_B));
1109
eval { $chisquared = ( $n * ($delta_AB**2) ) /
1110
( ( $pi_A + $D_A) * ( $pi_B + $D_B) );
1113
$self->debug("Skipping the site because the denom is 0.\nsite1=$site1, site2=$site2 : pi_A=$pi_A, pi_B=$pi_B D_A=$D_A, D_B=$D_B\n");
1116
# this will be an upper triangular matrix
1117
$stats_for_sites{$site1}->{$site2} = [$delta_AB,$chisquared];
1120
return %stats_for_sites;