~ubuntu-branches/ubuntu/oneiric/bioperl/oneiric

$self->warn("Skipping $marker_name because it has $allele_count alleles (".join(',',@alleles)."), \ncomposite_LD will currently only work for biallelic markers") if $allele_count > 2;

970

next; # skip this marker

971

}

972

973

# Need to do something here to detect alleles which aren't

974

# a single character

975

if( length($alleles[0]) != 1 ||

976

length($alleles[1]) != 1 ) {

977

$self->warn("An individual has an allele which is not a single base, this is currently not supported in composite_LD - consider recoding the allele as a single character");

978

next;

979

}

980

981

# fix the call for allele 1 (A or B) and

982

# allele 2 (a or b) in terms of how we'll do the

983

# N square from Weir p.126

984

$self->debug( "$alleles[0] is 1, $alleles[1] is 2 for $marker_name\n");

985

$lookup{$marker_name}->{'1'} = $alleles[0];

986

$lookup{$marker_name}->{'2'} = $alleles[1];

987

}

988

989

@marker_names = sort keys %lookup;

990

my $site_count = scalar @marker_names;

991

# where the final data will be stored

992

my %stats_for_sites;

993

994

# standard way of generating pairwise combos

995

# LD is done by comparing all the pairwise site (marker)

996

# combinations and keeping track of the genotype and

997

# pairwise genotype (ie genotypes of the 2 sites) frequencies

998

for( my $i = 0; $i < $site_count - 1; $i++ ) {

999

my $site1 = $marker_names[$i];

1000

my (%genotypes, %total_genotype_count,

1001

%total_pairwisegeno_count,%pairwise_genotypes);

1002

for( my $j = $i+1; $j < $site_count ; $j++) {

1003

1004

my (%genotypes, %total_genotype_count,

1005

%total_pairwisegeno_count,%pairwise_genotypes);

1006

1007

my $site2 = $marker_names[$j];

1008

my (%allele_count,%allele_freqs) = (0,0);

1009

foreach my $ind ( @inds ) {

1010

# build string of genotype at site 1

1011

my ($genotype1) = $ind->get_Genotypes(-marker => $site1);

1012

my @alleles1 = sort $genotype1->get_Alleles;

1013

1014

# if an individual has only one available allele

1015

# (has a blank or N for one of the chromosomes)

1016

# we don't want to use it in our calculation

1017

1018

next unless( scalar @alleles1 == 2);

1019

my $genostr1 = join(',', @alleles1);

1020

1021

# build string of genotype at site 2

1022

my ($genotype2) = $ind->get_Genotypes(-marker => $site2);

1023

my @alleles2 = sort $genotype2->get_Alleles;

1024

my $genostr2 = join(',', @alleles2);

1025

1026

next unless( scalar @alleles2 == 2);

1027

for (@alleles1) {

1028

$allele_count{$site1}++;

1029

$allele_freqs{$site1}->{$_}++;

1030

}

1031

$genotypes{$site1}->{$genostr1}++;

1032

$total_genotype_count{$site1}++;

1033

1034

for (@alleles2) {

1035

$allele_count{$site2}++;

1036

$allele_freqs{$site2}->{$_}++;

1037

}

1038

$genotypes{$site2}->{$genostr2}++;

1039

$total_genotype_count{$site2}++;

1040

1041

# We are using the $site1,$site2 to signify

1042

# a unique key

1043

$pairwise_genotypes{"$site1,$site2"}->{"$genostr1,$genostr2"}++;

1044

# tally the individuals that have two alleles at both sites of this pair

1045

$total_pairwisegeno_count{"$site1,$site2"}++;

1046

}

1047

for my $site ( %allele_freqs ) {

1048

for my $al ( keys %{ $allele_freqs{$site} } ) {

1049

$allele_freqs{$site}->{$al} /= $allele_count{$site};

1050

}

1051

}

1052

my $n = $total_pairwisegeno_count{"$site1,$site2"}; # number of inds

1053

# 'A' and 'B' are two loci or in our case site1 and site2

1054

my $allele1_site1 = $lookup{$site1}->{'1'}; # this is the BigA allele

1055

my $allele1_site2 = $lookup{$site2}->{'1'}; # this is the BigB allele

1056

my $allele2_site1 = $lookup{$site1}->{'2'}; # this is the LittleA allele

1057

my $allele2_site2 = $lookup{$site2}->{'2'}; # this is the LittleB allele

1058

# AABB

1059

my $N1genostr = join(",",( $allele1_site1, $allele1_site1,

1060

$allele1_site2, $allele1_site2));

1061

$self->debug(" [$site1,$site2](AABB) N1genostr=$N1genostr\n");

1062

# AABb

1063

my $N2genostr = join(",",( $allele1_site1, $allele1_site1,

1064

$allele1_site2, $allele2_site2));

1065

$self->debug(" [$site1,$site2](AABb) N2genostr=$N2genostr\n");

1066

# AaBB

1067

my $N4genostr = join(",",( $allele1_site1, $allele2_site1,

1068

$allele1_site2, $allele1_site2));

1069

$self->debug(" [$site1,$site2](AaBB) N4genostr=$N4genostr\n");

1070

# AaBb

1071

my $N5genostr = join(",",( $allele1_site1, $allele2_site1,

1072

$allele1_site2, $allele2_site2));

1073

$self->debug(" [$site1,$site2](AaBb) N5genostr=$N5genostr\n");

1074

# count of AABB individuals for this site pair

1075

my $n1 = $pairwise_genotypes{"$site1,$site2"}->{$N1genostr} || 0;

1076

# count of AABb individuals for this site pair

1077

my $n2 = $pairwise_genotypes{"$site1,$site2"}->{$N2genostr} || 0;

1078

# count of AaBB individuals for this site pair

1079

my $n4 = $pairwise_genotypes{"$site1,$site2"}->{$N4genostr} || 0;

1080

# count of AaBb individuals for this site pair

1081

my $n5 = $pairwise_genotypes{"$site1,$site2"}->{$N5genostr} || 0;

1082

1083

my $homozA_site1 = join(",", ($allele1_site1,$allele1_site1));

1084

my $homozB_site2 = join(",", ($allele1_site2,$allele1_site2));

1085

my $p_AA = ($genotypes{$site1}->{$homozA_site1} || 0) / $n;

1086

my $p_BB = ($genotypes{$site2}->{$homozB_site2} || 0) / $n;

1087

my $p_A = $allele_freqs{$site1}->{$allele1_site1} || 0; # an individual allele freq

1088

my $p_a = 1 - $p_A;

1089

1090

my $p_B = $allele_freqs{$site2}->{$allele1_site2} || 0; # an individual allele freq

1091

my $p_b = 1 - $p_B;

1092

1093

# variance of allele frequencies

1094

my $pi_A = $p_A * $p_a;

1095

my $pi_B = $p_B * $p_b;

1096

1097

# Hardy-Weinberg disequilibrium at each site: homozygote freq minus squared allele freq

1098

my $D_A = $p_AA - $p_A**2;

1099

my $D_B = $p_BB - $p_B**2;

1100

my $n_AB = 2*$n1 + $n2 + $n4 + 0.5 * $n5;

1101

$self->debug("n_AB=$n_AB -- n1=$n1, n2=$n2 n4=$n4 n5=$n5\n");

1102

1103

my $delta_AB = (1 / $n ) * ( $n_AB ) - ( 2 * $p_A * $p_B );

1104

$self->debug("delta_AB=$delta_AB -- n=$n, n_AB=$n_AB p_A=$p_A, p_B=$p_B\n");

1105

$self->debug(sprintf(" (%d * %.4f) / ( %.2f + %.2f) * ( %.2f + %.2f) \n",

1106

$n,$delta_AB**2, $pi_A, $D_A, $pi_B, $D_B));

1107

1108

my $chisquared;

1109

eval { $chisquared = ( $n * ($delta_AB**2) ) /

1110

( ( $pi_A + $D_A) * ( $pi_B + $D_B) );

1111

};

1112

if( $@ ) {

1113

$self->debug("Skipping the site because the denom is 0.\nsite1=$site1, site2=$site2 : pi_A=$pi_A, pi_B=$pi_B D_A=$D_A, D_B=$D_B\n");

1114

next;

1115

}

1116

# this will be an upper triangular matrix

1117

$stats_for_sites{$site1}->{$site2} = [$delta_AB,$chisquared];

1118

}

1119

}

1120

return %stats_for_sites;

1121

}

1122

1123

1124

Older »