13
#include "gpatInfo.hpp"
16
using namespace vcflib;
28
vector<int> questionable;
29
vector<int> geno_index ;
30
vector< vector< double > > unphred_p;
34
double unphred(string phred){
35
double unphred = atof(phred.c_str());
36
unphred = unphred / -10;
40
void initPop(pop & population){
52
void loadPop( vector< map< string, vector<string> > >& group, pop & population){
54
vector< map< string, vector<string> > >::iterator targ_it = group.begin();
58
for(; targ_it != group.end(); targ_it++){
60
string genotype = (*targ_it)["GT"].front();
62
vector<double> phreds;
64
phreds.push_back( unphred((*targ_it)["PL"][0]));
65
phreds.push_back( unphred((*targ_it)["PL"][1]));
66
phreds.push_back( unphred((*targ_it)["PL"][2]));
69
double norm = log(exp(phreds[0]) + exp(phreds[1]) + exp(phreds[2]));
71
population.unphred_p.push_back(phreds);
74
if(genotype == "0/0"){
75
population.ngeno += 1;
76
population.nhomr += 1;
78
population.geno_index.push_back(0);
79
scaled = exp(phreds[0] - norm);
82
if(genotype == "0/1"){
83
population.ngeno += 1;
87
population.geno_index.push_back(1);
88
scaled = exp(phreds[1] - norm);
91
if(genotype == "1/1"){
92
population.ngeno += 1;
93
population.nhoma += 1;
95
population.geno_index.push_back(2);
96
scaled = exp(phreds[2] - norm);
99
if(genotype == "0|0"){
100
population.ngeno += 1;
101
population.nhomr += 1;
102
population.nref += 2;
103
population.geno_index.push_back(0);
104
scaled = exp(phreds[0] - norm);
107
if(genotype == "0|1"){
108
population.ngeno += 1;
109
population.nhet += 1;
110
population.nref += 1;
111
population.nalt += 1;
112
population.geno_index.push_back(1);
113
scaled = exp(phreds[1] - norm);
116
if(genotype == "1|1"){
117
population.ngeno += 1;
118
population.nhoma += 1;
119
population.nalt += 2;
120
population.geno_index.push_back(2);
121
scaled = exp(phreds[2] - norm);
124
cerr << "FATAL: unknown genotype" << endl;
129
population.questionable.push_back(index);
134
if(population.nalt == 0 && population.nref == 0){
139
population.af = (population.nalt / (population.nref + population.nalt));
141
if(population.nhet > 0){
142
population.fis = ( 1 - ((population.nhet/population.ngeno) / (2*population.af*(1 - population.af))));
147
if(population.fis < 0){
148
population.fis = 0.00001;
153
double bound(double v){
164
void phardy(vector<double>& results, double af, double fis){
166
double p0 = pow((1 - af),2) + ((1 - af)*af*fis);
167
double p1 = 2*(1 - af)*af*(1 - fis);
168
double p2 = pow(af,2) + ((1 - af)*af*fis);
170
results.push_back(p0);
171
results.push_back(p1);
172
results.push_back(p2);
176
double likelihood(pop & population, double af, double fis){
185
double loglikelihood = 0;
187
vector<double> genotypeProbs;
189
phardy(genotypeProbs, af, fis);
191
vector<int>::iterator it = population.geno_index.begin();
195
for(; it != population.geno_index.end(); it++){
197
double aa = population.unphred_p[geno_indx][0] + log(genotypeProbs[0]);
198
double ab = population.unphred_p[geno_indx][1] + log(genotypeProbs[1]);
199
double bb = population.unphred_p[geno_indx][2] + log(genotypeProbs[2]);
200
double norm = exp(aa) + exp(ab) + exp(bb);
202
double prop = population.unphred_p[geno_indx][*it] + log(genotypeProbs[*it]);
204
loglikelihood += (prop - norm);
209
return loglikelihood;
213
double FullProb(pop & target, pop & back, vector<double>& p)
216
// parameters targetAf backgroundAf targetFis backgroundFis totalAf fst
218
double alpha = ( (1-p[5])/p[5] ) * p[4];
219
double beta = ( (1-p[5])/p[5] ) * (1 - p[4]);
222
double afprior = log( r8_normal_pdf (p[6], 0.1, p[4]));
223
double afpriorT = log( r8_normal_pdf (target.af, 0.05, p[0]));
224
double afpriorB = log( r8_normal_pdf (back.af, 0.05, p[1]));
226
if(std::isinf(afprior) || std::isnan(afprior)){
230
double ptaf = log( r8_beta_pdf(alpha, beta, p[0]) );
231
double pbaf = log( r8_beta_pdf(alpha, beta, p[1]) );
233
if( std::isinf(ptaf) || std::isnan(ptaf) || std::isinf(pbaf) || std::isnan(pbaf) ){
238
double llt = likelihood(target, p[0], p[2]);
239
double llb = likelihood(back, p[1], p[3]);
240
double full = llt + llb + ptaf + pbaf + afprior + afpriorT + afpriorB;
249
void updateParameters(pop & target, pop & background, vector<double>& parameters, int pindx){
251
// parameters targetAf backgroundAf targetFis backgroundFis totalAf fst
253
double origpar = parameters[pindx];
255
double accept = ((double)rand() / (double)(RAND_MAX));
256
double up = ((double)rand() / (double)(RAND_MAX))/10 - 0.05;
257
double updatep = parameters[pindx] + up;
259
// cerr << accept << "\t" << up << endl;
261
if(updatep >= 1 || updatep <= 0){
265
double llB = FullProb(target, background, parameters);
266
parameters[pindx] = updatep;
267
double llT = FullProb(target, background, parameters);
269
if((llT - llB) > accept){
273
parameters[pindx] = origpar;
277
void updateGenotypes(pop & target, pop & background, vector<double>& parameters, int gindex, int tbindex){
279
// tbindex indicates if the subroutine will update the target or background genotype;
281
double accept = ((double)rand() / (double)(RAND_MAX));
282
int newGindex = rand() % 3;
284
//cerr << newGindex << endl;
285
//cerr << "gindex " << gindex << endl;
286
//cerr << "gsize t:" << target.geno_index.size() << endl;
287
//cerr << "gsize b:" << background.geno_index.size() << endl;
291
int oldtindex = target.geno_index[gindex] ;
292
int oldbindex = background.geno_index[gindex] ;
294
double llB = FullProb(target, background, parameters);
298
target.geno_index[gindex] = newGindex;
302
background.geno_index[gindex] = newGindex;
304
double llT = FullProb(target, background, parameters);
306
if((llT - llB) > accept){
311
target.geno_index[gindex] = oldtindex;
314
target.geno_index[gindex] = oldbindex;
320
int cmp(const void *x, const void *y)
322
double xx = *(double*)x, yy = *(double*)y;
323
if (xx < yy) return -1;
324
if (xx > yy) return 1;
329
void loadIndices(map<int, int> & index, string set){
331
vector<string> indviduals = split(set, ",");
333
vector<string>::iterator it = indviduals.begin();
335
for(; it != indviduals.end(); it++){
336
index[ atoi( (*it).c_str() ) ] = 1;
341
int main(int argc, char** argv) {
343
// set the random seed for MCMC
345
srand((unsigned)time(NULL));
349
string filename = "NA";
351
// using vcflib; thanks to Erik Garrison
353
VariantCallFile variantFile ;
355
// zero based index for the target and background indivudals
357
map<int, int> it, ib;
359
// deltaaf is the difference of allele frequency we bother to look at
364
const struct option longopts[] =
366
{"version" , 0, 0, 'v'},
367
{"help" , 0, 0, 'h'},
368
{"file" , 1, 0, 'f'},
369
{"target" , 1, 0, 't'},
370
{"background", 1, 0, 'b'},
371
{"deltaaf" , 1, 0, 'd'},
380
iarg = getopt_long(argc, argv, "d:t:b:f:hv", longopts, &index);
389
cerr << "INFO: help: " << endl << endl;
391
cerr << " bFst is a Bayesian approach to Fst. Importantly bFst account for genotype uncertainty in the model using genotype likelihoods." << endl;
392
cerr << " For a more detailed description see: Holsinger et al. Molecular Ecology Vol 11, issue 7 2002. The likelihood function has been " << endl;
393
cerr << " modified to use genotype likelihoods provided by variant callers. There are five free parameters estimated in the model: each " << endl;
394
cerr << " subpopulation's allele frequency and Fis (fixation index, within each subpopulation), a free parameter for the total population\'s " << endl;
395
cerr << " allele frequency, and Fst. " << endl << endl;
397
cerr << "Output : 11 columns : " << endl;
398
cerr << " 1. Seqid " << endl;
399
cerr << " 2. Position " << endl;
400
cerr << " 3. Observed allele frequency in target. " << endl;
401
cerr << " 4. Estimated allele frequency in target. " << endl;
402
cerr << " 5. Observed allele frequency in background. " << endl;
403
cerr << " 6. Estimated allele frequency in background. " << endl;
404
cerr << " 7. Observed allele frequency combined. " << endl;
405
cerr << " 8. Estimated allele frequency in combined. " << endl;
406
cerr << " 9. ML estimate of Fst (mean) " << endl;
407
cerr << " 10. Lower bound of the 95% credible interval " << endl;
408
cerr << " 11. Upper bound of the 95% credible interval " << endl << endl;
411
cerr << "INFO: usage: bFst --target 0,1,2,3,4,5,6,7 --background 11,12,13,16,17,19,22 --file my.vcf --deltaaf 0.1" << endl;
413
cerr << "INFO: required: t,target -- a zero bases comma separated list of target individuals corrisponding to VCF columns" << endl;
414
cerr << "INFO: required: b,background -- a zero bases comma separated list of background individuals corrisponding to VCF columns" << endl;
415
cerr << "INFO: required: f,file a -- a proper formatted VCF file. the FORMAT field MUST contain \"PL\"" << endl;
416
cerr << "INFO: required: d,deltaaf -- skip sites were the difference in allele frequency is less than deltaaf" << endl;
419
cerr << endl << endl;
427
loadIndices(ib, optarg);
428
cerr << "INFO: There are " << ib.size() << " individuals in the target" << endl;
432
loadIndices(it, optarg);
433
cerr << "INFO: There are " << it.size() << " individuals in the background" << endl;
437
cerr << "INFO: File: " << optarg << endl;
442
cerr << "INFO: difference in allele frequency : " << optarg << endl;
444
daf = atof(deltaaf.c_str());
449
cerr << "FATAL: unknown command line option " << optarg << endl << endl ;
450
cerr << "INFO: please use bFst --help " << endl;
459
cerr << "FATAL: did not specify deltaaf" << endl;
460
cerr << "INFO: please use bFst --help " << endl;
465
if(filename == "NA"){
467
cerr << "FATAL: did not specify VCF file" << endl;
468
cerr << "INFO: please use bFst --help " << endl;
473
variantFile.open(filename);
476
if (!variantFile.is_open()) {
478
cerr << "FATAL: could not open VCF file" << endl;
479
cerr << "INFO: please use bFst --help" << endl;
485
cerr << "FATAL: target not specified or less than two indviduals" << endl;
486
cerr << "INFO: please use bFst --help " << endl;
491
cerr << "FATAL: target not specified or less than two indviduals"<< endl;
492
cerr << "INFO: please use bFst --help " << endl;
496
Variant var(variantFile);
498
vector<string> samples = variantFile.sampleNames;
499
int nsamples = samples.size();
501
while (variantFile.getNextVariant(var)) {
503
// biallelic sites naturally
505
if(var.alt.size() > 1){
510
vector < map< string, vector<string> > > target, background, total;
514
for(int nsamp = 0; nsamp < nsamples; nsamp++){
516
map<string, vector<string> > sample = var.samples[ samples[nsamp]];
518
if(sample["GT"].front() != "./."){
519
if(it.find(index) != it.end() ){
520
target.push_back(sample);
521
total.push_back(sample);
524
if(ib.find(index) != ib.end()){
525
background.push_back(sample);
526
total.push_back(sample);
533
if(target.size() < 2 || background.size() < 2 ){
537
pop popt, popb, popTotal;
543
loadPop(target, popt);
544
loadPop(background, popb);
545
loadPop(total, popTotal);
547
if(popt.af == -1 || popb.af == -1){
550
if(popt.af == 1 && popb.af == 1){
553
if(popt.af == 0 && popb.af == 0){
557
double afdiff = abs(popt.af - popb.af);
564
cerr << "INFO: target has " << popt.questionable.size() << " questionable genotypes " << endl;
565
cerr << "INFO: background has " << popb.questionable.size() << " questionable genotypes " << endl;
567
// Parameters- targetAf backgroundAf targetFis backgroundFis totalAf fst
568
vector<double> parameters;
569
parameters.push_back(popt.af);
570
parameters.push_back(popb.af);
571
parameters.push_back(popt.fis);
572
parameters.push_back(popb.fis);
573
parameters.push_back(popTotal.af);
574
parameters.push_back(0.1);
575
parameters.push_back(popTotal.af);
577
double sums [6] = {0};
578
double fsts [10000] ;
580
for(int i = 0; i < 15000; i++){
582
// update each of j parameters
584
for(int j = 0; j < 6; j++ ){
586
updateParameters(popt, popb, parameters, j);
588
sums[j] += parameters[j];
592
fsts[i - 5000] = parameters[5];
594
for(vector<int>::iterator itt = popt.questionable.begin(); itt != popt.questionable.end(); itt++){
595
updateGenotypes(popt, popb, parameters, (*itt), 0);
598
for(vector<int>::iterator itb = popb.questionable.begin(); itb != popb.questionable.end(); itb++){
599
updateGenotypes(popt, popb, parameters, (*itb) , 1);
603
qsort (fsts, sizeof(fsts)/sizeof(fsts[0]), sizeof(fsts[0]), cmp );
605
double lcredint = fsts[500];
606
double hcredint = fsts[9500];
608
cout << var.sequenceName << "\t" << var.position
610
<< "\t" << sums[0]/10000
612
<< "\t" << sums[1]/10000
613
<< "\t" << popTotal.af
614
<< "\t" << sums[4]/10000
615
<< "\t" << sums[5]/10000