15
#include "gpatInfo.hpp"
18
using namespace vcflib;
24
cerr << "INFO: help" << endl;
25
cerr << "INFO: description:" << endl;
26
cerr << " wcFst is Weir & Cockerham's Fst for two populations. Negative values are VALID, " << endl;
27
cerr << " they are sites which can be treated as zero Fst. For more information see Evolution, Vol. 38 N. 6 Nov 1984. " << endl;
28
cerr << " Specifically wcFst uses equations 1,2,3,4. " << endl << endl;
30
cerr << "Output : 3 columns : " << endl;
31
cerr << " 1. seqid " << endl;
32
cerr << " 2. position " << endl;
33
cerr << " 3. target allele frequency " << endl;
34
cerr << " 4. background allele frequency " << endl;
35
cerr << " 5. wcFst " << endl << endl;
37
cerr << "INFO: usage: wcFst --target 0,1,2,3,4,5,6,7 --background 11,12,13,16,17,19,22 --file my.vcf --deltaaf 0.1 --type PL " << endl;
39
cerr << "INFO: required: t,target -- argument: a zero based comma separated list of target individuals corrisponding to VCF columns " << endl;
40
cerr << "INFO: required: b,background -- argument: a zero based comma separated list of background individuals corrisponding to VCF columns " << endl;
41
cerr << "INFO: required: f,file -- argument: proper formatted VCF " << endl;
42
cerr << "INFO: required, y,type -- argument: genotype likelihood format; genotype : GT,GL,PL,GP " << endl;
43
cerr << "INFO: optional: r,region -- argument: a tabix compliant genomic range: seqid or seqid:start-end " << endl;
44
cerr << "INFO: optional: d,deltaaf -- argument: skip sites where the difference in allele frequencies is less than deltaaf, default is zero " << endl;
50
double bound(double v){
60
void loadIndices(map<int, int> & index, string set){
62
vector<string> indviduals = split(set, ",");
64
vector<string>::iterator it = indviduals.begin();
66
for(; it != indviduals.end(); it++){
67
index[ atoi( (*it).c_str() ) ] = 1;
72
int main(int argc, char** argv) {
74
// set the random seed for MCMC
76
srand((unsigned)time(NULL));
80
string filename = "NA";
82
// set region to scaffold
86
// using vcflib; thanks to Erik Garrison
88
VariantCallFile variantFile;
90
// zero based index for the target and background indivudals
94
// deltaaf is the difference of allele frequency we bother to look at
99
// genotype likelihood format
104
const struct option longopts[] =
106
{"version" , 0, 0, 'v'},
107
{"help" , 0, 0, 'h'},
108
{"file" , 1, 0, 'f'},
109
{"target" , 1, 0, 't'},
110
{"background", 1, 0, 'b'},
111
{"deltaaf" , 1, 0, 'd'},
112
{"type" , 1, 0, 'y'},
113
{"region" , 1, 0, 'r'},
122
iarg = getopt_long(argc, argv, "y:r:d:t:b:f:chv", longopts, &index);
133
loadIndices(it, optarg);
134
cerr << "INFO: there are " << it.size() << " individuals in the target" << endl;
135
cerr << "INFO: target ids: " << optarg << endl;
138
loadIndices(ib, optarg);
139
cerr << "INFO: there are " << ib.size() << " individuals in the background" << endl;
140
cerr << "INFO: background ids: " << optarg << endl;
143
cerr << "INFO: file: " << optarg << endl;
147
cerr << "INFO: only scoring sites where the allele frequency difference is greater than: " << optarg << endl;
149
daf = atof(deltaaf.c_str());
153
cerr << "INFO: setting genotype likelihood format to: " << type << endl;
156
cerr << "INFO: set seqid region to : " << optarg << endl;
164
if(filename == "NA"){
165
cerr << "FATAL: did not specify a required option: file" << endl;
170
variantFile.open(filename);
173
if(! variantFile.setRegion(region)){
174
cerr << "FATAL: unable to set region" << endl;
179
if (!variantFile.is_open()) {
180
cerr << "FATAL: could not open VCF for reading" << endl;
185
map<string, int> okayGenotypeLikelihoods;
186
okayGenotypeLikelihoods["PL"] = 1;
187
okayGenotypeLikelihoods["GL"] = 1;
188
okayGenotypeLikelihoods["GP"] = 1;
189
okayGenotypeLikelihoods["GT"] = 1;
192
cerr << "FATAL: failed to specify genotype likelihood format : PL or GL" << endl;
196
if(okayGenotypeLikelihoods.find(type) == okayGenotypeLikelihoods.end()){
197
cerr << "FATAL: genotype likelihood is incorrectly formatted, only use: PL or GL" << endl;
202
Variant var(variantFile);
204
vector<string> samples = variantFile.sampleNames;
205
int nsamples = samples.size();
207
while (variantFile.getNextVariant(var)) {
209
// biallelic sites naturally
211
if(var.alt.size() > 1){
215
vector < map< string, vector<string> > > target, background, total;
219
for(int nsamp = 0; nsamp < nsamples; nsamp++){
221
map<string, vector<string> > sample = var.samples[ samples[nsamp]];
223
if(sample["GT"].front() != "./."){
224
if(it.find(index) != it.end() ){
225
target.push_back(sample);
227
if(ib.find(index) != ib.end()){
228
background.push_back(sample);
235
if(target.size() < 5 || background.size() < 5){
239
genotype * populationTarget ;
240
genotype * populationBackground ;
243
populationTarget = new pl();
244
populationBackground = new pl();
247
populationTarget = new gl();
248
populationBackground = new gl();
251
populationTarget = new gp();
252
populationBackground = new gp();
255
populationTarget = new gt();
256
populationBackground = new gt();
259
populationTarget->loadPop(target, var.sequenceName, var.position);
260
populationBackground->loadPop(background, var.sequenceName, var.position);
262
if(populationTarget->af == -1 || populationBackground->af == -1){
263
delete populationTarget;
264
delete populationBackground;
267
if(populationTarget->af == 1 && populationBackground->af == 1){
268
delete populationTarget;
269
delete populationBackground;
272
if(populationTarget->af == 0 && populationBackground->af == 0){
273
delete populationTarget;
274
delete populationBackground;
278
double afdiff = abs(populationTarget->af - populationBackground->af);
281
delete populationTarget;
282
delete populationBackground;
286
// pg 1360 B.S Weir and C.C. Cockerham 1984
287
double nbar = ( populationTarget->ngeno / 2 ) + (populationBackground->ngeno / 2);
290
// special case of only two populations
292
nc -= (pow(populationTarget->ngeno,2)/rn);
293
nc -= (pow(populationBackground->ngeno,2)/rn);
294
// average sample frequency
295
double pbar = (populationTarget->af + populationBackground->af) / 2;
297
// sample variance of allele A frequences over the population
300
s2 += ((populationTarget->ngeno * pow(populationTarget->af - pbar, 2))/nbar);
301
s2 += ((populationBackground->ngeno * pow(populationBackground->af - pbar, 2))/nbar);
303
// average heterozygosity
305
double hbar = (populationTarget->hfrq + populationBackground->hfrq) / 2;
308
double pvar = pbar * (1 - pbar);
312
double avar1 = nbar / nc;
313
double avar2 = 1 / (nbar -1) ;
314
double avar3 = pvar - (0.5*s2) - (0.25*hbar);
315
double avar = avar1 * (s2 - (avar2 * avar3));
317
double bvar1 = nbar / (nbar - 1);
318
double bvar2 = pvar - (0.5*s2) - (((2*nbar -1)/(4*nbar))*hbar);
319
double bvar = bvar1 * bvar2;
321
double cvar = 0.5*hbar;
323
double fst = avar / (avar+bvar+cvar);
325
cout << var.sequenceName << "\t" << var.position << "\t" << populationTarget->af << "\t" << populationBackground->af << "\t" << fst << endl ;
327
delete populationTarget;
328
delete populationBackground;