17
using namespace vcflib;
18
void printVersion(void){
19
cerr << "INFO: version 1.0.1 ; date: April 2014 ; author: Zev Kronenberg; email : zev.kronenberg@utah.edu " << endl;
25
cerr << "INFO: help" << endl;
26
cerr << "INFO: description:" << endl;
27
cerr << " plotHaps provides the formatted output that can be used with \'bin/plotHaplotypes.R\'. " << endl;
29
cerr << "Output : haplotype matrix and positions" << endl << endl;
31
cerr << "INFO: plotHaps --target 0,1,2,3,4,5,6,7 --file my.phased.vcf.gz " << endl << endl;
32
cerr << "INFO: required: t,target -- argument: a zero base comma separated list of target individuals corrisponding to VCF column s " << endl;
33
cerr << "INFO: required: r,region -- argument: a tabix compliant genomic range : \"seqid:start-end\" or \"seqid\" " << endl;
34
cerr << "INFO: required: f,file -- argument: proper formatted phased VCF file " << endl;
35
cerr << "INFO: required: y,type -- argument: genotype likelihood format: PL,GP,GP " << endl;
43
void clearHaplotypes(string **haplotypes, int ntarget){
44
for(int i= 0; i < ntarget; i++){
45
haplotypes[i][0].clear();
46
haplotypes[i][1].clear();
50
void loadIndices(map<int, int> & index, string set){
52
vector<string> indviduals = split(set, ",");
53
vector<string>::iterator it = indviduals.begin();
55
for(; it != indviduals.end(); it++){
56
index[ atoi( (*it).c_str() ) ] = 1;
60
void calc(string **haplotypes, int nhaps, vector<double> afs, vector<long int> pos, vector<int> & target, vector<int> & background, string seqid){
62
for(int snp = 0; snp < haplotypes[0][0].length(); snp++){
74
while( ehhA > 0.05 && ehhR > 0.05 ) {
82
if(end == haplotypes[0][0].length() - 1){
86
map<string , int> targetH;
93
for(int i = 0; i < nhaps; i++){
94
targetH[ haplotypes[i][0].substr(start, (end - start)) ]++;
95
targetH[ haplotypes[i][1].substr(start, (end - start)) ]++;
97
for( map<string, int>::iterator th = targetH.begin(); th != targetH.end(); th++){
98
if( (*th).first.substr((end-start)/2, 1) == "1"){
99
sumaT += r8_choose(th->second, 2);
103
sumrT += r8_choose(th->second, 2);
108
ehhA = sumaT / (r8_choose(naltT, 2));
109
ehhR = sumrT / (r8_choose(nrefT, 2));
114
cout << seqid << "\t" << pos[snp] << "\t" << afs[snp] << "\t" << iHSA << "\t" << iHSR << "\t" << iHSA/iHSR << endl;
118
double EHH(string **haplotypes, int nhaps){
120
map<string , int> hapcounts;
122
for(int i = 0; i < nhaps; i++){
123
hapcounts[ haplotypes[i][0] ]++;
124
hapcounts[ haplotypes[i][1] ]++;
130
for( map<string, int>::iterator it = hapcounts.begin(); it != hapcounts.end(); it++){
132
sum += r8_choose(it->second, 2);
135
double max = (sum / r8_choose(nh, 2));
141
void loadPhased(string **haplotypes, genotype * pop, int ntarget){
145
for(vector<string>::iterator ind = pop->gts.begin(); ind != pop->gts.end(); ind++){
147
vector< string > gs = split(g, "|");
148
haplotypes[indIndex][0].append(gs[0]);
149
haplotypes[indIndex][1].append(gs[1]);
154
void printHaplotypes(string **haps, vector<int> target, vector<long int> pos){
155
for(int snp = 0; snp < haps[0][1].length(); snp++){
156
cout << pos[snp] << "\t" ;
157
for(int ind = 0; ind < target.size(); ind++){
158
cout << haps[target[ind]][0].substr(snp , 1) << "\t";
159
cout << haps[target[ind]][1].substr(snp , 1) << "\t";
165
int main(int argc, char** argv) {
167
// set the random seed for MCMC
169
srand((unsigned)time(NULL));
173
string filename = "NA";
175
// set region to scaffold
177
string region = "NA";
179
// using vcflib; thanks to Erik Garrison
181
VariantCallFile variantFile;
183
// zero based index for the target and background indivudals
185
map<int, int> it, ib;
187
// deltaaf is the difference of allele frequency we bother to look at
189
// ancestral state is set to zero by default
200
const struct option longopts[] =
202
{"version" , 0, 0, 'v'},
203
{"help" , 0, 0, 'h'},
204
{"file" , 1, 0, 'f'},
205
{"target" , 1, 0, 't'},
206
{"region" , 1, 0, 'r'},
207
{"type" , 1, 0, 'y'},
216
iarg = getopt_long(argc, argv, "y:r:t:f:hv", longopts, &findex);
228
loadIndices(it, optarg);
229
cerr << "INFO: there are " << it.size() << " individuals in the target" << endl;
230
cerr << "INFO: target ids: " << optarg << endl;
233
cerr << "INFO: file: " << optarg << endl;
237
cerr << "INFO: set seqid region to : " << optarg << endl;
245
map<string, int> okayGenotypeLikelihoods;
246
okayGenotypeLikelihoods["PL"] = 1;
247
okayGenotypeLikelihoods["GL"] = 1;
248
okayGenotypeLikelihoods["GP"] = 1;
249
okayGenotypeLikelihoods["GT"] = 1;
253
cerr << "FATAL: failed to specify genotype likelihood format : PL or GL" << endl;
257
if(okayGenotypeLikelihoods.find(type) == okayGenotypeLikelihoods.end()){
258
cerr << "FATAL: genotype likelihood is incorrectly formatted, only use: PL or GL" << endl;
263
if(filename == "NA"){
264
cerr << "FATAL: did not specify a file" << endl;
269
variantFile.open(filename);
271
if (!variantFile.is_open()) {
272
cerr << "FATAL: could not open VCF for reading" << endl;
278
if(! variantFile.setRegion(region)){
279
cerr <<"FATAL: unable to set region" << endl;
284
cerr << "FATAL: must specifiy a region" << endl;
289
Variant var(variantFile);
291
vector<string> samples = variantFile.sampleNames;
292
int nsamples = samples.size();
294
vector<int> target_h, background_h;
299
for(vector<string>::iterator samp = samples.begin(); samp != samples.end(); samp++){
301
if(it.find(index) != it.end() ){
302
target_h.push_back(indexi);
308
vector<long int> positions;
312
string **haplotypes = new string*[target_h.size()];
313
for (int i = 0; i < target_h.size(); i++) {
314
haplotypes[i] = new string[2];
317
string currentSeqid = "NA";
320
while (variantFile.getNextVariant(var)) {
323
cerr << "FATAL: Found an unphased variant. All genotypes must be phased!" << endl;
327
if(var.alt.size() > 1){
332
vector < map< string, vector<string> > > target, background, total;
336
for(int nsamp = 0; nsamp < nsamples; nsamp++){
338
map<string, vector<string> > sample = var.samples[ samples[nsamp]];
340
if(it.find(sindex) != it.end() ){
341
target.push_back(sample);
346
genotype * populationTarget ;
349
populationTarget = new pl();
352
populationTarget = new gl();
355
populationTarget = new gp();
358
populationTarget = new gt();
361
populationTarget->loadPop(target, var.sequenceName, var.position);
363
positions.push_back(var.position);
364
afs.push_back(populationTarget->af);
365
loadPhased(haplotypes, populationTarget, populationTarget->gts.size());
368
printHaplotypes( haplotypes, target_h, positions);