15
#include "gpatInfo.hpp"
26
std::string filename ;
29
std::string geneticMapFile ;
32
std::map<int, double> geneticMap;
39
using namespace vcflib;
43
cerr << "INFO: help" << endl;
44
cerr << "INFO: description:" << endl;
45
cerr << " meltEHH provides the data to plot EHH curves. " << endl;
48
cerr << "Output : 4 columns : " << endl;
49
cerr << " 1. seqid " << endl;
50
cerr << " 2. position " << endl;
51
cerr << " 3. EHH " << endl;
52
cerr << " 4. ref or alt [0 == ref] " << endl;
54
cerr << "Usage:" << endl;
56
cerr << " meltEHH --target 0,1,2,3,4,5,6,7 --pos 10 --file my.phased.vcf \\" << endl;
57
cerr << " --region chr1:1-1000 > STDOUT 2> STDERR " << endl << endl;
59
cerr << "Params:" << endl;
60
cerr << " required: t,target <STRING> A zero base comma separated list of target" << endl;
61
cerr << " individuals corresponding to VCF columns " << endl;
62
cerr << " required: r,region <STRING> A tabix compliant genomic range " << endl;
63
cerr << " format: \"seqid:start-end\" or \"seqid\" " << endl;
64
cerr << " required: f,file <STRING> Proper formatted and phased VCF. " << endl;
65
cerr << " required: y,type <STRING> Genotype likelihood format: GT,PL,GL,GP " << endl;
66
cerr << " required: p,position <INT> Variant position to melt. " << endl;
67
cerr << " optional: a,af <DOUBLE> Alternative alleles with frequencies less " << endl;
68
cerr << " than [0.05] are skipped. " << endl;
77
bool gDist(int start, int end, double * gd){
79
if(globalOpts.geneticMap.find(start) == globalOpts.geneticMap.end()){
82
if(globalOpts.geneticMap.find(end) == globalOpts.geneticMap.end()){
85
*gd = abs(globalOpts.geneticMap[start] - globalOpts.geneticMap[end]);
89
void loadGeneticMap(int start, int end){
91
if(globalOpts.geneticMapFile.empty()){
92
std::cerr << "WARNING: No genetic map." << std::endl;
93
std::cerr << "WARNING: A constant genetic distance is being used: 0.001." << std::endl;
97
ifstream featureFile (globalOpts.geneticMapFile.c_str());
102
double lastvalue = 0;
104
if(featureFile.is_open()){
106
while(getline(featureFile, line)){
108
vector<string> region = split(line, "\t");
110
if(region.front() != globalOpts.seqid){
111
std::cerr << "WARNING: seqid MisMatch: " << region.front() << " " << globalOpts.seqid << std::endl;
115
int pos = atoi(region[3].c_str()) ;
116
double cm = atof(region[2].c_str()) ;
118
if(lastpos == 0 && start > pos){
123
int diff = abs(pos - lastpos);
124
double vdiff = abs(lastvalue - cm );
125
double chunk = vdiff/double(diff);
127
double running = lastvalue;
129
for(int i = lastpos; i < pos; i++){
130
globalOpts.geneticMap[i] = running;
146
if(globalOpts.geneticMap.size() < 1){
147
std::cerr << "FATAL: Problem loading genetic map" << std::endl;
153
void clearHaplotypes(string **haplotypes, int ntarget){
154
for(int i= 0; i < ntarget; i++){
155
haplotypes[i][0].clear();
156
haplotypes[i][1].clear();
160
void loadIndices(map<int, int> & index, string set){
162
vector<string> indviduals = split(set, ",");
163
vector<string>::iterator it = indviduals.begin();
165
for(; it != indviduals.end(); it++){
166
index[ atoi( (*it).c_str() ) ] = 1;
170
void countHaps(int nhaps, map<string, int> & targetH,
171
string **haplotypes, int start, int end){
173
for(int i = 0; i < nhaps; i++){
175
std::string h1 = haplotypes[i][0].substr(start, (end - start)) ;
176
std::string h2 = haplotypes[i][1].substr(start, (end - start)) ;
178
if(targetH.find(h1) == targetH.end()){
184
if(targetH.find(h2) == targetH.end()){
193
void computeNs(map<string, int> & targetH, int start,
194
int end, double * sumT, char ref, bool dir){
196
for( map<string, int>::iterator th = targetH.begin();
197
th != targetH.end(); th++){
204
// end is extending ; check first base
206
if( th->first[0] == ref){
208
// std::cerr << "count dat: " << th->first << " " << th->second << " " << ref << " " << dir << endl;
211
*sumT += r8_choose(th->second, 2);
215
// start is extending ; check last base
218
int last = th->first.size() -1;
219
if( th->first[last] == ref ){
220
// std::cerr << "count dat:" << th->first << " " << th->second << " " << ref << " " << dir << endl;
223
*sumT += r8_choose(th->second, 2);
229
bool calcEhh(string **haplotypes, int start,
230
int end, char ref, int nhaps,
231
double * ehh, double div, bool dir){
234
map<string , int> refH;
236
countHaps(nhaps, refH, haplotypes, start, end);
237
computeNs(refH, start, end, &sum, ref, dir );
239
double internalEHH = sum / (r8_choose(div, 2));
242
std::cerr << "FATAL: internal error." << std::endl;
251
int integrate(string ** haplotypes,
252
vector<long int> & pos,
266
// controls the substring madness
286
if(!calcEhh(haplotypes,
298
double delta_gDist = 0.001;
301
gDist(pos[end-1], pos[end], &delta_gDist);
304
gDist(pos[start + 1], pos[start], &delta_gDist);
306
*iHH += ((ehh + ehhRT)/2)*delta_gDist;
309
std::cout << pos[end] << "\t" << ehh << "\t" << ref << "\t" << direction << std::endl;
312
std::cout << pos[start] << "\t" << ehh << "\t" << ref << "\t" << direction << std::endl;
323
void calc(string ** haplotypes, int nhaps,
324
vector<double> & afs, vector<long int> & pos,
325
vector<int> & target, vector<int> & background, string seqid){
327
int maxl = haplotypes[0][0].length();
330
for(int snp = 0; snp < maxl; snp++){
332
if(pos[snp] != globalOpts.pos ){
340
map<string , int> refH;
342
countHaps(nhaps, refH, haplotypes, snp, snp+1);
345
double denomP1 = double(refH["0"]);
346
double denomP2 = double(refH["1"]);
351
std::cout << pos[snp] << "\t" << "1" << "\t" << "0" << "\t" << "0" << std::endl;
355
refFail += integrate(haplotypes, pos, true, maxl, snp, '0', nhaps, &ihhR, denomP1);
357
refFail += integrate(haplotypes, pos, false, maxl, snp, '0', nhaps, &ihhR, denomP1);
359
altFail += integrate(haplotypes, pos, true, maxl, snp, '1', nhaps, &ihhA, denomP2);
361
altFail += integrate(haplotypes, pos, false, maxl, snp, '1', nhaps, &ihhA, denomP2);
366
void loadPhased(string **haplotypes, genotype * pop, int ntarget){
370
for(vector<string>::iterator ind = pop->gts.begin(); ind != pop->gts.end(); ind++){
372
vector< string > gs = split(g, "|");
373
haplotypes[indIndex][0].append(gs[0]);
374
haplotypes[indIndex][1].append(gs[1]);
379
int main(int argc, char** argv) {
381
globalOpts.threads = 1 ;
382
globalOpts.af = 0.05;
384
// zero based index for the target and background indivudals
386
map<int, int> it, ib;
388
const struct option longopts[] =
390
{"version" , 0, 0, 'v'},
391
{"help" , 0, 0, 'h'},
392
{"file" , 1, 0, 'f'},
393
{"target" , 1, 0, 't'},
394
{"region" , 1, 0, 'r'},
396
{"type" , 1, 0, 'y'},
397
{"threads" , 1, 0, 'x'},
408
iarg = getopt_long(argc, argv, "a:x:g:y:r:d:t:b:f:p:hv", longopts, &findex);
414
globalOpts.pos = atoi(optarg);
420
globalOpts.af = atof(optarg);
425
globalOpts.threads = atoi(optarg);
430
globalOpts.geneticMapFile = optarg;
445
globalOpts.type = optarg;
450
loadIndices(it, optarg);
451
cerr << "INFO: there are " << it.size() << " individuals in the target" << endl;
452
cerr << "INFO: target ids: " << optarg << endl;
457
cerr << "INFO: file: " << optarg << endl;
458
globalOpts.filename = optarg;
463
cerr << "INFO: set seqid region to : " << optarg << endl;
464
globalOpts.region = optarg;
472
omp_set_num_threads(globalOpts.threads);
474
map<string, int> okayGenotypeLikelihoods;
475
okayGenotypeLikelihoods["PL"] = 1;
476
okayGenotypeLikelihoods["GL"] = 1;
477
okayGenotypeLikelihoods["GP"] = 1;
478
okayGenotypeLikelihoods["GT"] = 1;
481
// add an option for dumping
483
// for(std::map<int, double>::iterator gm = geneticMap.begin(); gm != geneticMap.end(); gm++){
484
// cerr << "pos: " << gm->first << " cm: " << gm->second << endl;
487
if(globalOpts.type.empty()){
488
cerr << "FATAL: failed to specify genotype likelihood format : PL or GL" << endl;
492
if(okayGenotypeLikelihoods.find(globalOpts.type) == okayGenotypeLikelihoods.end()){
493
cerr << "FATAL: genotype likelihood is incorrectly formatted, only use: PL or GL" << endl;
498
if(globalOpts.filename.empty()){
499
cerr << "FATAL: did not specify a file" << endl;
505
cerr << "FATAL: target option is required -- or -- less than two individuals in target\n";
510
// using vcflib; thanksErik
512
VariantCallFile variantFile;
514
variantFile.open(globalOpts.filename);
516
if(globalOpts.region.empty()){
517
cerr << "FATAL: region required" << endl;
520
if(! variantFile.setRegion(globalOpts.region)){
521
cerr <<"FATAL: unable to set region" << endl;
525
if (!variantFile.is_open()) {
529
Variant var( variantFile );
530
vector<int> target_h, background_h;
536
vector<string> samples = variantFile.sampleNames;
537
int nsamples = samples.size();
539
for(vector<string>::iterator samp = samples.begin(); samp != samples.end(); samp++){
541
string sampleName = (*samp);
543
if(it.find(index) != it.end() ){
544
target_h.push_back(indexi);
551
vector<long int> positions;
555
string **haplotypes = new string*[target_h.size()];
556
for (int i = 0; i < target_h.size(); i++) {
557
haplotypes[i] = new string[2];
561
while (variantFile.getNextVariant(var)) {
563
globalOpts.seqid = var.sequenceName;
566
cerr << "FATAL: Found an unphased variant. All genotypes must be phased!" << endl;
570
if(var.alleles.size() > 2){
574
vector < map< string, vector<string> > > target, background, total;
578
for(int nsamp = 0; nsamp < nsamples; nsamp++){
580
map<string, vector<string> > sample = var.samples[ samples[nsamp]];
582
if(it.find(sindex) != it.end() ){
583
target.push_back(sample);
588
genotype * populationTarget ;
590
if(globalOpts.type == "PL"){
591
populationTarget = new pl();
593
if(globalOpts.type == "GL"){
594
populationTarget = new gl();
596
if(globalOpts.type == "GP"){
597
populationTarget = new gp();
599
if(globalOpts.type == "GT"){
600
populationTarget = new gt();
603
populationTarget->loadPop(target, var.sequenceName, var.position);
605
if(populationTarget->af <= globalOpts.af
606
|| populationTarget->nref < 2
607
|| populationTarget->nalt < 2){
608
delete populationTarget;
611
positions.push_back(var.position);
612
afs.push_back(populationTarget->af);
613
loadPhased(haplotypes, populationTarget, populationTarget->gts.size());
615
populationTarget = NULL;
616
delete populationTarget;
619
if(!globalOpts.geneticMapFile.empty()){
620
cerr << "INFO: loading genetics map" << endl;
621
loadGeneticMap(positions.front(), positions.back());
622
cerr << "INFO: finished loading genetics map" << endl;
625
calc(haplotypes, target_h.size(), afs, positions,
626
target_h, background_h, globalOpts.seqid);
627
clearHaplotypes(haplotypes, target_h.size());