8
using namespace vcflib;
10
// TODO fix this for multi-allelic!!!!
11
string genotypeSpec(map<int, int>& genotype) {
13
if (isNull(genotype)) {
15
} else if (isHom(genotype)) {
16
if (hasNonRef(genotype)) {
27
int main(int argc, char** argv) {
30
cerr << "usage: " << argv[0] << " <other-genotype-tag> <vcf file>" << endl
31
<< "adds statistics to the INFO field of the vcf file describing the" << endl
32
<< "amount of discrepancy between the genotypes (GT) in the vcf file and the" << endl
33
<< "genotypes reported in the <other-genotype-tag>. use this after" << endl
34
<< "vcfannotategenotypes to get correspondence statistics for two vcfs." << endl;
38
string otherGenoTag = argv[1];
39
string filename = argv[2];
41
VariantCallFile variantFile;
42
if (filename == "-") {
43
variantFile.open(std::cin);
45
variantFile.open(filename);
48
if (!variantFile.is_open()) {
54
specs.push_back("AA_AA");
55
specs.push_back("AA_AR");
56
specs.push_back("AA_RR");
57
specs.push_back("AA_NN");
59
specs.push_back("AR_AA");
60
specs.push_back("AR_AR");
61
specs.push_back("AR_RR");
62
specs.push_back("AR_NN");
64
specs.push_back("RR_AA");
65
specs.push_back("RR_AR");
66
specs.push_back("RR_RR");
67
specs.push_back("RR_NN");
69
specs.push_back("NN_AA");
70
specs.push_back("NN_AR");
71
specs.push_back("NN_RR");
72
specs.push_back("NN_NN");
75
for (vector<string>::iterator spec = specs.begin(); spec != specs.end(); ++spec) {
76
string line = "##INFO=<ID=" + otherGenoTag + ".genotypes." + *spec
77
+ ",Number=1,Type=Integer,Description=\"Number of genotypes with "
78
+ *spec + " relationship with " + otherGenoTag + "\">";
79
variantFile.addHeaderLine(line);
84
line = "##INFO=<ID=" + otherGenoTag + ".genotypes.count,Number=1,Type=Integer,Description=\"Count of genotypes under comparison.\">";
85
variantFile.addHeaderLine(line);
87
line = "##INFO=<ID=" + otherGenoTag + ".genotypes.alternate_count,Number=1,Type=Integer,Description=\"Count of alternate genotypes in the first file.\">";
88
variantFile.addHeaderLine(line);
90
line = "##INFO=<ID=" + otherGenoTag
91
+ ".site.alternate_positive_discrepancy,Number=1,Type=Integer,Description=\"Estimated positive discrepancy rate of "
92
+ otherGenoTag + " genotypes, where positive discrepancies are all cases where an alternate allele is called GT "
93
+ " but none is represented in " + otherGenoTag + " or " + otherGenoTag + " is null/no-call\">";
94
variantFile.addHeaderLine(line);
96
line = "##INFO=<ID=" + otherGenoTag
97
+ ".site.alternate_negative_discrepancy,Number=1,Type=Integer,Description=\"Estimated negative discrepancy rate of "
98
+ otherGenoTag + " genotypes, where negative discrepancies are all cases where no alternate allele is called in "
99
+ " GT but an alternate is represented in " + otherGenoTag + ", including no-calls or partly null genotypes\">";
100
variantFile.addHeaderLine(line);
102
line = "##INFO=<ID=" + otherGenoTag
103
+ ".site.alternate_null_discrepancy,Number=1,Type=Integer,Description=\"Estimated null discrepancy rate of "
104
+ otherGenoTag + " genotypes, where null discrepancies are all cases where GT is specified and contains an alternate but "
105
+ otherGenoTag + " is null. Cases where GT is null or partly null are excluded.\">";
106
variantFile.addHeaderLine(line);
108
line = "##INFO=<ID=" + otherGenoTag
109
+ ".site.call_discrepancy,Number=1,Type=Integer,Description=\"Estimated call discrepancy rate of "
110
+ otherGenoTag + " genotypes (het->hom, hom->het) between " + otherGenoTag + " and GT\">";
111
variantFile.addHeaderLine(line);
113
line = "##INFO=<ID=" + otherGenoTag
114
+ ".site.call_concordance,Number=1,Type=Integer,Description=\"Estimated call concorndance rate of "
115
+ otherGenoTag + " genotypes between " + otherGenoTag + " and GT\">";
116
variantFile.addHeaderLine(line);
118
line = "##INFO=<ID=" + otherGenoTag
119
+ ".site.non_reference_discrepancy,Number=1,Type=Float,Description=\"Estimated non-reference discrepancy relative to "
120
+ otherGenoTag + " genotypes,\">";
121
variantFile.addHeaderLine(line);
123
line = "##INFO=<ID=" + otherGenoTag
124
+ ".site.non_reference_discrepancy.count,Number=1,Type=Int,Description=\"non-reference discrepancy normalizer relative to "
125
+ otherGenoTag + " genotypes,\">";
126
variantFile.addHeaderLine(line);
128
line = "##INFO=<ID=" + otherGenoTag
129
+ ".site.non_reference_discrepancy.normalizer,Number=1,Type=Int,Description=\"non-reference discrepancy count relative to "
130
+ otherGenoTag + " genotypes,\">";
131
variantFile.addHeaderLine(line);
133
line = "##INFO=<ID=" + otherGenoTag
134
+ ".site.non_reference_sensitivity,Number=1,Type=Float,Description=\"Estimated non-reference sensitivity relative to "
135
+ otherGenoTag + " genotypes,\">";
136
variantFile.addHeaderLine(line);
138
line = "##INFO=<ID=" + otherGenoTag
139
+ ".site.non_reference_sensitivity.count,Number=1,Type=Int,Description=\"non-reference sensitivity normalizer relative to "
140
+ otherGenoTag + " genotypes,\">";
141
variantFile.addHeaderLine(line);
143
line = "##INFO=<ID=" + otherGenoTag
144
+ ".site.non_reference_sensitivity.normalizer,Number=1,Type=Int,Description=\"non-reference sensitivity count relative to "
145
+ otherGenoTag + " genotypes,\">";
146
variantFile.addHeaderLine(line);
148
cout << variantFile.header << endl;
150
Variant var(variantFile);
152
while (variantFile.getNextVariant(var)) {
154
//cout << "next: " << var << endl;
155
// for each sample, check GT against <other-genotype-tag>
156
// tally stats, and append to info
157
map<string, map<string, vector<string> > >::iterator s = var.samples.begin();
158
map<string, map<string, vector<string> > >::iterator sEnd = var.samples.end();
160
map<string, int> genotypeComparisonCounts;
161
int gtCount = var.samples.size();
162
int gtAltCount = 0; // number of alternate-containing genotypes in the first file
163
int pdCount = 0; // positive discrepancy count
164
int ndCount = 0; // negative discrepancy count
165
int nnCount = 0; // null discrepancy count
166
int cdCount = 0; // call discrepancy count
167
int ccCount = 0; // call concordance count
168
int nrdCount = 0; // non-reference discrepancy count
169
int nrdNormalizer = 0; // divisor for nrd rate
170
int nrsCount = 0; // non-reference sensitivity count
171
int nrsNormalizer = 0; // divisor for nrs rate
173
for (; s != sEnd; ++s) {
174
map<string, vector<string> >& sample = s->second;
175
const string& name = s->first;
177
// decompose genotypes into counts of strings
178
// to facilitate comparison
181
if (sample.find("GT") == sample.end()) {
184
gtA = sample["GT"].front();
188
if (sample.find(otherGenoTag) == sample.end()) {
191
gtB = sample[otherGenoTag].front();
195
map<int, int> genotypeA = decomposeGenotype(gtA);
196
map<int, int> genotypeB = decomposeGenotype(gtB);
198
string gtspecA = genotypeSpec(genotypeA);
199
string gtspecB = genotypeSpec(genotypeB);
200
//cout << gtA << " " << gtB << endl;
201
//cout << gtspecA << " " << gtspecB << endl;
202
++genotypeComparisonCounts[gtspecA + "_" + gtspecB];
204
if (hasNonRef(genotypeA)) {
208
if (genotypeA != genotypeB) {
209
if (isNull(genotypeA)) {
210
// TODO handle this somehow, maybe via a different flag?
211
if (!isNull(genotypeB)) {
212
++nnCount; // null discrepancy, the second set makes a call, this one does not
214
} else if (hasNonRef(genotypeA)) {
215
if (!isNull(genotypeB) && hasNonRef(genotypeB)) { // they cannot be the same, but they both represent an alternate
216
++cdCount; // the calls are discrepant
217
} else { // the other call does not have an alternate
220
if (isNull(genotypeB)) {
224
} else { // the current genotype has no non-ref alternate
225
if (!isNull(genotypeB) && hasNonRef(genotypeB)) {
228
if (isNull(genotypeB)) {
233
if (!isNull(genotypeA)) {
239
if (!(isNull(genotypeA) || isNull(genotypeB))
240
&& !(isHomRef(genotypeA) && isHomRef(genotypeB))) {
242
if (genotypeA != genotypeB) {
247
if (!(isNull(genotypeB) || isHomRef(genotypeB))) {
249
if (!(isNull(genotypeA) || isHomRef(genotypeA))) {
256
for (map<string, int>::iterator g = genotypeComparisonCounts.begin();
257
g != genotypeComparisonCounts.end(); ++g) {
260
vector<string>& t = var.info[otherGenoTag + ".genotypes." + g->first];
261
t.clear(); t.push_back(c.str());
266
var.info[otherGenoTag + ".genotypes.count"].push_back(gtc.str());
270
var.info[otherGenoTag + ".genotypes.alternate_count"].push_back(gtac.str());
274
var.info[otherGenoTag + ".site.alternate_positive_discrepancy"].push_back(pd.str());
278
var.info[otherGenoTag + ".site.alternate_negative_discrepancy"].push_back(nd.str());
282
var.info[otherGenoTag + ".site.alternate_null_discrepancy"].push_back(nn.str());
286
var.info[otherGenoTag + ".site.call_discrepancy"].push_back(cd.str());
290
var.info[otherGenoTag + ".site.call_concordance"].push_back(cc.str());
294
var.info[otherGenoTag + ".site.non_reference_discrepancy.count"].push_back(nrdc.str());
297
nrdn << nrdNormalizer;
298
var.info[otherGenoTag + ".site.non_reference_discrepancy.normalizer"].push_back(nrdn.str());
300
if (nrdNormalizer > 0) {
302
nrd << (double) nrdCount / (double) nrdNormalizer;
303
var.info[otherGenoTag + ".site.non_reference_discrepancy"].push_back(nrd.str());
308
var.info[otherGenoTag + ".site.non_reference_sensitivity.count"].push_back(nrsc.str());
311
nrsn << nrsNormalizer;
312
var.info[otherGenoTag + ".site.non_reference_sensitivity.normalizer"].push_back(nrsn.str());
314
if (nrsNormalizer > 0) {
316
nrs << (double) nrsCount / (double) nrsNormalizer;
317
var.info[otherGenoTag + ".site.non_reference_sensitivity"].push_back(nrs.str());