~ubuntu-branches/debian/sid/libvcflib/sid

« back to all changes in this revision

Viewing changes to src/vcfgenotypecompare.cpp

  • Committer: Package Import Robot
  • Author(s): Andreas Tille
  • Date: 2016-09-16 15:52:29 UTC
  • Revision ID: package-import@ubuntu.com-20160916155229-24mxrntfylvsshsg
Tags: upstream-1.0.0~rc1+dfsg
ImportĀ upstreamĀ versionĀ 1.0.0~rc1+dfsg

Show diffs side-by-side

added added

removed removed

Lines of Context:
 
1
#include "Variant.h"
 
2
#include "split.h"
 
3
#include <string>
 
4
#include <iostream>
 
5
#include <sstream>
 
6
 
 
7
using namespace std;
 
8
using namespace vcflib;
 
9
 
 
10
// TODO fix this for multi-allelic!!!!
 
11
string genotypeSpec(map<int, int>& genotype) {
 
12
    string gtspec;
 
13
    if (isNull(genotype)) {
 
14
        gtspec = "NN";
 
15
    } else if (isHom(genotype)) {
 
16
        if (hasNonRef(genotype)) {
 
17
            gtspec = "AA";
 
18
        } else {
 
19
            gtspec = "RR";
 
20
        }
 
21
    } else {
 
22
        gtspec = "AR";
 
23
    }
 
24
    return gtspec;
 
25
}
 
26
 
 
27
int main(int argc, char** argv) {
 
28
 
 
29
    if (argc != 3) {
 
30
        cerr << "usage: " << argv[0] << " <other-genotype-tag> <vcf file>" << endl
 
31
             << "adds statistics to the INFO field of the vcf file describing the" << endl
 
32
             << "amount of discrepancy between the genotypes (GT) in the vcf file and the" << endl
 
33
             << "genotypes reported in the <other-genotype-tag>.  use this after" << endl
 
34
             << "vcfannotategenotypes to get correspondence statistics for two vcfs." << endl;
 
35
        return 1;
 
36
    }
 
37
 
 
38
    string otherGenoTag = argv[1];
 
39
    string filename = argv[2];
 
40
 
 
41
    VariantCallFile variantFile;
 
42
    if (filename == "-") {
 
43
        variantFile.open(std::cin);
 
44
    } else {
 
45
        variantFile.open(filename);
 
46
    }
 
47
 
 
48
    if (!variantFile.is_open()) {
 
49
        return 1;
 
50
    }
 
51
 
 
52
    vector<string> specs;
 
53
 
 
54
    specs.push_back("AA_AA");
 
55
    specs.push_back("AA_AR");
 
56
    specs.push_back("AA_RR");
 
57
    specs.push_back("AA_NN");
 
58
 
 
59
    specs.push_back("AR_AA");
 
60
    specs.push_back("AR_AR");
 
61
    specs.push_back("AR_RR");
 
62
    specs.push_back("AR_NN");
 
63
 
 
64
    specs.push_back("RR_AA");
 
65
    specs.push_back("RR_AR");
 
66
    specs.push_back("RR_RR");
 
67
    specs.push_back("RR_NN");
 
68
 
 
69
    specs.push_back("NN_AA");
 
70
    specs.push_back("NN_AR");
 
71
    specs.push_back("NN_RR");
 
72
    specs.push_back("NN_NN");
 
73
 
 
74
 
 
75
    for (vector<string>::iterator spec = specs.begin(); spec != specs.end(); ++spec) {
 
76
        string line = "##INFO=<ID=" + otherGenoTag + ".genotypes." + *spec
 
77
            + ",Number=1,Type=Integer,Description=\"Number of genotypes with "
 
78
            + *spec + " relationship with " + otherGenoTag + "\">";
 
79
        variantFile.addHeaderLine(line);
 
80
    }
 
81
 
 
82
    string line;
 
83
 
 
84
    line = "##INFO=<ID=" + otherGenoTag + ".genotypes.count,Number=1,Type=Integer,Description=\"Count of genotypes under comparison.\">";
 
85
    variantFile.addHeaderLine(line);
 
86
 
 
87
    line = "##INFO=<ID=" + otherGenoTag + ".genotypes.alternate_count,Number=1,Type=Integer,Description=\"Count of alternate genotypes in the first file.\">";
 
88
    variantFile.addHeaderLine(line);
 
89
 
 
90
    line = "##INFO=<ID=" + otherGenoTag
 
91
        + ".site.alternate_positive_discrepancy,Number=1,Type=Integer,Description=\"Estimated positive discrepancy rate of "
 
92
        + otherGenoTag + " genotypes, where positive discrepancies are all cases where an alternate allele is called GT "
 
93
        + " but none is represented in " + otherGenoTag + " or " + otherGenoTag + " is null/no-call\">";
 
94
    variantFile.addHeaderLine(line);
 
95
 
 
96
    line = "##INFO=<ID=" + otherGenoTag
 
97
        + ".site.alternate_negative_discrepancy,Number=1,Type=Integer,Description=\"Estimated negative discrepancy rate of "
 
98
        + otherGenoTag + " genotypes, where negative discrepancies are all cases where no alternate allele is called in "
 
99
        + " GT but an alternate is represented in " + otherGenoTag + ", including no-calls or partly null genotypes\">";
 
100
    variantFile.addHeaderLine(line);
 
101
 
 
102
    line = "##INFO=<ID=" + otherGenoTag
 
103
        + ".site.alternate_null_discrepancy,Number=1,Type=Integer,Description=\"Estimated null discrepancy rate of "
 
104
        + otherGenoTag + " genotypes, where null discrepancies are all cases where GT is specified and contains an alternate but "
 
105
        + otherGenoTag + " is null.  Cases where GT is null or partly null are excluded.\">";
 
106
    variantFile.addHeaderLine(line);
 
107
 
 
108
    line = "##INFO=<ID=" + otherGenoTag
 
109
        + ".site.call_discrepancy,Number=1,Type=Integer,Description=\"Estimated call discrepancy rate of "
 
110
        + otherGenoTag + " genotypes (het->hom, hom->het) between " + otherGenoTag + " and GT\">";
 
111
    variantFile.addHeaderLine(line);
 
112
 
 
113
    line = "##INFO=<ID=" + otherGenoTag
 
114
        + ".site.call_concordance,Number=1,Type=Integer,Description=\"Estimated call concorndance rate of "
 
115
        + otherGenoTag + " genotypes between " + otherGenoTag + " and GT\">";
 
116
    variantFile.addHeaderLine(line);
 
117
 
 
118
    line = "##INFO=<ID=" + otherGenoTag
 
119
        + ".site.non_reference_discrepancy,Number=1,Type=Float,Description=\"Estimated non-reference discrepancy relative to "
 
120
        + otherGenoTag + " genotypes,\">";
 
121
    variantFile.addHeaderLine(line);
 
122
 
 
123
    line = "##INFO=<ID=" + otherGenoTag
 
124
        + ".site.non_reference_discrepancy.count,Number=1,Type=Int,Description=\"non-reference discrepancy normalizer relative to "
 
125
        + otherGenoTag + " genotypes,\">";
 
126
    variantFile.addHeaderLine(line);
 
127
 
 
128
    line = "##INFO=<ID=" + otherGenoTag
 
129
        + ".site.non_reference_discrepancy.normalizer,Number=1,Type=Int,Description=\"non-reference discrepancy count relative to "
 
130
        + otherGenoTag + " genotypes,\">";
 
131
    variantFile.addHeaderLine(line);
 
132
 
 
133
    line = "##INFO=<ID=" + otherGenoTag
 
134
        + ".site.non_reference_sensitivity,Number=1,Type=Float,Description=\"Estimated non-reference sensitivity relative to "
 
135
        + otherGenoTag + " genotypes,\">";
 
136
    variantFile.addHeaderLine(line);
 
137
 
 
138
    line = "##INFO=<ID=" + otherGenoTag
 
139
        + ".site.non_reference_sensitivity.count,Number=1,Type=Int,Description=\"non-reference sensitivity normalizer relative to "
 
140
        + otherGenoTag + " genotypes,\">";
 
141
    variantFile.addHeaderLine(line);
 
142
 
 
143
    line = "##INFO=<ID=" + otherGenoTag
 
144
        + ".site.non_reference_sensitivity.normalizer,Number=1,Type=Int,Description=\"non-reference sensitivity count relative to "
 
145
        + otherGenoTag + " genotypes,\">";
 
146
    variantFile.addHeaderLine(line);
 
147
 
 
148
    cout << variantFile.header << endl;
 
149
 
 
150
    Variant var(variantFile);
 
151
 
 
152
    while (variantFile.getNextVariant(var)) {
 
153
 
 
154
        //cout << "next: " << var << endl;
 
155
        // for each sample, check GT against <other-genotype-tag>
 
156
        // tally stats, and append to info
 
157
        map<string, map<string, vector<string> > >::iterator s     = var.samples.begin();
 
158
        map<string, map<string, vector<string> > >::iterator sEnd  = var.samples.end();
 
159
 
 
160
        map<string, int> genotypeComparisonCounts;
 
161
        int gtCount = var.samples.size();
 
162
        int gtAltCount = 0; // number of alternate-containing genotypes in the first file
 
163
        int pdCount = 0; // positive discrepancy count
 
164
        int ndCount = 0; // negative discrepancy count
 
165
        int nnCount = 0; // null discrepancy count
 
166
        int cdCount = 0; // call discrepancy count
 
167
        int ccCount = 0; // call concordance count
 
168
        int nrdCount = 0; // non-reference discrepancy count
 
169
        int nrdNormalizer = 0; // divisor for nrd rate
 
170
        int nrsCount = 0; // non-reference sensitivity count
 
171
        int nrsNormalizer = 0; // divisor for nrs rate
 
172
 
 
173
        for (; s != sEnd; ++s) {
 
174
            map<string, vector<string> >& sample = s->second;
 
175
            const string& name = s->first;
 
176
 
 
177
            // decompose genotypes into counts of strings
 
178
            // to facilitate comparison
 
179
 
 
180
            string gtA;
 
181
            if (sample.find("GT") == sample.end()) {
 
182
                gtA = "./.";
 
183
            } else {
 
184
                gtA = sample["GT"].front();
 
185
            }
 
186
 
 
187
            string gtB;
 
188
            if (sample.find(otherGenoTag) == sample.end()) {
 
189
                gtB = "./.";
 
190
            } else {
 
191
                gtB = sample[otherGenoTag].front();
 
192
            }
 
193
 
 
194
 
 
195
            map<int, int> genotypeA = decomposeGenotype(gtA);
 
196
            map<int, int> genotypeB = decomposeGenotype(gtB);
 
197
 
 
198
            string gtspecA = genotypeSpec(genotypeA);
 
199
            string gtspecB = genotypeSpec(genotypeB);
 
200
            //cout << gtA << " " << gtB << endl;
 
201
            //cout << gtspecA << " " << gtspecB << endl;
 
202
            ++genotypeComparisonCounts[gtspecA + "_" + gtspecB];
 
203
 
 
204
            if (hasNonRef(genotypeA)) {
 
205
                ++gtAltCount;
 
206
            }
 
207
 
 
208
            if (genotypeA != genotypeB) {
 
209
                if (isNull(genotypeA)) {
 
210
                    // TODO handle this somehow, maybe via a different flag?
 
211
                    if (!isNull(genotypeB)) {
 
212
                        ++nnCount;  // null discrepancy, the second set makes a call, this one does not
 
213
                    }
 
214
                } else if (hasNonRef(genotypeA)) {
 
215
                    if (!isNull(genotypeB) && hasNonRef(genotypeB)) { // they cannot be the same, but they both represent an alternate
 
216
                        ++cdCount;  // the calls are discrepant
 
217
                    } else { // the other call does not have an alternate
 
218
                        ++pdCount;
 
219
                        // it is also null
 
220
                        if (isNull(genotypeB)) {
 
221
                            ++nnCount;
 
222
                        }
 
223
                    }
 
224
                } else { // the current genotype has no non-ref alternate
 
225
                    if (!isNull(genotypeB) && hasNonRef(genotypeB)) {
 
226
                        ++ndCount;
 
227
                    }
 
228
                    if (isNull(genotypeB)) {
 
229
                        ++nnCount;
 
230
                    }
 
231
                }
 
232
            } else {
 
233
                if (!isNull(genotypeA)) {
 
234
                    ++ccCount;
 
235
                }
 
236
            }
 
237
 
 
238
 
 
239
            if (!(isNull(genotypeA) || isNull(genotypeB))
 
240
                    && !(isHomRef(genotypeA) && isHomRef(genotypeB))) {
 
241
                ++nrdNormalizer;
 
242
                if (genotypeA != genotypeB) {
 
243
                    ++nrdCount;
 
244
                }
 
245
            }
 
246
 
 
247
            if (!(isNull(genotypeB) || isHomRef(genotypeB))) {
 
248
                ++nrsNormalizer;
 
249
                if (!(isNull(genotypeA) || isHomRef(genotypeA))) {
 
250
                    ++nrsCount;
 
251
                }
 
252
            }
 
253
 
 
254
        }
 
255
 
 
256
        for (map<string, int>::iterator g = genotypeComparisonCounts.begin();
 
257
                g != genotypeComparisonCounts.end(); ++g) {
 
258
            stringstream c;
 
259
            c << g->second;
 
260
            vector<string>& t = var.info[otherGenoTag + ".genotypes." + g->first];
 
261
            t.clear(); t.push_back(c.str());
 
262
        }
 
263
 
 
264
        stringstream gtc;
 
265
        gtc << gtCount;
 
266
        var.info[otherGenoTag + ".genotypes.count"].push_back(gtc.str());
 
267
 
 
268
        stringstream gtac;
 
269
        gtac << gtAltCount;
 
270
        var.info[otherGenoTag + ".genotypes.alternate_count"].push_back(gtac.str());
 
271
 
 
272
        stringstream pd;
 
273
        pd << pdCount;
 
274
        var.info[otherGenoTag + ".site.alternate_positive_discrepancy"].push_back(pd.str());
 
275
 
 
276
        stringstream nd;
 
277
        nd << ndCount;
 
278
        var.info[otherGenoTag + ".site.alternate_negative_discrepancy"].push_back(nd.str());
 
279
 
 
280
        stringstream nn;
 
281
        nn << nnCount;
 
282
        var.info[otherGenoTag + ".site.alternate_null_discrepancy"].push_back(nn.str());
 
283
 
 
284
        stringstream cd;
 
285
        cd << cdCount;
 
286
        var.info[otherGenoTag + ".site.call_discrepancy"].push_back(cd.str());
 
287
 
 
288
        stringstream cc;
 
289
        cc << ccCount;
 
290
        var.info[otherGenoTag + ".site.call_concordance"].push_back(cc.str());
 
291
 
 
292
        stringstream nrdc;
 
293
        nrdc << nrdCount;
 
294
        var.info[otherGenoTag + ".site.non_reference_discrepancy.count"].push_back(nrdc.str());
 
295
 
 
296
        stringstream nrdn;
 
297
        nrdn << nrdNormalizer;
 
298
        var.info[otherGenoTag + ".site.non_reference_discrepancy.normalizer"].push_back(nrdn.str());
 
299
 
 
300
        if (nrdNormalizer > 0) {
 
301
            stringstream nrd;
 
302
            nrd << (double) nrdCount / (double) nrdNormalizer;
 
303
            var.info[otherGenoTag + ".site.non_reference_discrepancy"].push_back(nrd.str());
 
304
        }
 
305
 
 
306
        stringstream nrsc;
 
307
        nrsc << nrsCount;
 
308
        var.info[otherGenoTag + ".site.non_reference_sensitivity.count"].push_back(nrsc.str());
 
309
 
 
310
        stringstream nrsn;
 
311
        nrsn << nrsNormalizer;
 
312
        var.info[otherGenoTag + ".site.non_reference_sensitivity.normalizer"].push_back(nrsn.str());
 
313
 
 
314
        if (nrsNormalizer > 0) {
 
315
            stringstream nrs;
 
316
            nrs << (double) nrsCount / (double) nrsNormalizer;
 
317
            var.info[otherGenoTag + ".site.non_reference_sensitivity"].push_back(nrs.str());
 
318
        }
 
319
 
 
320
        cout << var << endl;
 
321
 
 
322
    }
 
323
 
 
324
    return 0;
 
325
 
 
326
}
 
327