3
#include "IntervalTree.h"
11
using namespace vcflib;
14
void printSummary(char** argv) {
15
cerr << "usage: " << argv[0] << " [options] [<vcf file>]" << endl
18
<< " -w, --ref-window-size N align using this many bases flanking each side of the reference allele" << endl
19
<< " -s, --alt-window-size N align using this many flanking bases from the reference around each alternate allele" << endl
20
<< " -r, --reference FILE FASTA reference file, required with -i and -u" << endl
21
<< " -m, --match-score N match score for SW algorithm" << endl
22
<< " -x, --mismatch-score N mismatch score for SW algorithm" << endl
23
<< " -o, --gap-open-penalty N gap open penalty for SW algorithm" << endl
24
<< " -e, --gap-extend-penalty N gap extension penalty for SW algorithm" << endl
25
<< " -z, --entropy-gap-open use entropy scaling for the gap open penalty" << endl
26
<< " -R, --repeat-gap-extend N penalize non-repeat-unit gaps in repeat sequence" << endl
27
<< " -a, --adjust-vcf TAG supply a new cigar as TAG in the output VCF" << endl
29
<< "For each alternate allele, attempt to realign against the reference with lowered gap open penalty." << endl
30
<< "If realignment is possible, adjust the cigar and reference/alternate alleles." << endl;
34
int main(int argc, char** argv) {
39
bool includePreviousBaseForIndels = false;
41
int altwindowsize = 50;
43
// constants for SmithWaterman algorithm
44
float matchScore = 10.0f;
45
float mismatchScore = -9.0f;
46
float gapOpenPenalty = 15.0f;
47
float gapExtendPenalty = 6.66f;
49
bool useEntropy = false;
50
bool useRepeatGapExtendPenalty = false;
51
float repeatGapExtendPenalty = 1;
53
bool adjustVcf = false;
54
string adjustedTag = "remappedCIGAR";
61
static struct option long_options[] =
63
/* These options set a flag. */
64
//{"verbose", no_argument, &verbose_flag, 1},
65
{"help", no_argument, 0, 'h'},
66
{"ref-window-size", required_argument, 0, 'w'},
67
{"reference", required_argument, 0, 'r'},
68
{"match-score", required_argument, 0, 'm'},
69
{"mismatch-score", required_argument, 0, 'x'},
70
{"gap-open-penalty", required_argument, 0, 'o'},
71
{"gap-extend-penalty", required_argument, 0, 'e'},
72
{"alt-window-size", required_argument, 0, 's'},
73
{"entropy-gap-open", no_argument, 0, 'z'},
74
{"repeat-gap-extend", no_argument, 0, 'R'},
75
{"adjust-vcf", required_argument, 0, 'a'},
78
/* getopt_long stores the option index here. */
81
c = getopt_long (argc, argv, "hza:w:r:m:x:o:e:s:R:",
82
long_options, &option_index);
90
windowsize = atoi(optarg);
99
fastaFileName = string(optarg);
107
matchScore = atof(optarg);
111
mismatchScore = atof(optarg);
115
gapOpenPenalty = atof(optarg);
119
gapExtendPenalty = atof(optarg);
123
altwindowsize = atoi(optarg);
131
useRepeatGapExtendPenalty = true;
132
repeatGapExtendPenalty = atof(optarg);
145
VariantCallFile variantFile;
146
string inputFilename;
147
if (optind == argc - 1) {
148
inputFilename = argv[optind];
149
variantFile.open(inputFilename);
151
variantFile.open(std::cin);
154
if (!variantFile.is_open()) {
155
cerr << "could not open VCF file" << endl;
159
FastaReference freference;
160
if (fastaFileName.empty()) {
161
cerr << "a reference is required" << endl;
164
freference.open(fastaFileName);
168
vector<string> commandline;
169
for (int i = 0; i < argc; ++i)
170
commandline.push_back(argv[i]);
171
variantFile.addHeaderLine("##INFO=<ID=" + adjustedTag + ",Number=A,Type=String,Description=\"CIGAR when remapped using"+ join(commandline, " ") +"\">");
174
cout << variantFile.header << endl;
176
Variant var(variantFile);
177
while (variantFile.getNextVariant(var)) {
182
map<string, vector<VariantAllele> > variantAlleles;
183
vector<vector<pair<int, char> > > cigars;
184
vector<int> positionDiffs;
185
for (vector<string>::iterator a = var.alt.begin(); a != var.alt.end(); ++a) {
186
//if (!adjustVcf) cout << endl;
189
// try to remap locally
191
string reference = freference.getSubSequence(var.sequenceName, var.position - 1 - windowsize, windowsize * 2 + var.ref.size());
193
// passed to sw align
194
unsigned int referencePos;
197
string& alternate = *a;
199
vector<VariantAllele>& variants = variantAlleles[alternate];
201
string alternateQuery = reference.substr(windowsize - altwindowsize, altwindowsize) + alternate + reference.substr(reference.size() - windowsize, altwindowsize);
203
//cout << "REF:\t" << reference << endl;
204
//cout << "ALT:\t" << string(windowsize - altwindowsize, ' ') << alternateQuery << endl;
206
CSmithWatermanGotoh sw(matchScore, mismatchScore, gapOpenPenalty, gapExtendPenalty);
207
if (useEntropy) sw.EnableEntropyGapPenalty(1);
208
if (useRepeatGapExtendPenalty) sw.EnableRepeatGapExtensionPenalty(repeatGapExtendPenalty);
209
sw.Align(referencePos, cigar, reference, alternateQuery);
215
vector<pair<int, char> > cigarData;
217
string ref = reference.substr(referencePos);
218
positionDiffs.push_back(referencePos); // TODO this... is borked
223
if (!adjustVcf) cout << cigar << endl;
224
cout << cigar << endl;
225
for (string::iterator c = cigar.begin(); c != cigar.end(); ++c) {
228
len = atoi(slen.c_str());
230
if (altpos < altwindowsize) {
231
cigarData.push_back(make_pair(len, 'M'));
233
cigarData.push_back(make_pair(len, *c));
235
altss << alternateQuery.substr(altpos, len);
236
refss << string(len, '-');
240
len = atoi(slen.c_str());
242
if (altpos < altwindowsize) {
244
cigarData.push_back(make_pair(len, *c));
246
refss << ref.substr(refpos, len);
247
altss << string(len, '-');
251
len = atoi(slen.c_str());
254
for (int i = 0; i < len; ++i) {
255
if (ref.at(refpos + i) == alternateQuery.at(altpos + i)) {
256
if (!cigarData.empty() && cigarData.back().second == 'M') {
257
cigarData.back().first++;
259
cigarData.push_back(make_pair(1, 'M'));
262
if (!cigarData.empty() && cigarData.back().second == 'X') {
263
cigarData.back().first++;
265
cigarData.push_back(make_pair(1, 'X'));
270
refss << ref.substr(refpos, len);
271
altss << alternateQuery.substr(altpos, len);
276
len = atoi(slen.c_str());
278
cigarData.push_back(make_pair(len, *c));
279
refss << ref.substr(refpos, len);
280
//altss << alternateQuery.substr(altpos, len); // TODO deal with soft clipping, weird behavior
292
cout << "ref:\t" << refss.str() << endl;
293
cout << "alt:\t" << altss.str() << endl;
295
cout << "ref:\t" << refss.str() << endl;
296
cout << "alt:\t" << altss.str() << endl;
297
cigars.push_back(cigarData);
303
int substart = cigars.front().front().first;
304
int subend = cigars.front().back().first;
306
// find the min and max match
307
for (vector<vector<pair<int, char> > >::iterator c = cigars.begin(); c != cigars.end(); ++c) {
308
if (c->front().second == 'M' && c->front().first <= substart) {
309
substart = c->front().first;
310
if (c->size() > 1 && c->at(1).second != 'X') {
314
if (c->back().second == 'M' && c->back().first <= subend) {
315
subend = c->back().first;
319
// adjust the cigars and get the new reference length
321
for (vector<vector<pair<int, char> > >::iterator c = cigars.begin(); c != cigars.end(); ++c) {
322
c->front().first -= substart;
323
c->back().first -= subend;
324
int crf = cigarRefLen(*c);
327
var.info[adjustedTag].push_back(joinCigar(*c));
330
// find the lowest positional difference
332
for (vector<int>::iterator d = positionDiffs.begin(); d != positionDiffs.end(); ++d) {
333
if (*d + altwindowsize < pdiff)
334
pdiff = *d + altwindowsize;
337
// adjust the reference string
338
var.position += pdiff;
340
// adjust the variant position
341
var.ref = freference.getSubSequence(var.sequenceName, var.position - 1, reflen);