44
48
" -k, --kmer=KMER_SIZE k-mer size\n"
45
49
" -o, --out=FILE output the merged contigs to FILE [stdout]\n"
46
" -p, --path=PATH_FILE paths output by SimpleGraph\n"
50
" -g, --graph=FILE write the contig overlap graph to FILE\n"
47
51
" --merged output only merged contigs\n"
52
" --adj output the graph in adj format\n"
53
" --dot output the graph in dot format [default]\n"
54
" --dot-meancov same as above but give the mean coverage\n"
55
" --sam output the graph in SAM format\n"
48
56
" -v, --verbose display verbose output\n"
49
57
" --help display this help and exit\n"
50
58
" --version output version information and exit\n"
66
81
static float minIdentity = 0.9;
69
static const char shortopts[] = "k:o:p:v";
84
static const char shortopts[] = "g:k:o:v";
71
86
enum { OPT_HELP = 1, OPT_VERSION };
73
88
static const struct option longopts[] = {
89
{ "adj", no_argument, &opt::format, ADJ },
90
{ "dot", no_argument, &opt::format, DOT },
91
{ "dot-meancov", no_argument, &opt::format, DOT_MEANCOV },
92
{ "sam", no_argument, &opt::format, SAM },
93
{ "graph", required_argument, NULL, 'g' },
74
94
{ "kmer", required_argument, NULL, 'k' },
75
95
{ "merged", no_argument, &opt::onlyMerged, 1 },
76
96
{ "out", required_argument, NULL, 'o' },
84
104
/* A contig sequence. */
85
typedef FastaRecord Contig;
106
Contig(const string& comment, const string& seq)
107
: comment(comment), seq(seq) { }
108
Contig(const FastaRecord& o) : comment(o.comment), seq(o.seq) { }
87
113
/** The contig sequences. */
88
static vector<Contig> g_contigs;
114
typedef vector<Contig> Contigs;
90
116
/** Return the sequence of the specified contig node. The sequence
91
117
* may be ambiguous or reverse complemented.
93
static Sequence sequence(const ContigNode& id)
119
static Sequence sequence(const Contigs& contigs, const ContigNode& id)
95
121
if (id.ambiguous()) {
96
122
string s(id.ambiguousSequence());
132
158
typedef ContigGraph<DirectedGraph<ContigProperties, Distance> > Graph;
133
159
typedef graph_traits<Graph>::vertex_descriptor vertex_descriptor;
134
typedef ContigPath Path;
161
/** Return the properties of the specified vertex, unless u is
162
* ambiguous, in which case return the length of the ambiguous
166
ContigProperties get(vertex_bundle_t, const Graph& g, ContigNode u)
169
? ContigProperties(u.length() + opt::k - 1, 0)
136
173
/** Append the sequence of contig v to seq. */
137
static void mergeContigs(const Graph& g,
174
static void mergeContigs(const Graph& g, const Contigs& contigs,
138
175
vertex_descriptor u, vertex_descriptor v,
139
Sequence& seq, const Path& path)
176
Sequence& seq, const ContigPath& path)
141
178
int d = get(edge_bundle, g, u, v).distance;
143
180
unsigned overlap = -d;
144
const Sequence& s = sequence(v);
181
const Sequence& s = sequence(contigs, v);
145
182
assert(s.length() > overlap);
147
184
Sequence bo(s, 0, overlap);
197
static Contig mergePath(const Graph& g, const Path& path)
234
/** Return a FASTA comment for the specified path. */
235
static void pathToComment(ostream& out, const ContigPath& path)
237
assert(path.size() > 1);
239
if (path.size() == 3)
240
out << ',' << path[1];
241
else if (path.size() > 3)
243
out << ',' << path.back();
246
/** Merge the specified path. */
247
static Contig mergePath(const Graph& g, const Contigs& contigs,
248
const ContigPath& path)
200
251
unsigned coverage = 0;
201
for (Path::const_iterator it = path.begin();
252
for (ContigPath::const_iterator it = path.begin();
202
253
it != path.end(); ++it) {
203
254
if (!it->ambiguous())
204
255
coverage += g[*it].coverage;
205
256
if (seq.empty()) {
257
seq = sequence(contigs, *it);
208
259
assert(it != path.begin());
209
mergeContigs(g, *(it-1), *it, seq, path);
260
mergeContigs(g, contigs, *(it-1), *it, seq, path);
212
263
ostringstream ss;
213
ss << seq.size() << ' ' << coverage;
214
return Contig("", ss.str(), seq);
264
ss << seq.size() << ' ' << coverage << ' ';
265
pathToComment(ss, path);
266
return Contig(ss.str(), seq);
217
/** Return a FASTA comment for the specified path. */
218
static inline string pathToComment(const ContigPath& path)
220
assert(path.size() > 1);
223
if (path.size() == 3)
225
else if (path.size() > 3)
227
s << ',' << path.back();
269
/** A container of ContigPath. */
270
typedef vector<ContigPath> ContigPaths;
231
272
/** Read contig paths from the specified file.
232
273
* @param ids [out] the string ID of the paths
234
static vector<Path> readPaths(const string& inPath,
275
static ContigPaths readPaths(const string& inPath,
235
276
vector<string>* ids = NULL)
243
284
assert_good(fin, inPath);
244
285
istream& in = inPath == "-" ? cin : fin;
249
291
while (in >> id >> path) {
250
292
paths.push_back(path);
252
294
ids->push_back(id);
297
if (opt::verbose > 1 && count % 1000000 == 0)
298
cerr << "Read " << count << " paths. "
299
"Using " << toSI(getMemoryUsage())
302
if (opt::verbose > 0)
303
cerr << "Read " << count << " paths. "
304
"Using " << toSI(getMemoryUsage()) << "B of memory.\n";
254
305
assert(in.eof());
258
309
/** Finds all contigs used in each path in paths, and
259
310
* marks them as seen in the vector seen. */
260
static void seenContigs(vector<bool>& seen, const vector<Path>& paths)
311
static void seenContigs(vector<bool>& seen, const ContigPaths& paths)
262
for (vector<Path>::const_iterator it = paths.begin();
313
for (ContigPaths::const_iterator it = paths.begin();
263
314
it != paths.end(); ++it)
264
for (Path::const_iterator itc = it->begin();
315
for (ContigPath::const_iterator itc = it->begin();
265
316
itc != it->end(); ++itc)
266
317
if (itc->id() < seen.size())
267
318
seen[itc->id()] = true;
271
322
* should be removed.
273
324
static void markRemovedContigs(vector<bool>& marked,
274
const vector<string>& pathIDs, const vector<Path>& paths)
325
const vector<string>& pathIDs, const ContigPaths& paths)
276
for (vector<Path>::const_iterator it = paths.begin();
327
for (ContigPaths::const_iterator it = paths.begin();
277
328
it != paths.end(); ++it)
279
330
marked[ContigID(pathIDs[it - paths.begin()])] = true;
333
/** Output the updated overlap graph. */
334
static void outputGraph(Graph& g,
335
const vector<string>& pathIDs, const ContigPaths& paths,
336
const string& commandLine)
338
// Add the path vertices.
339
for (ContigPaths::const_iterator it = paths.begin();
340
it != paths.end(); ++it) {
341
const ContigPath& path = *it;
342
const string& id = pathIDs[it - paths.begin()];
344
ContigID::insert(id);
345
merge(g, path.begin(), path.end());
349
// Remove the vertices that are used in paths.
350
for (ContigPaths::const_iterator it = paths.begin();
351
it != paths.end(); ++it) {
352
const ContigPath& path = *it;
353
const string& id = pathIDs[it - paths.begin()];
355
remove_vertex(ContigNode(id, false), g);
357
remove_vertex_if(g, path.begin(), path.end(),
358
not1(std::mem_fun_ref(&ContigNode::ambiguous)));
363
const string& graphPath = opt::graphPath;
364
assert(!graphPath.empty());
365
if (opt::verbose > 0)
366
cerr << "Writing `" << graphPath << "'..." << endl;
367
ofstream fout(graphPath.c_str());
368
assert_good(fout, graphPath);
369
write_graph(fout, g, PROGRAM, commandLine);
370
assert_good(fout, graphPath);
371
if (opt::verbose > 0)
372
printGraphStats(cerr, g);
282
375
int main(int argc, char** argv)
284
377
opt::trimMasked = false;
382
char** last = argv + argc - 1;
383
copy(argv, last, ostream_iterator<const char *>(ss, " "));
385
commandLine = ss.str();
286
388
bool die = false;
287
389
for (int c; (c = getopt_long(argc, argv,
288
390
shortopts, longopts, NULL)) != -1;) {
289
391
istringstream arg(optarg != NULL ? optarg : "");
291
393
case '?': die = true; break;
394
case 'g': arg >> opt::graphPath; assert(arg.eof()); break;
292
395
case 'k': arg >> opt::k; break;
293
396
case 'o': arg >> opt::out; break;
294
case 'p': arg >> opt::path; break;
295
397
case 'v': opt::verbose++; break;
297
399
cout << USAGE_MESSAGE;
326
433
const char* contigFile = argv[optind++];
327
434
string adjPath, mergedPathFile;
328
if (argc - optind > 1)
436
if (argc - optind > 1) {
329
437
adjPath = string(argv[optind++]);
439
// Read the contig adjacency graph.
440
if (opt::verbose > 0)
441
cerr << "Reading `" << adjPath << "'..." << endl;
442
ifstream fin(adjPath.c_str());
443
assert_good(fin, adjPath);
446
if (opt::verbose > 0)
447
cerr << "Read " << num_vertices(g) << " vertices. "
448
"Using " << toSI(getMemoryUsage()) << "B of memory.\n";
330
450
mergedPathFile = string(argv[optind++]);
332
452
// Read the contig sequence.
333
vector<Contig>& contigs = g_contigs;
455
if (opt::verbose > 0)
456
cerr << "Reading `" << contigFile << "'..." << endl;
335
458
FastaReader in(contigFile, FastaReader::NO_FOLD_CASE);
336
459
for (FastaRecord rec; in >> rec;) {
337
ContigID::insert(rec.id);
461
&& ContigID::count(rec.id) == 0)
463
ContigID id = adjPath.empty()
464
? ContigID::insert(rec.id)
466
assert(id == contigs.size());
338
467
contigs.push_back(rec);
470
if (opt::verbose > 1 && count % 1000000 == 0)
471
cerr << "Read " << count << " sequences. "
472
"Using " << toSI(getMemoryUsage())
475
if (opt::verbose > 0)
476
cerr << "Read " << count << " sequences. "
477
"Using " << toSI(getMemoryUsage())
340
479
assert(in.eof());
341
480
assert(!contigs.empty());
342
481
opt::colourSpace = isdigit(contigs[0].seq[0]);
347
485
vector<string> pathIDs;
348
vector<Path> paths = readPaths(mergedPathFile, &pathIDs);
349
if (opt::verbose > 0)
350
cerr << "Number of paths: " << paths.size() << '\n';
486
ContigPaths paths = readPaths(mergedPathFile, &pathIDs);
352
488
// Record all the contigs that are in a path.
353
489
vector<bool> seen(contigs.size());
354
490
seenContigs(seen, paths);
355
491
markRemovedContigs(seen, pathIDs, paths);
357
// Record all the contigs that were in a previous path.
358
if (argc - optind > 0) {
360
for (; optind < argc; optind++) {
361
vector<Path> prevPaths = readPaths(argv[optind]);
362
seenContigs(seen, prevPaths);
363
count += prevPaths.size();
365
if (opt::verbose > 0)
366
cerr << "Number of previous paths: " << count << '\n';
369
// Record all the contigs that are seeds.
370
if (!opt::path.empty()) {
371
vector<bool> seenPivots(contigs.size());
372
ifstream fin(opt::path.c_str());
373
assert_good(fin, opt::path);
374
for (ContigID id; fin >> id;) {
375
fin.ignore(numeric_limits<streamsize>::max(), '\n');
376
assert(id < contigs.size());
377
// Only count a pivot as seen if it was in a final path.
379
seenPivots[id] = true;
385
493
// Output those contigs that were not seen in a path.
387
495
ostream& out = opt::out == "-" ? cout
388
496
: (fout.open(opt::out.c_str()), fout);
389
497
assert_good(out, opt::out);
390
498
if (!opt::onlyMerged) {
391
for (vector<Contig>::const_iterator it = contigs.begin();
392
it != contigs.end(); ++it)
393
if (!seen[it - contigs.begin()])
499
for (Contigs::const_iterator it = contigs.begin();
500
it != contigs.end(); ++it) {
501
ContigID id(it - contigs.begin());
503
const Contig& contig = *it;
505
if (!contig.comment.empty())
506
out << ' ' << contig.comment;
507
out << '\n' << contig.seq << '\n';
397
512
if (adjPath.empty())
400
// Read the contig adjacency graph.
401
ifstream fin(adjPath.c_str());
402
assert_good(fin, adjPath);
407
515
unsigned npaths = 0;
408
for (vector<Path>::const_iterator it = paths.begin();
516
for (ContigPaths::const_iterator it = paths.begin();
409
517
it != paths.end(); ++it) {
410
const Path& path = *it;
518
const ContigPath& path = *it;
411
519
if (path.empty())
413
Contig contig = mergePath(g, path);
414
contig.id = pathIDs[it - paths.begin()];
415
FastaRecord rec(contig);
416
rec.comment += ' ' + pathToComment(path);
521
Contig contig = mergePath(g, contigs, path);
522
out << '>' << pathIDs[it - paths.begin()]
523
<< ' ' << contig.comment << '\n'
524
<< contig.seq << '\n';
418
525
assert_good(out, opt::out);