3
* @brief An iterative de Bruijn graph assembler for transcriptome sequencing reads.
4
* @author Yu Peng (ypeng@cs.hku.hk)
20
#include "assembly/assembly_utility.h"
21
#include "assembly/local_assembler.h"
22
#include "basic/bit_operation.h"
23
#include "basic/histgram.h"
24
#include "graph/contig_graph.h"
25
#include "graph/hash_graph.h"
26
#include "misc/hash_aligner.h"
28
#include "misc/options_description.h"
29
#include "misc/utils.h"
30
#include "sequence/sequence.h"
31
#include "sequence/sequence_io.h"
32
#include "sequence/short_sequence.h"
41
string long_read_file;
61
bool is_pre_correction;
85
is_no_coverage = false;
86
is_no_correct = false;
87
is_pre_correction = false;
91
{ return directory + "/log"; }
94
{ return directory + "/kmer"; }
96
string align_file(int kmer_size)
97
{ return directory + FormatString("/align-%d", kmer_size); }
99
string align_local_file(int kmer_size)
100
{ return directory + FormatString("/align-local-%d", kmer_size); }
102
string graph_file(int kmer_size)
103
{ return directory + FormatString("/graph-%d.fa", kmer_size); }
105
string contig_file(int kmer_size)
106
{ return directory + FormatString("/contig-%d.fa", kmer_size); }
108
string contig_info_file(int kmer_size)
109
{ return directory + FormatString("/contig-info-%d.fa", kmer_size); }
111
string local_contig_file(int kmer_size)
112
{ return directory + FormatString("/local-contig-%d.fa", kmer_size); }
115
{ return directory + "/contig.fa"; }
117
string scaffold_file()
118
{ return directory + "/scaffold.fa"; }
120
string ref_contig_file()
121
{ return directory + "/ref_contig.fa"; }
123
string transcript_file(int kmer_size)
124
{ return directory + FormatString("/transcript-%d.fa", kmer_size); }
126
string label_transcript_file(int kmer_size)
127
{ return directory + FormatString("/label-transcript-%d.fa", kmer_size); }
129
string component_file(int kmer_size)
130
{ return directory + FormatString("/component-%d", kmer_size); }
133
AssemblyInfo assembly_info;
138
int max_isoforms = 3;
139
int max_component_size = 30;
141
void BuildHashGraph(int kmer_size);
142
void Assemble(HashGraph &hash_graph);
143
void AlignReads(const string &contig_file, const string &align_file);
144
void CorrectReads(int kmer_size);
145
void LocalAssembly(int kmer_size, int new_kmer_size);
146
void Iterate(int kmer_size, int new_kmer_size);
147
void FindIsoforms(ContigGraph &contig_graph, deque<Sequence> &transcripts);
149
bool CompareCoverage(const ContigGraphVertexAdaptor &x, const ContigGraphVertexAdaptor &y)
151
return x.coverage() > y.coverage();
154
bool CompareCoveragePath(const ContigGraphPath &x, const ContigGraphPath &y)
156
return CompareCoverage(x.back(), y.back());
159
double Similarity(ContigGraphPath &x, ContigGraphPath &y)
161
deque<ContigGraphVertexAdaptor> a;
162
deque<ContigGraphVertexAdaptor> b;
164
for (unsigned i = 0; i < x.num_nodes(); ++i)
166
for (unsigned i = 0; i < y.num_nodes(); ++i)
169
sort(a.begin(), a.end());
170
sort(b.begin(), b.end());
175
while (i < a.size() && j < b.size())
179
else if (a[i] > b[j])
183
common += a[i].contig_size();
189
return common / max(x.size(), y.size());
192
void FindIsoforms(ContigGraphPath &path, deque<ContigGraphPath> &isoforms, ContigGraph &contig_graph, deque<ContigGraphPath> &component_isoforms)
194
if ((int)isoforms.size() >= max_isoforms)
197
int kmer_size = contig_graph.kmer_size();
199
ContigGraphVertexAdaptor current = path.back();
200
if (current.out_edges().size() != 0)
202
deque<ContigGraphVertexAdaptor> candidates;
203
for (int x = 0; x < 4; ++x)
205
if (current.out_edges()[x])
207
ContigGraphVertexAdaptor next = contig_graph.GetNeighbor(current, x);
209
if (next.status().IsUsed())
212
candidates.push_back(next);
216
sort(candidates.begin(), candidates.end(), CompareCoverage);
218
for (unsigned i = 0; i < candidates.size(); ++i)
220
ContigGraphVertexAdaptor next = candidates[i];
221
path.Append(next, - kmer_size + 1);
222
next.status().SetUsedFlag();
223
FindIsoforms(path, isoforms, contig_graph, component_isoforms);
224
next.status().ResetUsedFlag();
232
if (flag && path.size() > 300)
234
isoforms.push_back(path);
239
bool Compare(int x, int y)
241
deque<ShortSequence> &reads = assembly_info.reads;
242
if (reads[x] != reads[y])
243
return reads[x] < reads[y];
245
return reads[x+1] < reads[y+1];
248
void SortReads(AssemblyInfo &assembly_info)
250
deque<ShortSequence> &reads = assembly_info.reads;
252
for (unsigned i = 0; i < reads.size(); i += 2)
254
if (reads[i+1] < reads[i])
255
swap(reads[i], reads[i+1]);
259
sort(aux.begin(), aux.end(), Compare);
261
deque<ShortSequence> new_reads;
263
for (int i = 0; i < (int64_t)aux.size(); ++i)
266
if (last == -1 || reads[id] != reads[last] || reads[id+1] != reads[last+1])
268
new_reads.push_back(reads[id]);
269
new_reads.push_back(reads[id+1]);
275
reads.swap(new_reads);
278
int main(int argc, char *argv[])
280
OptionsDescription desc;
282
desc.AddOption("out", "o", option.directory, "output directory");
283
desc.AddOption("read", "r", option.read_file, FormatString("fasta read file (<=%d)", ShortSequence::max_size()));
284
desc.AddOption("long_read", "l", option.long_read_file, FormatString("fasta long read file (>%d)", ShortSequence::max_size()));
285
desc.AddOption("mink", "", option.mink, FormatString("minimum k value (<=%d)", Kmer::max_size()));
286
desc.AddOption("maxk", "", option.maxk, FormatString("maximum k value (<=%d)", Kmer::max_size()));
287
desc.AddOption("step", "", option.step, "increment of k-mer of each iteration");
288
desc.AddOption("inner_mink", "", option.inner_mink, "inner minimum k value");
289
desc.AddOption("inner_step", "", option.inner_step, "inner increment of k-mer");
290
desc.AddOption("prefix", "", option.prefix_length, "prefix length used to build sub k-mer table");
291
desc.AddOption("min_count", "", option.min_count, "minimum multiplicity for filtering k-mer when building the graph");
292
desc.AddOption("min_support", "", option.min_support, "minimum supoort in each iteration");
293
desc.AddOption("num_threads", "", option.num_threads, "number of threads");
294
desc.AddOption("seed_kmer", "", option.seed_kmer_size, "seed kmer size for alignment");
295
desc.AddOption("min_contig", "", option.min_contig, "minimum size of contig");
296
desc.AddOption("similar", "", option.similar, "similarity for alignment");
297
desc.AddOption("max_mismatch", "", option.max_mismatch, "max mismatch of error correction");
298
//desc.AddOption("min_pairs", "", option.min_pairs, "minimum number of pairs");
299
//desc.AddOption("max_gap", "", option.max_gap, "maximum gap in reference");
300
desc.AddOption("no_local", "", option.is_no_local, "do not use local assembly");
301
desc.AddOption("no_coverage", "", option.is_no_coverage, "do not iterate on coverage");
302
desc.AddOption("no_correct", "", option.is_no_correct, "do not do correction");
303
desc.AddOption("pre_correction", "", option.is_pre_correction, "perform pre-correction before assembly");
304
desc.AddOption("max_isoforms", "", max_isoforms, "maximum number of isoforms");
305
desc.AddOption("max_component_size", "", max_component_size, "maximum size of components");
309
desc.Parse(argc, argv);
311
if (option.read_file == "" && option.long_read_file == "")
312
throw logic_error("not enough parameters");
314
if (option.maxk < option.mink)
315
throw invalid_argument("mink is larger than maxk");
317
if (option.maxk > (int)Kmer::max_size())
318
throw invalid_argument("maxk is too large");
322
cerr << e.what() << endl;
323
cerr << "IDBA-Tran - Iterative de Bruijn Graph Assembler for next-generation transcriptome sequencing data." << endl;
324
cerr << "Usage: idba_tran -r read.fa -o output_dir" << endl;
325
cerr << "Allowed Options: " << endl;
326
cerr << desc << endl;
330
MakeDir(option.directory);
332
LogThread log_thread(option.log_file());
334
string begin_file = option.directory + "/begin";
335
fclose(OpenFile(begin_file, "wb"));
337
if (option.num_threads == 0)
338
option.num_threads = omp_get_max_threads();
340
omp_set_num_threads(option.num_threads);
341
cout << "number of threads " << option.num_threads << endl;
343
ReadInput(option.read_file, option.long_read_file, assembly_info);
344
cout << "reads " << assembly_info.reads.size() << endl;
345
cout << "long reads " << assembly_info.long_reads.size() << endl;
347
SortReads(assembly_info);
348
cout << "reads " << assembly_info.reads.size() << endl;
349
cout << "long reads " << assembly_info.long_reads.size() << endl;
351
read_length = assembly_info.read_length();
352
cout << "read_length " << read_length << endl;
354
if (option.is_pre_correction)
356
int kmer_size = (option.maxk + option.mink)/2;
357
cout << "kmer " << kmer_size << endl;
358
BuildHashGraph(kmer_size);
359
AlignReads(option.contig_file(kmer_size), option.align_file(kmer_size));
360
CorrectReads(kmer_size);
361
assembly_info.ClearStatus();
364
int old_kmer_size = 0;
365
int kmer_size = option.mink;
368
cout << "kmer " << kmer_size << endl;
370
if (kmer_size == option.mink)
371
BuildHashGraph(kmer_size);
373
Iterate(old_kmer_size, kmer_size);
375
assembly_info.ClearStatus();
376
AlignReads(option.contig_file(kmer_size), option.align_file(kmer_size));
378
CorrectReads(kmer_size);
379
assembly_info.ClearStatus();
381
old_kmer_size = kmer_size;
382
kmer_size = min(option.maxk, kmer_size + option.step);
383
LocalAssembly(old_kmer_size, kmer_size);
385
if (old_kmer_size == option.maxk)
389
kmer_size = option.maxk;
391
deque<Sequence> contigs;
393
ReadSequence(option.contig_file(kmer_size), contigs, names);
394
FastaWriter writer(option.contig_file());
395
for (unsigned i = 0; i < contigs.size(); ++i)
397
if ((int)contigs[i].size() >= option.min_contig)
398
writer.Write(contigs[i], names[i]);
401
string end_file = option.directory + "/end";
402
fclose(OpenFile(end_file, "wb"));
409
void BuildHashGraph(int kmer_size)
411
BuildKmerFile(assembly_info, kmer_size, option.min_count, option.prefix_length, option.kmer_file());
413
HashGraph hash_graph(kmer_size);
414
ReadKmerFile(option.kmer_file(), hash_graph);
416
hash_graph.RefreshEdges();
418
if (!option.is_no_internal)
419
InsertInternalKmers(assembly_info, hash_graph, option.min_count);
421
if (option.reference != "")
423
deque<Sequence> ref_contigs;
424
ReadSequence(option.ref_contig_file(), ref_contigs);
425
#pragma omp parallel for
426
for (int64_t i = 0; i < (int64_t)ref_contigs.size(); ++i)
427
hash_graph.InsertUncountKmers(ref_contigs[i]);
428
hash_graph.RefreshEdges();
431
Assemble(hash_graph);
434
void Assemble(HashGraph &hash_graph)
436
cout << "kmers " << hash_graph.num_vertices() << " "<< hash_graph.num_edges() << endl;
438
int kmer_size = hash_graph.kmer_size();
439
double min_cover = max(1, (kmer_size == option.mink ? option.min_count : option.min_support));
441
Histgram<int> hist = hash_graph.coverage_histgram();
442
//double expected_coverage = hist.mean();
444
deque<Sequence> contigs;
445
deque<ContigInfo> contig_infos;
446
hash_graph.Assemble(contigs, contig_infos);
450
HashGraph tmp_hash_graph;
451
tmp_hash_graph.swap(hash_graph);
454
ContigGraph contig_graph(kmer_size, contigs, contig_infos);
456
contig_infos.clear();
458
if (!option.is_no_coverage)
460
contig_graph.RemoveStandAlone(kmer_size);
462
int bubble = contig_graph.RemoveBubble();
463
cout << "merge bubble " << bubble << endl;
465
contig_graph.RemoveLocalLowCoverage(min_cover, option.min_contig, 0.1);
468
contig_graph.SortVertices();
469
contig_graph.GetContigs(contigs, contig_infos);
470
WriteSequence(option.graph_file(kmer_size), contigs);
472
contig_infos.clear();
474
if (!option.is_no_coverage)
478
deque<Sequence> multi_contigs;
479
deque<ContigInfo> multi_contig_infos;
480
contig_graph.GetContigs(multi_contigs, multi_contig_infos);
481
PrintN50(multi_contigs);
483
contig_graph.Trim(10);
484
contig_graph.MergeSimilarPath();
485
contig_graph.GetContigs(multi_contigs, multi_contig_infos);
487
contig_graph.InitializeTable();
488
contig_graph.IterateComponentCoverage2(option.min_contig, ratio, min_cover, 1e100, 1.1, max_component_size);
489
contig_graph.GetContigs(multi_contigs, multi_contig_infos);
491
contig_graph.Trim(10);
492
contig_graph.Prune(kmer_size);
493
contig_graph.GetContigs(multi_contigs, multi_contig_infos);
495
contig_graph.MergeSimilarPath();
498
deque<Sequence> multi_contigs;
499
deque<ContigInfo> multi_contig_infos;
500
contig_graph.SortVertices();
501
contig_graph.GetContigs(multi_contigs, multi_contig_infos);
502
PrintN50(multi_contigs);
503
WriteSequence(option.contig_file(kmer_size), multi_contigs);
504
WriteContigInfo(option.contig_info_file(kmer_size), multi_contig_infos);
506
deque<Sequence> transcripts;
508
FindIsoforms(contig_graph, transcripts);
511
for (unsigned i = 0; i < transcripts.size(); ++i)
513
if (transcripts[i].size() >= 300)
514
transcripts[index++] = transcripts[i];
516
transcripts.resize(index);
518
PrintN50(transcripts);
519
WriteSequence(option.transcript_file(kmer_size), transcripts, FormatString("transcript-%d", kmer_size));
522
void AlignReads(const string &contig_file, const string &align_file)
524
deque<Sequence> contigs;
525
ReadSequence(contig_file, contigs);
527
HashAligner hash_aligner(option.seed_kmer_size, option.min_contig, 2);
528
hash_aligner.Initialize(contigs);
530
int64_t num_aligned_reads = AlignReads(assembly_info, hash_aligner, option.similar, align_file, true);
531
cout << "aligned " << num_aligned_reads << " reads" << endl;
534
void CorrectReads(int kmer_size)
536
if (option.is_no_correct)
539
deque<Sequence> contigs;
541
deque<ContigInfo> contig_infos;
542
ReadSequence(option.contig_file(kmer_size), contigs, names);
543
CorrectReads(assembly_info, contigs, contig_infos, option.align_file(kmer_size), option.max_mismatch);
544
//WriteSequence(option.contig_file(kmer_size), contigs);
545
WriteContig(option.contig_file(kmer_size), contigs, contig_infos, FormatString("contig-%d", kmer_size));
548
void LocalAssembly(int kmer_size, int new_kmer_size)
550
EstimateDistance(option.align_file(kmer_size), median, sd);
551
if (median < 0 || median != median || sd != sd || sd > 2*median)
553
cout << "invalid insert distance" << endl;
554
deque<Sequence> local_contigs;
555
WriteSequence(option.local_contig_file(kmer_size), local_contigs, FormatString("local_contig_%d", kmer_size));
559
deque<ShortSequence> &reads = assembly_info.reads;
561
deque<Sequence> contigs;
562
ReadSequence(option.contig_file(kmer_size), contigs);
564
LocalAssembler local_assembler;
565
local_assembler.Initialize(assembly_info, contigs);
566
local_assembler.set_num_threads(option.num_threads);
567
local_assembler.set_mink(option.inner_mink);
568
local_assembler.set_maxk(new_kmer_size);
569
local_assembler.set_step(option.inner_step);
570
local_assembler.set_min_contig(option.min_contig);
571
local_assembler.set_insert_distance(median, sd);
573
FILE *falign = OpenFile(option.align_file(kmer_size), "rb");
574
int buffer_size = (1 << 20) * option.num_threads;
575
for (int64_t offset = 0; offset < (int64_t)reads.size(); offset += buffer_size)
577
int64_t size = min((int64_t)buffer_size, (int64_t)(reads.size() - offset));
578
vector<HashAlignerRecord> all_records(size);
580
ReadHashAlignerRecordBlock(falign, all_records);
581
#pragma omp parallel for
582
for (int i = 0; i < size; ++i)
584
HashAlignerRecord &record = all_records[i];
586
if (record.match_length != 0)
587
local_assembler.AddReadByHashAlignerRecord(record, offset + i);
592
deque<Sequence> local_contigs;
594
if (!option.is_no_local)
595
local_assembler.Assemble(local_contigs);
597
int num_seed_contigs = 0;
598
for (unsigned i = 0; i < contigs.size(); ++i)
600
if ((int)contigs[i].size() > option.min_contig)
604
cout << "seed contigs " << num_seed_contigs << " local contigs " << local_contigs.size() << endl;
605
WriteSequence(option.local_contig_file(kmer_size), local_contigs, FormatString("local_contig_%d", kmer_size));
608
void Iterate(int kmer_size, int new_kmer_size)
610
deque<Sequence> contigs;
611
ReadSequence(option.contig_file(kmer_size), contigs);
613
deque<Sequence> local_contigs;
614
ReadSequence(option.local_contig_file(kmer_size), local_contigs);
616
deque<Sequence> multi_contigs;
617
ReadSequence(option.graph_file(kmer_size), multi_contigs);
619
deque<Sequence> transcripts;
620
ReadSequence(option.transcript_file(kmer_size), transcripts);
623
for (unsigned i = 0; i < contigs.size(); ++i)
624
sum += contigs[i].size();
625
HashGraph hash_graph(kmer_size);
626
hash_graph.reserve(sum);
628
deque<Sequence> old_contigs;
629
old_contigs.insert(old_contigs.end(), contigs.begin(), contigs.end());
630
old_contigs.insert(old_contigs.end(), local_contigs.begin(), local_contigs.end());
631
old_contigs.insert(old_contigs.end(), multi_contigs.begin(), multi_contigs.end());
632
old_contigs.insert(old_contigs.end(), transcripts.begin(), transcripts.end());
634
local_contigs.clear();
635
multi_contigs.clear();
637
IterateHashGraph(assembly_info, new_kmer_size, option.min_support, hash_graph, old_contigs);
638
kmer_size = new_kmer_size;
641
if (kmer_size < read_length)
642
hash_graph.RefreshEdges();
644
hash_graph.AddAllEdges();
646
Assemble(hash_graph);
649
void FindIsoforms(ContigGraph &contig_graph, deque<Sequence> &transcripts)
653
deque<deque<ContigGraphVertexAdaptor> > components;
654
deque<string> component_strings;
655
contig_graph.GetComponents(components, component_strings);
658
for (unsigned i = 0; i < components.size(); ++i)
660
if ((int)components[i].size() < max_component_size)
661
++num[components[i].size()];
665
for (unsigned j = 0; j < components[i].size(); ++j)
666
total += components[i][j].contig_size();
670
int kmer_size = contig_graph.kmer_size();
671
int output_component = 0;
673
//FastaWriter transcript_writer(option.label_transcript_file(kmer_size));
674
FastaWriter component_writer(option.component_file(kmer_size));
675
for (unsigned i = 0; i < components.size(); ++i)
677
for (unsigned j = 0; j < components[i].size(); ++j)
679
Sequence seq = components[i][j].contig();
680
component_writer.Write(seq, FormatString("component_%d_%d", i, j));
684
//int tran_index = 0;
685
for (unsigned i = 0; i < components.size(); ++i)
689
for (unsigned j = 0; j < components[i].size(); ++j)
691
if (components[i][j].in_edges().size() == 0)
693
if (components[i][j].out_edges().size() == 0)
697
if (num_end < num_begin)
699
for (unsigned j = 0; j < components[i].size(); ++j)
700
components[i][j].ReverseComplement();
701
swap(num_begin, num_end);
704
if ((int)components[i].size() <= max_component_size)// * 100)
710
deque<ContigGraphPath> all_isoforms;
711
for (unsigned j = 0; j < components[i].size(); ++j)
713
if (components[i][j].in_edges().size() == 0)
715
ContigGraphPath path;
716
path.Append(components[i][j], 0);
717
components[i][j].status().SetUsedFlag();
718
deque<ContigGraphPath> isoforms;
719
FindIsoforms(path, isoforms, contig_graph, all_isoforms);
720
components[i][j].status().ResetUsedFlag();
722
all_isoforms.insert(all_isoforms.end(), isoforms.begin(), isoforms.end());
724
for (unsigned k = 0; k < isoforms.size(); ++k)
727
ContigInfo contig_info;
728
isoforms[k].Assemble(contig, contig_info);
730
transcripts.push_back(contig);
732
//transcript_writer.Write(contig, FormatString("transcript_%d_%d", i, tran_index++));
737
for (unsigned j = 0; j < all_isoforms.size(); ++j)
739
for (unsigned k = 0; k < all_isoforms[j].num_nodes(); ++k)
740
all_isoforms[j][k].status().SetUsedFlag();
743
for (unsigned j = 0; j < components[i].size(); ++j)
745
if (!components[i][j].status().IsUsed())
747
transcripts.push_back(components[i][j].contig());
749
//transcript_writer.Write(components[i][j].contig(), FormatString("transcript_%d_%d", i, tran_index++));
755
for (unsigned j = 0; j < components[i].size(); ++j)
757
transcripts.push_back(components[i][j].contig());
759
//transcript_writer.Write(components[i][j].contig(), FormatString("transcript_%d_%d", i, tran_index++));
764
contig_graph.ClearStatus();