2
* @file validate_reads_blat.cpp
3
* @brief Use blat alignment result to validate reads.
4
* @author Yu Peng (ypeng@cs.hku.hk)
18
#include "misc/blat_record.h"
19
#include "misc/options_description.h"
20
#include "misc/utils.h"
21
#include "sequence/sequence.h"
22
#include "sequence/sequence_io.h"
26
const int MaxLine = (1 << 20);
31
int main(int argc, char *argv[])
34
double similar = 0.95;
35
double complete_rate = 0.8;
36
bool is_local = false;
38
OptionsDescription desc;
39
desc.AddOption("min_contig", "", min_contig, "minimum contigs");
40
desc.AddOption("similar", "", similar, "similarity");
41
desc.AddOption("complete_rate", "", complete_rate, "completeness");
42
desc.AddOption("is_local", "", is_local, "local align");
46
desc.Parse(argc, argv);
50
cerr << e.what() << endl;
51
cerr << "validate_contigs_blat - validate contigs by blat." << endl;
52
cerr << "Usage: validate_contigs_blat ref.fa contigs.fa." << endl;
53
cerr << "Allowed Options: " << endl;
59
deque<string> ref_names;
60
ReadSequence(argv[1], refs, ref_names);
62
deque<Sequence> contigs;
63
deque<string> contig_names;
64
ReadSequence(argv[2], contigs, contig_names);
66
vector<int> is_found(refs.size());
67
vector<vector<double> > flags(refs.size());
68
map<string, int> dict;
69
for (unsigned i = 0; i < refs.size(); ++i)
71
flags[i].resize(refs[i].size(), false);
72
size_t index = ref_names[i].find(' ');
73
if (index != string::npos)
74
ref_names[i].resize(index);
75
dict[ref_names[i]] = i;
79
for (unsigned i = 0; i < contigs.size(); ++i)
81
size_t index = contig_names[i].find(' ');
82
if (index != string::npos)
83
contig_names[i].resize(index);
85
bool is_new_gap = true;
86
for (unsigned j = 0; j < contigs[i].size(); ++j)
88
if (contigs[i][j] == 4)
101
string blat_file = string(argv[2]) + ".blat";
102
FILE *fblat = OpenFile(blat_file, "rb");
104
map<string, int> valid_contigs;
105
deque<int> valid_lengths;
106
int64_t num_mismatch = 0;
107
while (fgets(line, MaxLine, fblat) != NULL)
112
deque<BlatRecord> records;
113
records.push_back(record);
115
while (fgets(line, MaxLine, fblat) != NULL)
118
if (record.query_name == records.back().query_name)
119
records.push_back(record);
122
fseek(fblat, -strlen(line), SEEK_CUR);
128
for (unsigned i = 0; i < records.size(); ++i)
130
if (records[i].match_count > similar * records[i].query_length
131
&& records[i].match_count > similar * abs(record.ref_to - record.ref_from))
132
records[index++] = records[i];
134
records.resize(index);
136
for (unsigned i = 0; i < records.size(); ++i)
139
int ref_id = dict[record.ref_name];
141
//if (record.match_count > similar * record.query_length && record.query_length >= min_contig
142
if ((record.match_count > similar * record.query_length
143
|| (is_local && record.match_count > similar * abs(record.query_to - record.query_from)))
144
//if (record.match_count > similar * abs(record.query_to - record.query_from)
145
&& abs(record.query_to - record.query_from) >= min_contig
146
&& record.match_count > similar * abs(record.ref_to - record.ref_from)
149
//if (record.match_count >= similar * record.ref_length)
150
if (record.match_count >= complete_rate * record.ref_length)
151
is_found[ref_id] = true;
156
for (unsigned i = 0; i < record.blocks.size(); ++i)
158
BlatBlock block = record.blocks[i];
159
for (unsigned j = block.ref_from; j < block.ref_from + block.size; ++j)
161
if (flags[ref_id][j] == false)
163
//flags[ref_id][j] = true;
167
flags[ref_id][j] += 1.0 / records.size();
171
if (valid_contigs.find(record.query_name) == valid_contigs.end())
173
valid_contigs[record.query_name] = record.mismatch_count;
174
valid_lengths.push_back(record.query_to - record.query_from);
178
valid_contigs[record.query_name] = min(record.mismatch_count, (int64_t)valid_contigs[record.query_name]);
180
if (not_used > similar * record.query_length)
181
valid_lengths.push_back(record.query_to - record.query_from);
187
for (map<string, int>::iterator p = valid_contigs.begin(); p != valid_contigs.end(); ++p)
189
num_mismatch += p->second;
194
for (unsigned k = 0; k < flags.size(); ++k)
196
for (unsigned i = 0; i < flags[k].size(); ++i)
204
//valid_lengths.push_back(60000);
205
sort(valid_lengths.begin(), valid_lengths.end());
206
reverse(valid_lengths.begin(), valid_lengths.end());
212
for (unsigned i = 0; i < valid_lengths.size(); ++i)
214
sum += valid_lengths[i];
215
if (sum >= 0.5 * total && n50 == 0)
216
n50 = valid_lengths[i];
217
if (sum >= 0.8 * total && n80 == 0)
218
n80 = valid_lengths[i];
220
cout << "total " << total << " " << sum << endl;
222
long long maximum = 0;
224
if (valid_lengths.size() > 0)
226
maximum = valid_lengths[0];
227
mean = sum / valid_lengths.size();
230
long long sum_wrong = 0;
231
long long num_wrong = 0;
232
long long corret_contigs = 0;
233
long long sum_corret = 0;
236
deque<int> contig_flags(contigs.size(), false);
237
FastaWriter error_writer(FormatString("%s.error.fa", argv[2]));
238
for (unsigned i = 0; i < contigs.size(); ++i)
240
if ((int)contigs[i].size() < min_contig)
243
if (valid_contigs.find(contig_names[i]) == valid_contigs.end())
246
sum_wrong += contigs[i].size();
247
error_writer.Write(contigs[i], contig_names[i]);
252
last_error = sum_wrong;
255
sum_corret += contigs[i].size();
256
contig_flags[i] = true;
257
//correct_writer.Write(contigs[i], contig_names[i]);
261
printf("last id %d %d total contigs %d gaps %d\n", last_id, last_error, (int)(num_wrong + corret_contigs), num_gaps);
262
printf("contigs: %lld N50: %lld coverage: %.2f%% max: %lld mean: %lld total: %lld/%lld N80: %lld\n",
263
(long long)valid_contigs.size(), n50, count * 100.0 / total, maximum, mean, count, total, n80);
264
printf("substitution error: %.4f%% wrong contigs: %lld %lld correct: %lld %lld %s\n",
265
num_mismatch * 100.0 /sum, num_wrong, sum_wrong, corret_contigs, sum_corret, argv[2]);
268
for (unsigned i = 0; i < refs.size(); ++i)
271
for (unsigned j = 0; j < refs[i].size(); ++j)
273
if (flags[i][j] == 0)
277
lengths.push_back(j - last);
283
if (flags[i][last] == 0)
288
sort(lengths.begin(), lengths.end());
289
reverse(lengths.begin(), lengths.end());
291
deque<Sequence> gaps;
292
deque<bool> is_no_long_gaps(refs.size());
293
for (unsigned i = 0; i < refs.size(); ++i)
301
for (unsigned j = 0; j < refs[i].size(); ++j)
303
if (flags[i][j] == false)
305
gap.Append(refs[i][j]);
312
tmp.push_back(gap.size());
321
tmp.push_back(gap.size());
326
is_no_long_gaps[i] = true;
327
for (unsigned j = 1; j+1 < tmp.size(); ++j)
330
is_no_long_gaps[i] = false;
333
WriteSequence(FormatString("%s.gap.fa", argv[2]), gaps, "gap");
335
FastaWriter ref_writer(argv[1] + string(".found.fa"));
336
FILE *ffcound_list = OpenFile(argv[1] + string(".found.fa.list"), "wb");
339
int total_contigs = 0;
340
for (unsigned i = 0; i < refs.size(); ++i)
343
double total_hit = 0;
344
for (unsigned j = 0; j < flags[i].size(); ++j)
348
total_hit += flags[i][j];
351
if (count > complete_rate * refs[i].size()
352
//&& is_no_long_gaps[i]
353
//&& 1.0 * total_hit / count > 2
357
ref_writer.Write(refs[i], ref_names[i]);
358
fprintf(ffcound_list, "%s %.4f\n", ref_names[i].c_str(), 1.0 * total_hit / count);
364
int64_t total_bases = 0;
365
for (unsigned i = 0; i < contigs.size(); ++i)
367
if ((int)contigs[i].size() >= min_contig)
370
total_bases += contigs[i].size();
374
cout << corret_contigs << " " << total_contigs << " " << total_bases << endl;
375
printf("cover ref: %d %d\n", covered, (int)refs.size());
376
printf("found ref: %d %d\n", found, (int)refs.size());
377
printf("precision: %.2f%% %d %d\n", 100.0 * corret_contigs / total_contigs, (int)corret_contigs, total_contigs);
379
FastaWriter correct_writer(FormatString("%s.correct.fa", argv[2]));
380
for (unsigned i = 0; i < contigs.size(); i += 2)
382
if (contig_flags[i] || contig_flags[i+1])
384
correct_writer.Write(contigs[i], contig_names[i]);
385
correct_writer.Write(contigs[i+1], contig_names[i+1]);