42
36
#include "io/phd.H"
43
37
#include "io/ncbiinfoxml.H"
39
#include "util/fileanddisk.H"
40
#include "util/progressindic.H"
45
44
#include "mira/gbf_parse.H"
46
45
#include "mira/gff_parse.H"
47
46
#include "mira/maf_parse.H"
50
48
#include <boost/unordered_map.hpp>
51
49
#include <boost/unordered_set.hpp>
264
268
if(getRead(rid).getReadGroupID().hasTemplateInfo()){
265
269
// Ooooops ... empty template but readgroupinfo?
267
272
cout << "Read " << getRead(rid).getName() << " has template info given, but the internal template name is empty? This is fishy!" << endl;
273
if(++ol_emptytn==200){
269
274
cout << "More than 200 cases like the above, will not report more.\n";
278
283
if(++tidcounter[tnI->second]==2){
279
284
getRead(rid).setTemplatePartnerID(firstpartner);
280
285
getRead(firstpartner).setTemplatePartnerID(rid);
287
// look out for unknown segments ("0"), try to fix if possible, else dump error message
288
uint8 numunknownseg=(getRead(rid).getTemplateSegment()==0)+(getRead(firstpartner).getTemplateSegment()==0);
291
if(numunknownseg==1){
292
Read * tofix=&getRead(rid);
293
Read * other=&getRead(firstpartner);
294
uint8 segmenttoset=0;
295
if(tofix->getTemplateSegment()>0) swap(tofix,other);
296
if(other->getTemplateSegment()==1){
298
}else if(other->getTemplateSegment()==255){
303
tofix->setTemplateSegment(segmenttoset);
304
ol_unknownsegmentfixed++;
305
if(ol_unknownsegmentfixed<200){
306
cout << "Read " << tofix->getName() << ": fixed unrecognised template segment.\n";
308
if(++ol_unknownsegmentfixed==200){
309
cout << "More than 200 cases like the above, will not report more.\n";
314
if(ol_unknownsegment<200){
316
if(getRead(rid).getTemplateSegment()==0){
317
cout << "DNA template " << tnI->first << ", read " << getRead(rid).getName() << ": template segment not recognised.\n";
319
if(getRead(firstpartner).getTemplateSegment()==0){
320
cout << "DNA template " << tnI->first << ", read " << getRead(firstpartner).getName() << ": template segment not recognised.\n";
322
if(++ol_unknownsegment==200){
323
cout << "More than 200 cases like the above, will not report more.\n";
329
// the folowing is almost impossible when reading FASTA/FASTQ and other simple files,
330
// but it may happen in CAF/MAF (and maybe SAM?)
331
if(getRead(rid).getTemplateSegment()==getRead(firstpartner).getTemplateSegment()){
332
if(ol_samesegment<200){
334
cout << "Reads " << getRead(rid).getName() << " and " << getRead(firstpartner).getName() << " have the same template segments.\n";
335
if(++ol_samesegment==200){
336
cout << "More than 200 cases like the above, will not report more.\n";
282
// Ooooops ... not good: more than one read for this template
342
// Ooooops ... not good: more than two reads for this template
283
343
getRead(rid).setTemplatePartnerID(-1);
284
344
getRead(firstpartner).setTemplatePartnerID(-1);
286
347
cout << tidcounter[tnI->second] << " ";
287
348
cout << "DNA template " << tnI->first << " has more than two reads, template info not used. Read at fault: " << getRead(rid).getName() << endl;
349
if(++ol_gt2reads==200){
289
350
cout << "More than 200 cases like the above, will not report more.\n";
305
if(outlines>0 && (*REP_miraparams)[0].getNagAndWarnParams().nw_stop_templateproblems){
306
MIRANOTIFY(Notify::FATAL,"Problems found with the template data, see output log for more info. This points some serious problem either with read naming (like unrecognised read naming scheme) or broken template information, please fix your input files!\nOr switch off this warning with -NW:sote=no (but you'll do this at own risk!)")
366
if(outlines_fatal>0 && (*REP_miraparams)[0].getNagAndWarnParams().nw_check_templateproblems){
367
static string emsg="Problems found with the template data, see output log for more info. This points some serious problem either with read naming (like unrecognised read naming scheme) or broken template information, please fix your input files!\nOr switch off this warning with -NW:ctp=no (but you'll do this at own risk!)";
368
if((*REP_miraparams)[0].getNagAndWarnParams().nw_check_templateproblems==NWSTOP) {
369
MIRANOTIFY(Notify::FATAL,emsg);
371
cout << "WARNING!\n" << emsg << endl;
384
/*************************************************************************
388
*************************************************************************/
390
void ReadPool::checkTemplateIDs(const string & errmsg)
392
FUNCSTART("void ReadPool::checkTemplateIDs(string & errmsg)");
393
for(uint32 ri=0;ri< REP_thepool3.size(); ++ri){
394
if(REP_thepool3.getRead(ri).getTemplatePartnerID()>=0
395
&& REP_thepool3.getRead(ri).getTemplateID() != REP_thepool3.getRead(REP_thepool3.getRead(ri).getTemplatePartnerID()).getTemplateID()){
396
cout << "Ouch, template problem for read " << ri << " " << REP_thepool3.getRead(ri).getName() << ", dumping readpool for debug\n";
397
dumpAs(cout,Read::AS_TEXT,true);
398
BUGIFTHROW(true, errmsg);
319
403
/*************************************************************************
352
if(outlines>0 && (*REP_miraparams)[0].getNagAndWarnParams().nw_stop_duplicatereadnames){
353
MIRANOTIFY(Notify::FATAL,"Some read names were found more than once (see log above). This usually hints to a serious problem with your input and should really, really be fixed. You can choose to ignore this error with '-NW:sodrn=no', but this will almost certainly lead to problems with result files (ACE and CAF for sure, maybe also SAM) and probably to other unexpected effects.");
436
if(outlines>0 && (*REP_miraparams)[0].getNagAndWarnParams().nw_check_duplicatereadnames){
437
static string emsg="Some read names were found more than once (see log above). This usually hints to a serious problem with your input and should really, really be fixed. You can choose to ignore this error with '-NW:cdrn=no', but this will almost certainly lead to problems with result files (ACE and CAF for sure, maybe also SAM) and probably to other unexpected effects.";
438
if((*REP_miraparams)[0].getNagAndWarnParams().nw_check_duplicatereadnames==NWSTOP){
439
MIRANOTIFY(Notify::FATAL,emsg);
441
cout << "WARNING!\n" << emsg << endl;
383
471
loadDataFromFASTQ_rgid(filename1, fqqualoffset, rgid, countonly, callback);
384
472
}else if(filetype=="fasta"){
385
473
loadDataFromFASTA_rgid(filename1, rgid, true, optfilename2, countonly, callback);
386
}else if(filetype=="fna" || filetype == "fastanoqual"){ // FASTA, but no qual. fna to be used by users, fastanoqual internally
474
}else if(filetype=="fna" || filetype == "fastanoqual" || filetype=="fa"){ // FASTA, but no qual. fna to be used by users, fastanoqual internally
387
475
loadDataFromFASTA_rgid(filename1, rgid, false, "", countonly, callback);
388
476
}else if(filetype=="gbf"){
389
477
loadDataFromGBF_rgid(filename1, rgid, countonly, callback);
390
478
}else if(filetype=="gff3"){
391
479
loadDataFromGFF3_rgid(filename1, rgid, countonly, callback);
392
480
}else if(filetype=="fofnexp"){
393
loadDataFromEXPs_rgid(filename1, rgid, countonly, callback);
481
loadDataFromFOFNEXP_rgid(filename1, rgid, countonly, callback);
482
}else if(filetype=="exp"){
483
loadDataFromEXP_rgid(filename1, rgid, countonly, callback);
394
484
}else if(filetype=="caf"){
395
485
loadDataFromCAF_rgid(filename1, rgid, countonly, callback);
396
486
}else if(filetype=="maf"){
421
511
*************************************************************************/
423
size_t ReadPool::loadDataFromEXPs_rgid(const string & fofn, const ReadGroupLib::ReadGroupID rgid, bool countonly, void (*callback)(ReadPool &))
513
size_t ReadPool::loadDataFromFOFNEXP_rgid(const string & fofn, const ReadGroupLib::ReadGroupID rgid, bool countonly, void (*callback)(ReadPool &))
425
FUNCSTART("size_t ReadPool::loadEXPs(const string & fofn, const ReadGroupLib::ReadGroupID rgid, bool countonly, void (*callback)(ReadPool &))");
515
FUNCSTART("size_t ReadPool::loadDataFromFOFNEXP(const string & fofn, const ReadGroupLib::ReadGroupID rgid, bool countonly, void (*callback)(ReadPool &))");
427
517
string justfilename, justpath;
428
518
splitFullPathAndFileName(fofn,justpath,justfilename);
630
*************************************************************************/
632
size_t ReadPool::loadDataFromEXP_rgid(const string & filename, const ReadGroupLib::ReadGroupID rgid, bool countonly, void (*callback)(ReadPool &))
634
FUNCSTART("size_t ReadPool::loadEXP(const string & filename, const ReadGroupLib::ReadGroupID rgid, bool countonly, void (*callback)(ReadPool &))");
636
uint32 no_files_ok=0;
638
Read & actread = getRead(provideEmptyRead());
639
actread.setReadGroupID(rgid);
641
actread.loadDataFromEXP(filename,"");
642
actread.transferSVTagsToClip(20,60);
646
if(callback!=nullptr) {
653
n.handleError(THISFUNC);
662
/*************************************************************************
539
666
*************************************************************************/
540
667
size_t ReadPool::loadDataFromFASTQ_rgid(const string & filename, const ReadGroupLib::ReadGroupID rgid, bool countonly, void (*callback)(ReadPool &))
1869
1996
/*************************************************************************
1871
1998
* like above, but using save()
1872
* This is for convert_project
1999
* This is for miraconvert
1874
2001
*************************************************************************/
1875
2002
void ReadPool::saveAsMAF(ostream & ostr, bool alsoinvalids) const