2
* Written by Bastien Chevreux (BaCh)
4
* Copyright (C) 1997-2000 by the German Cancer Research Center (Deutsches
5
* Krebsforschungszentrum, DKFZ Heidelberg) and Bastien Chevreux
6
* Copyright (C) 2000 and later by Bastien Chevreux
10
* This program is free software; you can redistribute it and/or
11
* modify it under the terms of the GNU General Public License
12
* as published by the Free Software Foundation; either version 2
13
* of the License, or (at your option) any later version.
15
* This program is distributed in the hope that it will be useful,
16
* but WITHOUT ANY WARRANTY; without even the implied warranty of
17
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18
* GNU General Public License for more details.
20
* You should have received a copy of the GNU General Public License
21
* along with this program; if not, write to the
22
* Free Software Foundation, Inc.,
23
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
29
#include <boost/unordered_set.hpp>
30
#include <boost/lexical_cast.hpp>
32
//#include <valgrind/callgrind.h>
34
#include "io/generalio.H"
35
#include "util/fileanddisk.H"
36
#include "mira/parameters.H"
37
#include "mira/assembly.H"
38
#include "mira/assembly_output.H"
39
#include "mira/maf_parse.H"
40
#include "mira/sam_collect.H"
41
#include "mira/gff_save.H"
42
#include "mira/hashstats.H"
45
#include "progs/quirks.H"
48
#include "memorc/memorc.H"
59
typedef boost::unordered_map<std::string, size_t> strintmap;
60
static strintmap GE_nameselectionmap;
63
static void makeSelectionStringSet(string & filename);
64
static bool checkNamePresence(string & name);
65
static bool hasNames();
66
static size_t getNameOrder(const string & name);
69
General::strintmap General::GE_nameselectionmap;
72
void General::makeSelectionStringSet(string & filename)
74
FUNCSTART("void makeSelectionStringSet(string & filename)");
77
fin.open(filename.c_str(), ios::in);
79
MIRANOTIFY(Notify::FATAL, "File not found: " << filename);
81
fin.seekg(0, ios::beg);
83
string elemname, dummy;
84
strintmap::iterator nI;
86
while(GeneralIO::readKeyValue(fin, elemname,dummy)){
87
nI=GE_nameselectionmap.find(elemname);
88
if(nI==GE_nameselectionmap.end()) {
89
GE_nameselectionmap[elemname]=numread;
95
if(GE_nameselectionmap.empty()) {
105
bool General::checkNamePresence(string & name)
107
if(GE_nameselectionmap.empty()) return true;
108
return (GE_nameselectionmap.find(name) != GE_nameselectionmap.end());
111
bool General::hasNames()
113
return !GE_nameselectionmap.empty();
116
size_t General::getNameOrder(const string & name)
118
if(GE_nameselectionmap.empty()) return (0-1);
119
strintmap::iterator nI=GE_nameselectionmap.find(name);
120
if (nI == GE_nameselectionmap.end()) return (0-1);
132
static ofstream TS_fout;
134
static list<Contig> TS_clist; // needed for CAF conversion (and GBF)
140
static void doit2(list<Contig> & contigs);
141
static void saveCList(list<Contig> & clist, ReadPool & rp);
142
static void cafload_callback(list<Contig> & clist, ReadPool & rp);
145
int main(int argc, char ** argv);
149
list<Contig> tagsnp::TS_clist;
150
ofstream tagsnp::TS_fout;
154
cerr << "tagsnp\t(MIRALIB version " << MIRALIBVERSION << ")\n\n";
156
cerr << " tagsnp [-xxx] cafin cafout [optional MIRA settings]\n\n";
157
cerr << "Options:\n";
158
cerr << "\t-n\t\tnuke all existing SNP and RMB tags in file\n";
159
// cerr << "\t-s <filename>\tload strain data from file\n";
160
// cerr << "\t-a\t\tassume SNPs instead of repeats\n";
161
// cerr << "\t-r <int>\tminimum reads per group (default: 1)\n";
162
// cerr << "\t-q <int>\tminimum qual for tagging (default: 30)\n";
163
// cerr << "\t-n <int>\tminimum neighbour qual for tagging (default: 20)\n";
164
// cerr << "\t-e <int>\tend read exclusion area (default: 25)\n";
165
// cerr << "\t-g\t\talso mark gap bases\n";
166
// cerr << "\t-m\t\talso mark multicolumn gap bases\n";
171
//void tagsnp::save(string & cafout)
174
// if(!cafout.empty()){
175
// assout::saveAsCAF(contigs,cafout);
177
// assout::dumpAsCAF(contigs,cout);
181
// //filename="out.tcs";
182
// //assout::saveAsTCS(contigs,filename);
184
// //filename="tagsnp_out.gap4da";
185
// //assout::saveAsGAP4DA(contigs,filename);
187
// //filename="featureanalysis.txt";
188
// //assout::saveFeatureAnalysis(400,100,contigs,readpool,
190
// // "featuresummary.txt",
191
// // "featureprot.txt");
194
// // string filename="out.html";
195
// // cout << "Saving contigs to htmlfile: " << filename << endl;
196
// // ofstream out(filename.c_str(), ios::out | ios::trunc);
197
// // assout::dumpContigListAsHTML(contigs, "Super project", out);
202
//void tagsnp::load (MIRAParameters * mp, string & cafin, string & strainin)
204
// cerr << "Loading project from CAF file: " << cafin << endl;
206
// CAF tcaf(readpool, contigs, mp);
209
// if(!strainin.empty()){
210
// cerr << "Loading strain data";
211
// readpool.loadStrainData(strainin);
214
// Assembly::refreshContigAndReadpoolValuesAfterLoading(readpool,contigs);
217
//void tagsnp::doit(list<Contig> & contigs)
219
// cout << "Tagging reads ..." << endl;
220
// list<Contig>::iterator I=contigs.begin();
221
// for(;I!=contigs.end(); I++){
222
// //I->setParams(&P);
224
// //uint32 numSRMB=0;
225
// //uint32 numWRMB=0;
226
// //uint32 numSNP=0;
227
// //I->transposeReadSRMTagsToContig();
228
// ////I->markPossibleRepeats(numSRMB, numWRMB, numSNP);
229
// //vector<bool> readsmarkedsrm;
230
// //I->newMarkPossibleRepeats(numSRMB,readsmarkedsrm);
235
void tagsnp::saveCList(list<Contig> & clist, ReadPool & rp)
237
Contig::setCoutType(Contig::AS_CAF);
238
list<Contig>::iterator I=clist.begin();
239
for(;I!=clist.end(); I++){
244
void tagsnp::cafload_callback(list<Contig> & clist, ReadPool & rp)
248
Assembly::refreshContigAndReadpoolValuesAfterLoading(rp,clist);
249
clist.back().trashConsensusCache(false);
252
saveCList(clist, rp);
258
void tagsnp::doit2(list<Contig> & contigs)
260
cout << "Tagging reads ..." << endl;
261
list<Contig>::iterator I=contigs.begin();
262
for(;I!=contigs.end(); I++){
263
I->trashConsensusCache(false);
265
Contig::repeatmarker_stats_t repstats;
266
vector<bool> readsmarkedsrm;
267
I->newMarkPossibleRepeats(repstats, readsmarkedsrm);
269
I->markFeaturesByConsensus(true,true,true);
275
int tagsnp::main(int argc, char ** argv)
277
FUNCSTART("int main(int argc, char ** argv)");
279
vector<MIRAParameters> Pv;
280
MIRAParameters::setupStdMIRAParameters(Pv);
282
const_cast<contig_parameters &>(Pv[0].getContigParams()).con_disregard_spurious_rmb_mismatches=false;
284
ReadPool thepool(&Pv);
295
c = getopt(argc, argv, "+h");
308
if(argc-optind < 2) {
309
cerr << argv[0] << ": " << "Missing at least infile or outfile as argument!\n";
314
string infile=argv[optind++];
315
string outfile=argv[optind++];
317
if(argc-optind > 0) {
319
for(int32 i=optind; i<argc; i++) tss << argv[i] << " *=BEGIN0=*";
320
MIRAParameters::parse(tss,Pv,nullptr);
323
MIRAParameters::dumpAllParams(Pv, cout);
325
TS_fout.open(outfile.c_str(), ios::out);
327
MIRANOTIFY(Notify::FATAL, "Could not open file for saving: " << outfile);
331
vector<uint32> dummy;
332
CAF tcaf(&thepool, &TS_clist, &Pv);
334
ReadGroupLib::SEQTYPE_SANGER,
341
//load(&P, infile, strainin);
348
n.handleError("main");
351
cerr << "Unexpected exception: Flow()\n";
370
static vector<MIRAParameters> CP_Pv;
372
static string CP_fromtype;
373
static list<string> CP_totype;
375
static list<ofstream *> CP_ofs;
378
static string CP_infile;
379
static string CP_outbasename;
381
static string CP_renamesequences;
382
static string CP_renamenamescheme;
384
static bool CP_splitcontigs2singlefiles;
386
static bool CP_deletestaronlycolumns;
387
static bool CP_blinddata;
388
static bool CP_fillholesinstraingenomes;
389
static bool CP_makecontigs;
390
static bool CP_extractreadsinsteadcontigs;
391
static bool CP_hardtrim;
393
static string CP_namefile;
394
static bool CP_keepnamesfromfile;
395
static bool CP_sortbyname;
397
static bool CP_mustdeletetargetfiles;
399
static bool CP_specialtestcode;
401
static bool CP_filter2readgroup;
403
static base_quality_t CP_minqual;
404
static bool CP_needsquality;
405
static base_quality_t CP_defaultqual;
407
static char CP_recalcconopt;
408
static char CP_recalcfeatureopt;
410
static uint32 CP_minbasecoverage;
412
static uint32 CP_mincontiglength;
413
static bool CP_minlengthisclipped;
415
static uint32 CP_mincontigcoverage;
416
static uint32 CP_minnumreads;
418
static list<Contig> CP_clist; // needed for CAF & MAF conversion (and GBF)
419
static AssemblyInfo CP_assemblyinfo;
421
static uint64 CP_readrenamecounter;
422
static GFFSave CP_gffsave;
423
static SAMCollect CP_samcollect;
427
static void checkTypes(const string & fromtype,list<string> & totype);
428
static void putReadsInContigsAndSave(vector<MIRAParameters> & Pv, ReadPool & rp);
429
static void discardShortReads(vector<MIRAParameters> & Pv, ReadPool & rp, uint32 minlength, bool fromclipped);
430
static void specialTestCode(list<Contig> & clist, ReadPool & rp);
432
static void filterToReadGroup(ReadPool & rp);
434
static bool contig__nameordercomp(const Contig & a, const Contig & b);
435
static void sortContigsByName(list<Contig> & clist);
436
static void sortPoolByName(ReadPool & rp, string & filename);
438
static void saveContigList(list<Contig> & clist, ReadPool & rp);
439
static void saveContigList_helper(list<Contig> & clist, ReadPool & rp);
441
static void saveReadPool(ReadPool & rp, list<ofstream *> & ofs);
442
static void cafmafload_callback(list<Contig> & clist, ReadPool & rp);
443
static void readpoolload_callback(ReadPool & rp);
444
static string createFileNameFromBasePostfixContigAndRead(const string & basename,
446
Contig * actcon = nullptr,
447
Read * actread = nullptr);
448
static uint32 openOFSlist(Contig * optcontig, list<ofstream *> & ofs);
449
static void closeOFSList(uint32 howmany, list<ofstream *> & ofs);
454
int main2(int argc, char ** argv);
456
static void closeOpenStreams(list<ofstream *> & ofsl);
458
static bool checkForFromType(const string & ftype);
459
static bool checkForToType(const string & ttype);
460
static void guessFromAndToType(const string & fnamefrom,
463
const string & fnameto,
464
list<string> & totypes,
468
vector<MIRAParameters> ConvPro::CP_Pv;
470
string ConvPro::CP_fromtype;
471
list<string> ConvPro::CP_totype;
472
list<ofstream *> ConvPro::CP_ofs;
474
string ConvPro::CP_infile;
475
string ConvPro::CP_outbasename;
477
string ConvPro::CP_renamesequences;
478
string ConvPro::CP_renamenamescheme;
480
bool ConvPro::CP_splitcontigs2singlefiles=false;
481
bool ConvPro::CP_deletestaronlycolumns=false;
482
bool ConvPro::CP_blinddata=false;
483
bool ConvPro::CP_fillholesinstraingenomes=false;
484
bool ConvPro::CP_makecontigs=false;
485
bool ConvPro::CP_extractreadsinsteadcontigs=false;
486
bool ConvPro::CP_hardtrim=false;
488
string ConvPro::CP_namefile;
489
bool ConvPro::CP_sortbyname=false;
490
bool ConvPro::CP_keepnamesfromfile=true;
492
bool ConvPro::CP_mustdeletetargetfiles=true;
494
bool ConvPro::CP_specialtestcode=false;
495
bool ConvPro::CP_filter2readgroup=false;
497
base_quality_t ConvPro::CP_minqual=0;
498
bool ConvPro::CP_needsquality=true;
499
base_quality_t ConvPro::CP_defaultqual=30;
501
char ConvPro::CP_recalcconopt=' ';
502
char ConvPro::CP_recalcfeatureopt=' ';
504
uint32 ConvPro::CP_minbasecoverage=0;
506
uint32 ConvPro::CP_mincontiglength=0;
507
bool ConvPro::CP_minlengthisclipped=false;
508
uint32 ConvPro::CP_mincontigcoverage=1;
509
uint32 ConvPro::CP_minnumreads=0;
511
list<Contig> ConvPro::CP_clist; // needed for CAF conversion (and GBF)
512
AssemblyInfo ConvPro::CP_assemblyinfo;
515
uint64 ConvPro::CP_readrenamecounter=1;
517
GFFSave ConvPro::CP_gffsave;
518
SAMCollect ConvPro::CP_samcollect;
524
closeOpenStreams(CP_ofs);
527
void ConvPro::usage()
529
cout << "convert_project\t(MIRALIB version " << MIRAVERSION << ")\n"
530
"Author: Bastien Chevreux\t(bach@chevreux.org)\n"
531
"Purpose: convert assembly and sequencing file types.\n\n";
533
"convert_project [-f <fromtype>] [-t <totype> [-t <totype> ...]]\n"
535
"\t[-AcflnNoqrtvxXyz {...}]\n"
536
"\t{infile} {outfile} [<totype> <totype> ...]\n\n";
537
cout << "Options:\n";
539
"\t-f <fromtype>\tload this type of project files, where fromtype is:\n"
540
"\t caf\t\t a complete assembly or single sequences from CAF\n"
541
"\t maf\t\t a complete assembly or single sequences from CAF\n"
542
"\t fasta\t sequences from a FASTA file\n"
543
"\t fastq\t sequences from a FASTQ file\n"
544
"\t gbf\t\t sequences from a GBF file\n"
545
"\t phd\t\t sequences from a PHD file\n"
546
"\t fofnexp\t sequences in EXP files from file of filenames\n";
547
cout << "\t-t <totype>\twrite the sequences/assembly to this type (multiple\n"
548
"\t\t\tmentions of -t are allowed):\n"
549
"\t ace\t\t sequences or complete assembly to ACE\n"
550
"\t caf\t\t sequences or complete assembly to CAF\n"
551
"\t maf\t\t sequences or complete assembly to MAF\n"
552
"\t sam\t\t complete assembly to SAM\n"
553
"\t samnbb\t like above, but leaving out reference (backbones) in mapping assemblies\n"
554
"\t gbf\t\t sequences or consensus to GBF\n"
555
"\t gff3\t\t consensus to GFF3\n"
556
"\t wig\t\t assembly coverage info to wiggle file\n"
557
"\t gcwig\t assembly gc content info to wiggle file\n"
558
"\t fasta\t sequences or consensus to FASTA file (qualities to\n"
560
"\t fastq\t sequences or consensus to FASTQ file\n"
561
"\t exp\t\t sequences or complete assembly to EXP files in\n"
562
"\t\t\t directories. Complete assemblies are suited for gap4\n"
563
"\t\t\t import as directed assembly.\n"
564
"\t\t\t Note: using caf2gap to import into gap4 is recommended though\n"
565
"\t text\t\t complete assembly to text alignment (only when -f is\n"
566
"\t\t\t caf, maf or gbf)\n"
567
"\t html\t\t complete assembly to HTML (only when -f is caf, maf or\n"
569
"\t tcs\t\t complete assembly to tcs\n"
570
"\t hsnp\t\t surrounding of SNP tags (SROc, SAOc, SIOc) to HTML\n"
571
"\t\t\t (only when -f is caf, maf or gbf)\n"
572
"\t asnp\t\t analysis of SNP tags\n"
573
"\t\t\t (only when -f is caf, maf or gbf)\n"
574
"\t cstats\t contig statistics file like from MIRA\n"
575
"\t\t\t (only when source contains contigs)\n"
576
"\t crlist\t contig read list file like from MIRA\n"
577
"\t\t\t (only when source contains contigs)\n"
578
"\t maskedfasta\t reads where sequencing vector is masked out\n"
579
"\t\t\t (with X) to FASTA file (qualities to .qual)\n"
580
"\t scaf\t\t sequences or complete assembly to single sequences CAF\n";
582
cout << "\t-a\t\tAppend to target files instead of rewriting\n";
585
"\t-A <string>\tString with MIRA parameters to be parsed\n"
586
"\t\t\t Useful when setting parameters affecting consensus\n"
587
"\t\t\t calling like -CO:mrpg etc.\n"
588
"\t\t\t E.g.: -a \"454_SETTINGS -CO:mrpg=3\"\n";
591
"\t-b\t\tBlind data\n"
592
"\t\t\t Replaces all bases in reads/contigs with a 'c'\n";
594
cout << "\t-C\t\tPerform hard clip to reads\n"
595
"\t\t\t When reading formats which define clipping points, will\n"
596
"\t\t\t save only the unclipped part into the result file.\n"
597
"\t\t\t Applies only to files/formats which do not contain\n"
601
"\t-d\t\tDelete gap only columns\n"
602
"\t\t\t When output is contigs: delete columns that are\n"
603
"\t\t\t entirely gaps (like after having deleted reads during\n"
604
"\t\t\t editing in gap4 or similar)\n"
605
"\t\t\t When output is reads: delete gaps in reads\n";
608
"\t-F\t\tFilter to read groups\n"
609
"\t\t\t Special use case, do not use yet.\n";
613
"\t-m\t\tMake contigs (only for -t = caf or maf)\n"
614
"\t\t\t Encase single reads as contig singlets into the CAF/MAF\n"
617
"\t-n <filename>\twhen given, selects only reads or contigs given by\n"
618
"\t\t\t name in that file.\n";
620
"\t-i\t\twhen -n is used, inverts the selection\n";
622
"\t-o\t\tfastq quality Offset (only for -f = 'fastq')\n"
623
"\t\t\t Offset of quality values in FASTQ file. Default of 0\n"
624
"\t\t\t tries to automatically recognise.\n";
627
"\t-Q <quality>\t\tSet default quality for bases in file types without quality values\n"
628
"\t\t\t Furthermore, do not stop if expected quality files are missing (e.g. '.fasta')\n";
631
"\t-R <name>\tRename contigs/singlets/reads with given name string\n"
632
"\t\t\t to which a counter is appended.\n"
633
"\t\t\t Known bug: will create duplicate names if input\n"
634
"\t\t\t contains contigs/singlets as well as free reads, i.e.\n"
635
"\t\t\t reads not in contigs nor singlets.\n";
638
"\t-S <name>\t(name)Scheme for renaming reads, important for paired-ends\n"
639
"\t\t\t Only 'solexa' is currently supported.\n";
642
// TODO: re-adapt these switches to >2.9.8
643
// cout << "\t-s <filename>\twhen loading assemblies from files that do not contain\n";
644
// cout << "\t\t\t strain information (e.g. CAF), load the strain\n";
645
// cout << "\t\t\t information from this file. (2 columns, tab delimited:\n";
646
// cout << "\t\t\t readname left, strain name right)\n";
652
"\n\t--------------------------------------------------------\n"
653
"\tThe following switches work only when input (CAF or MAF)\n"
654
"\tcontains contigs. Beware: CAF and MAf can also contain\n"
656
"\t--------------------------------------------------------\n\n";
658
// TODO: check if ok for >2.9.8
660
"\t-M\t\tDo not extract contigs (or their consensus), but the\n"
661
"\t\t\t sequence of the reads they are composed of.\n";
663
"\t-N <filename>\tlike -n, but sorts output according to order given\n"
666
"\t-r [cCqf]\tRecalculate consensus and / or consensus quality values\n"
667
"\t\t\t and / or SNP feature tags.\n"
668
"\t\t\t 'c' recalc cons & cons qualities (with IUPAC)\n"
669
"\t\t\t 'C' recalc cons & cons qualities (forcing non-IUPAC)\n"
670
"\t\t\t 'q' recalc consensus qualities only\n"
671
"\t\t\t 'f' recalc SNP features\n"
672
"\t\t\t Note: only the last of cCq is relevant, f works as a\n"
673
"\t\t\t switch and can be combined with cQq (e.g. \"-r C -r f\")\n"
674
"\t\t\t Note: if the CAF/MAF contains multiple strains,\n"
675
"\t\t\t recalculation of cons & cons qualities is forced, you\n"
676
"\t\t\t can just influence whether IUPACs are used or not.\n";
678
"\t-s\t\tsplit output into multiple files instead of creating a\n"
679
"\t\t\t single file\n";
681
"\t-u\t\t'fillUp strain genomes'\n"
682
"\t\t\t Fill holes in the genome of one strain (N or @)\n"
683
"\t\t\t with sequence from a consensus of other strains\n"
684
"\t\t\t Takes effect only with -r and -t gbf or fasta/q\n"
685
"\t\t\t in FASTA/Q: bases filled up are in lower case\n"
686
"\t\t\t in GBF: bases filled up are in upper case\n";
689
"\t-q <integer>\tDefines minimum quality a consensus base of a strain\n"
690
"\t\t\t must have, consensus bases below this will be 'N'\n"
691
"\t\t\t Default: 0\n"
692
"\t\t\t Only used with -r, and -f is caf/maf and -t is (fasta\n"
695
// "\t-v <integer>\tDefines minimum coverage a consensus base of a strain\n"
696
// "\t\t\t must have, bases with coverage below this will be 'N'\n"
697
// "\t\t\t Default: 0\n"
698
// "\t\t\t Only used with -r, and -t is (fasta\n"
699
// "\t\t\t or gbf)\n";
702
"\t-v\t\tPrint version number and exit\n";
705
"\t-x <integer>\tMinimum contig or read length\n"
706
"\t\t\t When loading, discard all contigs / reads with a\n"
707
"\t\t\t length less than this value. Default: 0 (=switched off)\n"
708
"\t\t\t Note: not applied to reads in contigs!\n";
710
"\t-X <integer>\tSimilar to -x but applies only to reads and\n"
711
"\t\t\t then to the clipped length.\n";
714
"\t-y <integer>\tMinimum average contig coverage\n"
715
"\t\t\t When loading, discard all contigs with an\n"
716
"\t\t\t average coverage less than this value.\n"
717
"\t\t\t Default: 1\n";
720
"\t-z <integer>\tMinimum number of reads in contig\n"
721
"\t\t\t When loading, discard all contigs with a\n"
722
"\t\t\t number of reads less than this value.\n"
723
"\t\t\t Default: 0 (=switched off)\n";
727
"\t-l <integer>\twhen output as text or HTML: number of bases shown in\n"
728
"\t\t\t one alignment line. Default: 60.\n"
729
"\t-c <character>\twhen output as text or HTML: character used to pad\n"
730
"\t\t\t endgaps. Default: ' ' (blank)\n";
732
cout << "\nAliases:\n"
733
"caf2html, exp2fasta, ... etc. Any combination of \"<validfromtype>2<validtotype>\"\ncan be used as program name (also using links) so as that convert_project\nautomatically sets -f and -t accordingly.\n";
735
cout << "\nExamples:\n"
736
"\tconvert_project source.maf dest.sam\n"
737
"\tconvert_project source.caf dest.fasta wig ace\n"
738
"\tconvert_project -x 2000 -y 10 source.caf dest.caf\n"
739
"\tcaf2html -l 100 -c . source.caf dest\n";
743
bool ConvPro::checkForFromType(const string & ftype)
745
static set<string> ftypes={
757
return ftypes.find(ftype)!=ftypes.end();
760
bool ConvPro::checkForToType(const string & ttype)
762
static set<string> ttypes={
790
return ttypes.find(ttype)!=ttypes.end();
793
void ConvPro::checkTypes(const string & fromtype,list<string> & totype)
795
if(!checkForFromType(fromtype)){
798
cerr << "Unknown or illegal file type '" << fromtype << "' defined as <fromtype>\n";
801
if(CP_totype.empty()){
802
CP_totype.push_back(fromtype);
804
for(list<string>::iterator ttI= CP_totype.begin(); ttI!=CP_totype.end(); ++ttI){
805
if(!checkForToType(*ttI)){
808
cerr << "ConvPro::checkTypes: Unknown or illegal file type '" << *ttI << "' defined as <totype>\n";
814
// comfort function: parse fromtype and totype from name of infile, outfile
815
void ConvPro::guessFromAndToType(const string & fnamefrom, string & fromtype, string * fromstem, const string & fnameto, list<string> & totypes, string * tostem)
820
guessFileAndZipType(fnamefrom,dummystem,ft,ziptype);
821
if(fromtype.empty()){
823
if(fromstem != nullptr) fromstem->swap(dummystem);
828
guessFileAndZipType(fnameto,dummystem,ft,ziptype);
829
if(!ft.empty() && checkForToType(ft)){
830
totypes.push_back(ft);
831
if(tostem != nullptr) tostem->swap(dummystem);
837
void ConvPro::filterToReadGroup(ReadPool & rp)
839
FUNCSTART("void ConvPro::filterToReadGroup(ReadPool & rp)");
841
vector<ofstream> ofspassed(ReadGroupLib::getNumReadGroups());
842
vector<ofstream> ofswidow(ReadGroupLib::getNumReadGroups());
843
vector<ofstream> ofsdebris(ReadGroupLib::getNumReadGroups());
845
deque<string> seenrgnames;
846
for(uint32 rglid=1; rglid<ReadGroupLib::getNumReadGroups(); ++rglid){
847
auto rgid=ReadGroupLib::getReadGroupID(rglid);
848
string rgname(rgid.getGroupName());
850
rgname=string("readgroup")+boost::lexical_cast<string>(rglid)+"_passed.fastq";
852
for(auto & sn : seenrgnames){
854
cout << "\nReadgroup '"<<sn<<"' seen more than once in MAF file, this is illegal.\n";
858
seenrgnames.push_back(rgname);
859
string pname(rgname+"_passed.fastq");
860
ofspassed[rglid].open(pname,ios::out);
861
pname=rgname+"_widow.fastq";
862
ofswidow[rglid].open(pname,ios::out);
863
pname=rgname+"_debris.fastq";
864
ofsdebris[rglid].open(pname,ios::out);
867
Read::setCoutType(Read::AS_FASTQ);
869
rp.makeTemplateIDs(true);
871
vector<uint32> sortdummy;
872
rp.sortPoolToMIRAStandard(sortdummy);
874
vector<bool> hasmultisegmentemplates(ReadGroupLib::getNumReadGroups(),false);
875
for(uint32 rpi=0; rpi< rp.size(); ++rpi){
876
if(rp[rpi].getTemplatePartnerID()!=-1){
877
hasmultisegmentemplates[rp[rpi].getReadGroupID().getLibId()]=true;
880
hasmultisegmentemplates[0]=false;
882
for(uint32 rpi=0; rpi< rp.size(); ){
885
for(; lasti<rp.size() && rp[rpi].getTemplateID()==rp[lasti].getTemplateID(); ++lasti){
886
if(CP_mincontiglength>0 && rp[lasti].getLenSeq()<CP_mincontiglength) allok=false;
888
auto numseq=lasti-rpi;
889
if(hasmultisegmentemplates[rp[rpi].getReadGroupID().getLibId()] && numseq<2) allok=false;
890
for(;rpi<lasti; ++rpi){
892
ofspassed[rp[rpi].getReadGroupID().getLibId()] << rp[rpi];
894
ofsdebris[rp[rpi].getReadGroupID().getLibId()] << rp[rpi];
896
if(CP_mincontiglength>0 && rp[rpi].getLenSeq()>=CP_mincontiglength){
897
ofswidow[rp[rpi].getReadGroupID().getLibId()] << rp[rpi];
899
ofsdebris[rp[rpi].getReadGroupID().getLibId()] << rp[rpi];
908
#define CEBUG(bla) //{cout << bla; cout.flush();}
909
void ConvPro::specialTestCode(list<Contig> & clist, ReadPool & rp)
911
FUNCSTART("void ConvPro::specialTestCode(list<Contig> & clist, ReadPool & rp)");
913
for(auto cI=clist.begin(); cI!=clist.end(); ++cI){
914
cI->findBestNonMisassembledRange();
915
cout << "CHecking: " << cI->getContigName() << endl;
916
if(cI->getContigLength()>=1000){
917
auto p=cI->findBestPairConsistencyRange();
918
if(p.first>=0) cout << "Found best range: " << cI->getContigName() << "\t" << p.first << "\t" << p.second << "\t" << p.second-p.first << endl;
923
auto range=cI->findBestNonMisassembledRange();
925
cout<<"Found misassembly by repeat marker. Best range: " << range.first << ".." << range.second << '\t' << range.second-range.first << endl;
926
cI->trimContigToRange(range.first,range.second);
932
coverageinfo_t cinfo;
933
vector<uint64> covvals;
934
cI->collectCoverage(covvals);
935
cI->calcStatsOnContainer(cinfo,covvals);
936
cout << "1st covnum: " << cinfo << endl;
938
// TODO: perhaps make this dependend of ratio mean vs stddev ?
939
cI->calcSecondOrderStatsOnContainer(cinfo,covvals);
940
cout << "2nd covnum: " << cinfo << endl;
942
vector<uint8> peakindicator;
943
cI->findPeaks(cinfo.median,peakindicator);
944
unordered_set<readid_t> readsremoved;
945
cI->reduceReadsAtCoveragePeaks(cinfo.median,peakindicator,readsremoved);
946
cout << "Removed " << readsremoved.size() << endl;
950
// vector<bool> readsmarkedsrm;
952
// cout << "\nMarking tricky 454 / Solexa overcalls in temporary contig.\n";
953
// cout << "Marked " << I->editTrickyOvercalls(true,false,readsmarkedsrm) << " reads.\n";
954
// bool newreptmarked=Assembly::markRepeats(*I, readsmarkedsrm);
955
// cout << "Edited " << I->editTrickyOvercalls(false,false,readsmarkedsrm) << " reads.\n";
956
// I->deleteStarOnlyColumns(0, I->getContigLength()-1);
957
// I->deleteTagsInReads(Read::REA_defaulttag_PSHP.identifier);
959
// I->markFeaturesByConsensus(true,true,true);
960
// I->editSingleDiscrepancyNoHAFTag(readsmarkedsrm);
961
// assout::saveReadTagList(*I,
966
// uint32 numcoledits=0;
967
// uint32 numreadedits=0;
968
// for(uint32 i=0; i<5; ++i){
969
// I->editPBSledgeHammer(readsmarkedsrm,numcoledits,numreadedits);
970
// cout <<"### i " << i << " " << numcoledits << " " << numreadedits << endl;
971
// I->deleteStarOnlyColumns(0,I->getContigLength());
972
// //I->upDownCase(5);
978
// for(auto & cle : clist){
979
// auto & cr=cle.getContigReads();
980
// for(auto & cre : cr){
981
// Read & r=const_cast<Read &>(cre);
982
// if(r.isBackbone()){
983
// cout << "Sorting " << r.getName() << endl;
984
// r.sortTagsForGFF3();
990
n.handleError(THISFUNC);
998
void ConvPro::putReadsInContigsAndSave(vector<MIRAParameters> & Pv, ReadPool & rp)
1000
for(uint32 i=0; i<rp.size(); i++) {
1001
if(!rp[i].hasQuality()){
1002
base_quality_t bq=30;
1003
if(rp[i].getReadGroupID().getDefaultQual()<=100) bq=rp[i].getReadGroupID().getDefaultQual();
1004
rp[i].setQualities(bq);
1007
Contig con(&Pv, rp);
1008
CP_clist.push_back(con);
1009
CP_clist.back().addFirstRead(i,1);
1010
CP_clist.back().setContigName(rp[i].getName()+"_contig");
1011
saveContigList_helper(CP_clist, rp);
1016
void ConvPro::discardShortReads(vector<MIRAParameters> & Pv, ReadPool & rp, uint32 minlength, bool fromclipped)
1018
for(uint32 i=0; i<rp.size(); i++) {
1021
len=rp[i].getLenClippedSeq();
1023
len=rp[i].getLenSeq();
1025
if(len<minlength) rp[i].discard();
1031
//string ConvPro::createFileNameFromBasePostfixContigAndRead(const string & basename, string & postfix, Contig * actcon, Read * actread)
1032
string ConvPro::createFileNameFromBasePostfixContigAndRead(const string & basename, char * postfix, Contig * actcon, Read * actread)
1034
string filename=basename;
1035
if(actcon != nullptr){
1036
if(!filename.empty()) filename+='_';
1037
filename+=actcon->getContigName();
1038
}else if(actread != nullptr){
1039
if(!filename.empty()) filename+='_';
1040
filename+=actread->getName();
1047
bool ConvPro::contig__nameordercomp(const Contig & a, const Contig & b)
1049
return General::getNameOrder(a.getContigName()) < General::getNameOrder(b.getContigName());
1053
void ConvPro::sortContigsByName(list<Contig> & clist)
1055
clist.sort(contig__nameordercomp);
1059
void ConvPro::sortPoolByName(ReadPool & rp, string & filename)
1061
FUNCSTART("void ConvPro::sortPoolByName(ReadPool & rp, string & filename)");
1063
cout << "Sorting pool ..."; cout.flush();
1065
rp.allowNameIndex(true);
1068
fin.open(filename.c_str(), ios::in);
1070
MIRANOTIFY(Notify::FATAL, "File not found: " << filename);
1072
fin.seekg(0, ios::beg);
1074
vector<uint32> newsortorder;
1076
string elemname, dummy;
1078
while(GeneralIO::readKeyValue(fin, elemname,dummy)){
1079
int32 newpos=rp.getReadIndex(elemname);
1080
if(newpos>=0) newsortorder.push_back(static_cast<uint32>(newpos));
1084
rp.sortPoolToGivenOrder(newsortorder);
1086
rp.allowNameIndex(false);
1093
void ConvPro::saveContigList_helper(list<Contig> & clist, ReadPool & rp)
1095
FUNCSTART("void ConvPro::saveContigList_helper(list<Contig> & clist, ReadPool & rp)");
1097
if(CP_specialtestcode) specialTestCode(clist,rp);
1100
// cout << "CLISTSIZE: " << clist.size() << endl;
1101
// list<Contig>::iterator cI=clist.begin();
1102
// for(; cI != clist.end(); cI++){
1103
// cout << "cname: " << cI->getContigName() << endl;
1107
BUGIFTHROW(!CP_ofs.empty() && CP_ofs.size() != CP_totype.size(), "Ooops? !CP_ofs.empty() && CP_ofs.size() != CP_totype.size() ???");
1109
list<ofstream *>::iterator ofsI= CP_ofs.begin();
1110
list<string>::iterator ttI= CP_totype.begin();
1111
for(; ttI!=CP_totype.end(); ++ttI, ++ofsI){
1114
}else if(*ttI=="scaf"){
1115
//clear_conandrp=false;
1116
}else if(*ttI=="hsnp"){
1117
MIRAParameters::generateProjectOutNames(CP_Pv,CP_outbasename);
1119
if(CP_splitcontigs2singlefiles){
1120
fn=createFileNameFromBasePostfixContigAndRead(
1121
CP_Pv[0].getAssemblyParams().as_outfile_stats_snpenvironment,
1125
fn=createFileNameFromBasePostfixContigAndRead(
1126
CP_Pv[0].getAssemblyParams().as_outfile_stats_snpenvironment,
1129
assout::saveSNPSurroundingAsHTML(clist,fn,CP_mustdeletetargetfiles);
1130
}else if(*ttI=="cstats"){
1131
MIRAParameters::generateProjectOutNames(CP_Pv,CP_outbasename);
1133
if(CP_splitcontigs2singlefiles){
1134
fn=createFileNameFromBasePostfixContigAndRead(
1135
CP_Pv[0].getAssemblyParams().as_outfile_stats_contigstats,
1139
fn=createFileNameFromBasePostfixContigAndRead(
1140
CP_Pv[0].getAssemblyParams().as_outfile_stats_contigstats,
1143
assout::saveStatistics(clist,fn,CP_mustdeletetargetfiles);
1144
if(CP_splitcontigs2singlefiles){
1145
fn=createFileNameFromBasePostfixContigAndRead(
1146
CP_Pv[0].getAssemblyParams().as_outfile_stats_contigtags,
1150
fn=createFileNameFromBasePostfixContigAndRead(
1151
CP_Pv[0].getAssemblyParams().as_outfile_stats_contigtags,
1154
assout::saveConsensusTagList(clist,fn,CP_mustdeletetargetfiles);
1155
}else if(*ttI=="crlist"){
1156
MIRAParameters::generateProjectOutNames(CP_Pv,CP_outbasename);
1158
if(CP_splitcontigs2singlefiles){
1159
fn=createFileNameFromBasePostfixContigAndRead(
1160
CP_Pv[0].getAssemblyParams().as_outfile_stats_crlist,
1164
fn=createFileNameFromBasePostfixContigAndRead(
1165
CP_Pv[0].getAssemblyParams().as_outfile_stats_crlist,
1168
assout::saveContigReadList(clist,fn,CP_mustdeletetargetfiles);
1169
}else if(*ttI=="asnp"){
1170
MIRAParameters::generateProjectOutNames(CP_Pv,CP_outbasename);
1173
if(CP_splitcontigs2singlefiles){
1174
fn=createFileNameFromBasePostfixContigAndRead(
1175
CP_Pv[0].getAssemblyParams().as_outfile_stats_snpanalysis,
1178
fa=createFileNameFromBasePostfixContigAndRead(
1179
CP_Pv[0].getAssemblyParams().as_outfile_stats_featureanalysis,
1182
fs=createFileNameFromBasePostfixContigAndRead(
1183
CP_Pv[0].getAssemblyParams().as_outfile_stats_featuresummary,
1186
fc=createFileNameFromBasePostfixContigAndRead(
1187
CP_Pv[0].getAssemblyParams().as_outfile_stats_featuresequences,
1191
fn=createFileNameFromBasePostfixContigAndRead(
1192
CP_Pv[0].getAssemblyParams().as_outfile_stats_snpanalysis,
1194
fa=createFileNameFromBasePostfixContigAndRead(
1195
CP_Pv[0].getAssemblyParams().as_outfile_stats_featureanalysis,
1197
fs=createFileNameFromBasePostfixContigAndRead(
1198
CP_Pv[0].getAssemblyParams().as_outfile_stats_featuresummary,
1200
fc=createFileNameFromBasePostfixContigAndRead(
1201
CP_Pv[0].getAssemblyParams().as_outfile_stats_featuresequences,
1205
assout::saveSNPList(clist,fn,CP_mustdeletetargetfiles);
1206
assout::saveFeatureAnalysis(clist,rp,
1208
CP_mustdeletetargetfiles);
1210
}else if(*ttI=="fcov"){
1211
MIRAParameters::generateProjectOutNames(CP_Pv,CP_outbasename);
1214
if(CP_splitcontigs2singlefiles){
1215
fn=createFileNameFromBasePostfixContigAndRead(
1216
CP_Pv[0].getAssemblyParams().as_outfile_stats_featurecoverage,
1221
fn=createFileNameFromBasePostfixContigAndRead(
1222
CP_Pv[0].getAssemblyParams().as_outfile_stats_featurecoverage,
1227
assout::saveCoverageInfo(clist,fn,CP_mustdeletetargetfiles);
1228
}else if(*ttI=="fasta"){
1229
//CALLGRIND_START_INSTRUMENTATION;
1231
if(CP_splitcontigs2singlefiles){
1232
bn=createFileNameFromBasePostfixContigAndRead(
1237
bn=createFileNameFromBasePostfixContigAndRead(
1241
assout::saveStrainsAsFASTAQ(clist,
1247
CP_mustdeletetargetfiles,
1248
CP_fillholesinstraingenomes);
1249
}else if(*ttI=="fastaqual"){
1250
// fastaqual is "do-nothing" as "fasta" also write fastaqual here!
1251
}else if(*ttI=="fastq"){
1252
//CALLGRIND_START_INSTRUMENTATION;
1254
if(CP_splitcontigs2singlefiles){
1255
bn=createFileNameFromBasePostfixContigAndRead(
1260
bn=createFileNameFromBasePostfixContigAndRead(
1264
assout::saveStrainsAsFASTAQ(clist,
1270
CP_mustdeletetargetfiles,
1271
CP_fillholesinstraingenomes);
1272
} else if(*ttI=="caf"){
1273
Contig::setCoutType(Contig::AS_CAF);
1274
list<Contig>::iterator I=clist.begin();
1275
for(;I!=clist.end(); I++){
1276
if(CP_recalcfeatureopt=='f') I->markFeaturesByConsensus(true,true,true);
1277
if(CP_recalcfeatureopt=='r') {
1279
Assembly::markRepeats(*I,dummy);
1281
bool mustclose=false;
1282
if(!(*ofsI)->is_open()){
1284
if(CP_splitcontigs2singlefiles){
1285
bn=createFileNameFromBasePostfixContigAndRead(
1290
bn=createFileNameFromBasePostfixContigAndRead(
1295
(*ofsI)->open(bn.c_str(), ios::out);
1303
} else if(*ttI=="sam"){
1304
BUGIFTHROW(!(*ofsI)->is_open(),"Ooops, SAM stream not open?");
1305
for(auto & cle : clist){
1306
cle.dumpAsSAM(*(*ofsI),CP_samcollect,true);
1308
} else if(*ttI=="samnbb"){
1309
BUGIFTHROW(!(*ofsI)->is_open(),"Ooops, SAM stream not open?");
1310
for(auto & cle : clist){
1311
cle.dumpAsSAM(*(*ofsI),CP_samcollect,false);
1313
} else if(*ttI=="maf"){
1314
Contig::setCoutType(Contig::AS_MAF);
1315
list<Contig>::iterator I=clist.begin();
1316
for(;I!=clist.end(); I++){
1317
if(CP_recalcfeatureopt=='f') I->markFeaturesByConsensus(true,true,true);
1318
if(CP_recalcfeatureopt=='r') {
1320
Assembly::markRepeats(*I,dummy);
1322
if(!(*ofsI)->is_open()){
1324
if(CP_splitcontigs2singlefiles){
1325
bn=createFileNameFromBasePostfixContigAndRead(
1330
bn=createFileNameFromBasePostfixContigAndRead(
1335
(*ofsI)->open(bn.c_str(), ios::out);
1336
Contig::dumpMAF_Head(*(*ofsI));
1339
if(CP_splitcontigs2singlefiles){
1343
} else if(*ttI=="html"){
1344
//cerr << "HTML output currently deactivated in development version!\n";
1347
if(CP_splitcontigs2singlefiles){
1348
bn=createFileNameFromBasePostfixContigAndRead(
1353
bn=createFileNameFromBasePostfixContigAndRead(
1357
assout::dumpContigListAsHTML(clist, bn, CP_mustdeletetargetfiles, CP_outbasename);
1358
} else if(*ttI=="text"
1361
if(CP_splitcontigs2singlefiles){
1362
bn=createFileNameFromBasePostfixContigAndRead(
1367
bn=createFileNameFromBasePostfixContigAndRead(
1371
assout::saveAsTXT(clist, bn, CP_mustdeletetargetfiles);
1372
} else if(*ttI=="exp"){
1373
// outbasename is in this case a directory name
1375
if(CP_splitcontigs2singlefiles){
1376
bn=createFileNameFromBasePostfixContigAndRead(
1381
bn=createFileNameFromBasePostfixContigAndRead(
1385
assout::saveAsGAP4DA(clist,bn,CP_mustdeletetargetfiles);
1386
} else if(*ttI=="gbf"){
1387
// outbasename is in this case the basename name
1389
if(CP_splitcontigs2singlefiles){
1390
bn=createFileNameFromBasePostfixContigAndRead(
1395
bn=createFileNameFromBasePostfixContigAndRead(
1399
assout::saveStrainsAsGBF(clist,
1403
CP_fillholesinstraingenomes,
1404
CP_mustdeletetargetfiles);
1405
} else if(*ttI=="gff3"){
1406
// outbasename is in this case the basename name
1407
list<Contig>::iterator cI=clist.begin();
1408
for(; cI != clist.end(); ++cI){
1410
if(CP_splitcontigs2singlefiles){
1411
bn=createFileNameFromBasePostfixContigAndRead(
1415
if(CP_gffsave.is_open()){
1418
CP_gffsave.open(bn);
1420
bn=createFileNameFromBasePostfixContigAndRead(
1423
if(!CP_gffsave.is_open()){
1424
CP_gffsave.open(bn);
1427
CP_gffsave.acquireContig(*cI,rp);
1429
} else if(*ttI=="ace"){
1430
// outbasename is in this case the basename name
1432
if(CP_splitcontigs2singlefiles){
1433
bn=createFileNameFromBasePostfixContigAndRead(
1438
bn=createFileNameFromBasePostfixContigAndRead(
1442
assout::saveAsACE(clist,bn,CP_mustdeletetargetfiles);
1443
} else if(*ttI=="tcs"){
1444
// outbasename is in this case the basename name
1446
if(CP_splitcontigs2singlefiles){
1447
bn=createFileNameFromBasePostfixContigAndRead(
1452
bn=createFileNameFromBasePostfixContigAndRead(
1456
assout::saveAsTCS(clist,bn,CP_mustdeletetargetfiles);
1457
} else if(*ttI=="wiggle" || *ttI=="wig"){
1458
// outbasename is in this case the basename name
1460
if(CP_splitcontigs2singlefiles){
1461
bn=createFileNameFromBasePostfixContigAndRead(
1466
bn=createFileNameFromBasePostfixContigAndRead(
1470
assout::saveAsWiggle(clist,bn,CP_mustdeletetargetfiles,false);
1471
} else if(*ttI=="gcwiggle" || *ttI=="gcwig"){
1472
// outbasename is in this case the basename name
1474
if(CP_splitcontigs2singlefiles){
1475
bn=createFileNameFromBasePostfixContigAndRead(
1480
bn=createFileNameFromBasePostfixContigAndRead(
1484
assout::saveAsWiggle(clist,bn,CP_mustdeletetargetfiles,true);
1486
cerr << "\n\n-t " << *ttI << " is not a valid 'to' type when converting contigs (sorry). But maybe something went wrong, please contact the author.\n";
1491
if(!CP_splitcontigs2singlefiles){
1492
CP_mustdeletetargetfiles=false;
1498
void ConvPro::saveContigList(list<Contig> & clist, ReadPool & rp)
1500
FUNCSTART("void ConvPro::saveContigList(list<Contig> & clist, ReadPool & rp)");
1501
bool dosomeoutput=false;
1503
list<Contig>::iterator cI=clist.begin();
1504
for(; cI != clist.end(); cI++){
1507
if(CP_mincontiglength>0
1508
&& cI->getContigLength() < CP_mincontiglength){
1511
Contig::constats_t constats(cI->getStats());
1515
if(CP_mincontigcoverage>0
1516
&& constats.avg_coverage < CP_mincontigcoverage){
1518
} else if(CP_minnumreads>0
1519
&& constats.total_reads < CP_minnumreads){
1525
if(General::hasNames()){
1526
string cname(cI->getContigName());
1527
if(!General::checkNamePresence(cname)){
1530
if(!CP_keepnamesfromfile) conout=!conout;
1534
// delete contigs which should not be output
1536
// would generally be better to have that in some loading callback (and
1537
// would work for contigs well enough), but readpool mechanisms would
1538
// not at the moment, too primitive
1541
if(cI != clist.begin()) --cI;
1544
dosomeoutput|=conout;
1548
for(cI=clist.begin(); cI != clist.end(); cI++){
1549
if(CP_deletestaronlycolumns) {
1550
cI->deleteStarOnlyColumns(0,cI->getContigLength());
1557
Assembly::refreshContigAndReadpoolValuesAfterLoading(rp,clist);
1559
// TODO: ! make autoconfigure: on several strains, this is needed!
1560
// else, let user define via switch
1561
for(cI=clist.begin(); cI != clist.end(); cI++){
1562
if(CP_recalcconopt=='c'
1563
|| CP_recalcconopt=='C'){
1564
cI->trashConsensusCache(false);
1566
if(CP_recalcconopt=='q'){
1567
cI->trashConsensusCache(true);
1570
CP_assemblyinfo.storeContigStats(cI->getStats());
1574
saveContigList_helper(CP_clist, rp);
1577
n.handleError(THISFUNC);
1582
void ConvPro::saveReadPool(ReadPool & rp, list<ofstream *> & ofs)
1584
FUNCSTART("void ConvPro::saveReadPool(ReadPool & rp, list<ofstream *> & ofs)");
1586
rp.adjustIllegalQualities(30);
1588
if(CP_deletestaronlycolumns) {
1589
for(uint32 i=0; i<rp.size(); i++) {
1590
rp.getRead(i).removeGapsFromRead();
1594
for(uint32 i=0; i<rp.size(); i++) {
1595
rp.getRead(i).blindSeqData('c');
1598
if(General::hasNames()){
1599
for(uint32 i=0; i<rp.size(); i++) {
1600
string rname(rp[i].getName());
1601
if((!General::checkNamePresence(rname) && CP_keepnamesfromfile)
1602
|| (General::checkNamePresence(rname) && !CP_keepnamesfromfile)){
1608
for(uint32 i=0; i<rp.size(); ++i) {
1609
rp[i].performHardTrim();
1612
if(CP_mincontiglength>0 && !CP_filter2readgroup){
1613
discardShortReads(CP_Pv,rp,CP_mincontiglength,CP_minlengthisclipped);
1616
if(!CP_renamesequences.empty()){
1617
if(!CP_renamenamescheme.empty()){
1618
for(size_t i=1; i<ReadGroupLib::getNumReadGroups(); ++i){
1619
auto rgid=ReadGroupLib::getReadGroupID(i);
1620
if(!rgid.setSegmentNaming(CP_renamenamescheme)){
1621
BUGIFTHROW(true,"Naming scheme '" << CP_renamenamescheme << "' is unknown.");
1624
for(uint32 i=0; i<rp.size(); ++i){
1625
rp[i].setTemplate("");
1626
rp[i].setTemplateSegment(0);
1627
rp[i].calcTemplateInfo();
1629
rp.makeTemplateIDs();
1632
vector<uint8> readrenamed(rp.size(),0);
1634
for(uint32 rpi=0; rpi<rp.size(); ++rpi){
1635
if(rp[rpi].getTemplatePartnerID()>=0 && readrenamed[rp[rpi].getTemplatePartnerID()]){
1636
tmpname=rp[rp[rpi].getTemplatePartnerID()].getTemplate();
1638
tmpname=CP_renamesequences+"_"+boost::lexical_cast<std::string>(CP_readrenamecounter++);
1640
auto segment=rp[rpi].getTemplateSegment();
1641
if(rp[rpi].getReadNamingScheme()==ReadGroupLib::SCHEME_SOLEXA){
1642
if(rp[rpi].getTemplateSegment()==1){
1644
}else if(rp[rpi].getTemplateSegment()==255){
1646
}else if(rp[rpi].getTemplateSegment()!=0){
1647
BUGIFTHROW(true,"Read " << rp[rpi].getName() << " has illegal segment " << static_cast<uint16>(rp[rpi].getTemplateSegment()) << " ???");
1650
rp[rpi].setTemplate("");
1651
rp[rpi].setTemplateSegment(0);
1652
rp[rpi].setName(tmpname);
1657
if(CP_makecontigs) {
1658
putReadsInContigsAndSave(CP_Pv, rp);
1659
}else if(CP_filter2readgroup){
1660
filterToReadGroup(rp);
1662
list<string>::iterator ttI= CP_totype.begin();
1663
list<ofstream *>::iterator ofsI= ofs.begin();
1664
for(; ttI!=CP_totype.end(); ++ttI, ++ofsI){
1665
bool needofsIclose=false;
1666
// BaCh 16.01.2012: changed != below to == ... was probably a simple programming error (I hope)
1668
if(!CP_splitcontigs2singlefiles){
1669
BUGIFTHROW(!(*(*ofsI)).is_open(), *ttI << " file stream not open???");
1671
(*(*ofsI)).open((rp[0].getName()+'.'+*ttI).c_str(),ios::out);
1675
// double indirection because iterator needs one and it is a list of ofstream pointers ...
1676
rp.dumpAs(*(*ofsI),Read::AS_FASTA,false);
1677
} else if(*ttI=="fastaqual"){
1678
rp.dumpAs(*(*ofsI),Read::AS_FASTAQUAL,false);
1679
} else if(*ttI=="maskedfasta"){
1680
rp.dumpAs(*(*ofsI),Read::AS_MASKEDMASKFASTA,false);
1681
} else if(*ttI=="maskedfastaqual"){
1682
rp.dumpAs(*(*ofsI),Read::AS_MASKEDMASKFASTAQUAL,false);
1683
} else if(*ttI=="fastq"){
1684
rp.dumpAs(*(*ofsI),Read::AS_FASTQ,false);
1685
} else if(*ttI=="caf" || *ttI=="scaf" ){
1686
rp.dumpAs(*(*ofsI),Read::AS_CAF,false);
1687
} else if(*ttI=="maf"){
1688
//rp.dumpAs(*(*ofsI),Read::AS_MAF,false);
1689
rp.saveAsMAF(*(*ofsI),false);
1690
} else if(*ttI=="gff3"){
1691
for(uint32 rpi=0; rpi<rp.size(); ++rpi){
1692
bool mustclose=false;
1693
if(!CP_gffsave.is_open()){
1695
CP_gffsave.open(rp[rpi].getName());
1697
CP_gffsave.acquireRead(rp[rpi]);
1698
if(mustclose) CP_gffsave.close();
1702
cerr << "\n\n-t " << *ttI << " is not a valid type for saving a readpool (internal)!\n";
1715
uint32 ConvPro::openOFSlist(Contig * optcontig, list<ofstream *> & ofs)
1717
FUNCSTART("uint32 ConvPro::openOFSlist(Contig * optcontig, list<ofstream *> & ofs)");
1718
BUGIFTHROW(CP_totype.empty(), " CP_totype.empty() ???");
1723
for(list<string>::iterator ttI= CP_totype.begin(); ttI!=CP_totype.end(); ++ttI){
1725
cout << "opening " << *ttI << endl;
1727
ofstmp=new ofstream;
1728
ofs.push_back(ofstmp);
1730
ofs.back()->open(createFileNameFromBasePostfixContigAndRead(CP_outbasename,".fasta",optcontig).c_str(), ios::out);
1732
} else if(*ttI=="fastaqual"){
1733
ofs.back()->open(createFileNameFromBasePostfixContigAndRead(CP_outbasename,".fasta.qual",optcontig).c_str(), ios::out);
1735
} else if(*ttI=="maskedfasta"){
1736
ofs.back()->open(createFileNameFromBasePostfixContigAndRead(CP_outbasename,".fasta",optcontig).c_str(), ios::out);
1738
} else if(*ttI=="maskedfastaqual"){
1739
ofs.back()->open(createFileNameFromBasePostfixContigAndRead(CP_outbasename,".fasta.qual",optcontig).c_str(), ios::out);
1741
} else if(*ttI=="fastq"){
1742
ofs.back()->open(createFileNameFromBasePostfixContigAndRead(CP_outbasename,".fastq",optcontig).c_str(), ios::out);
1744
} else if(*ttI=="caf" || *ttI=="scaf" ){
1745
ofs.back()->open(createFileNameFromBasePostfixContigAndRead(CP_outbasename,".caf",optcontig).c_str(), ios::out);
1747
} else if(*ttI=="maf"){
1748
ofs.back()->open(createFileNameFromBasePostfixContigAndRead(CP_outbasename,".maf",optcontig).c_str(), ios::out);
1749
Contig::dumpMAF_Head(*(ofs.back()));
1751
} else if(*ttI=="sam" || *ttI=="samnbb"){
1752
ofs.back()->open(createFileNameFromBasePostfixContigAndRead(CP_outbasename,".sam",optcontig).c_str(), ios::out);
1754
} else if(*ttI=="gff3"){
1755
if(CP_gffsave.is_open()) CP_gffsave.close();
1756
CP_gffsave.open(createFileNameFromBasePostfixContigAndRead(CP_outbasename,".gff3",optcontig).c_str());
1762
void ConvPro::closeOFSList(uint32 howmany, list<ofstream *> & ofs)
1764
FUNCSTART("uint32 ConvPro::closeOFSList(uint32 howmany)");
1765
BUGIFTHROW(howmany>ofs.size(),"howmany>ofs.size() ???");
1766
for(uint32 i=0; i<howmany; ++i){
1774
void ConvPro::cafmafload_callback(list<Contig> & clist, ReadPool & rp)
1776
FUNCSTART("void ConvPro::cafmafload_callback(list<Contig> & clist, ReadPool & rp)");
1777
BUGIFTHROW(clist.empty() && rp.size()==0,"clist.empty() && rp.size()==0");
1779
list<Contig>::iterator cI=clist.begin();
1780
for(; cI != clist.end(); ++cI){
1781
if(!CP_renamesequences.empty()){
1782
cI->setContigName("");
1783
cI->setContigNamePrefix(CP_renamesequences);
1787
if(!clist.empty() && !CP_extractreadsinsteadcontigs){
1788
saveContigList(clist,rp);
1790
list<ofstream *> ofs;
1792
if(CP_splitcontigs2singlefiles && !clist.empty()){
1793
mustclose=openOFSlist(&clist.front(),ofs);
1794
saveReadPool(rp,ofs);
1796
saveReadPool(rp,CP_ofs);
1798
closeOFSList(mustclose,ofs);
1801
Read::trashReadNameContainer();
1806
void ConvPro::readpoolload_callback(ReadPool & rp)
1808
saveReadPool(rp,CP_ofs);
1810
Read::trashReadNameContainer();
1815
void ConvPro::closeOpenStreams(list<ofstream *> & ofsl)
1817
list<ofstream *>::iterator ofsI= ofsl.begin();
1818
for(; ofsI!=ofsl.end(); ++ofsI){
1823
int ConvPro::main2(int argc, char ** argv)
1825
//CALLGRIND_STOP_INSTRUMENTATION;
1827
FUNCSTART("int main2(int argc, char ** argv)");
1832
extern char *optarg;
1835
//base_quality_t fqqualoffset=0;
1836
string fqqualoffset("33");
1838
string strainfile="";
1841
char endgap_fillchar=' ';
1845
splitFullPathAndFileName(argv[0],path,convertprog);
1848
boost::to_lower(convertprog);
1850
string::size_type seppos=string::npos;
1851
seppos=convertprog.find_first_of(sep,0);
1852
if(seppos!=string::npos){
1853
CP_fromtype=convertprog.substr(0, seppos);
1854
CP_totype.push_back(convertprog.substr(seppos+1,100));
1860
//"CZihumMsl:r:c:f:t:s:q:n:N:v:x:X:y:z:o:a:"
1861
const char pstring[]=
1863
"A:c:f:l:n:N:o:q:Q:r:R:S:t:x:X:y:z:";
1866
c = getopt(argc, argv, pstring);
1871
cout << MIRAVERSION << endl;
1881
CP_mustdeletetargetfiles=false;
1897
cerr << "ERROR: -c must be a single character\n";
1900
endgap_fillchar=egfc[0];
1908
CP_deletestaronlycolumns=true;
1916
CP_filter2readgroup=true;
1920
CP_keepnamesfromfile=false;
1924
linelen=atoi(optarg);
1928
cerr << "ERROR: -l must be >=0\n";
1934
CP_makecontigs=true;
1935
CP_extractreadsinsteadcontigs=false;
1939
CP_extractreadsinsteadcontigs=true;
1940
CP_makecontigs=false;
1953
fqqualoffset=optarg;
1957
CP_minqual=atoi(optarg);
1958
if(CP_minqual >100) {
1961
cerr << "ERROR: -q must be <= 100\n";
1967
auto tmpv=atoi(optarg);
1968
if(tmpv<0 || tmpv > 100) {
1971
cerr << "ERROR: -Q must be 0 <= value <= 100\n";
1974
CP_defaultqual=tmpv;
1975
CP_needsquality=false;
1980
for(size_t si=0; si<rrr.size(); si++){
1985
CP_recalcconopt=rrr[si];
1990
CP_recalcfeatureopt=rrr[si];
1994
cerr << "ERROR: -r must be one of c, C, q, f, r\n";
2003
CP_renamesequences=optarg;
2007
CP_splitcontigs2singlefiles=true;
2011
CP_renamenamescheme=optarg;
2015
// CP_minbasecoverage=atoi(optarg);
2019
CP_totype.push_back(optarg);
2023
CP_fillholesinstraingenomes=true;
2027
CP_mincontiglength=atoi(optarg);
2031
CP_mincontiglength=atoi(optarg);
2032
CP_minlengthisclipped=true;
2036
CP_mincontigcoverage=atoi(optarg);
2040
CP_minnumreads=atoi(optarg);
2044
CP_specialtestcode=true;
2048
cout << "Oooops? Known but unhandled option " << c << " ?\n";
2054
if(argc-optind < 1) {
2057
cerr << argv[0] << ": " << "Missing infile and out-basename as arguments!\n";
2061
if(argc-optind < 2) {
2064
cerr << argv[0] << ": " << "Missing either infile or out-basename as arguments!\n";
2068
CP_infile=argv[optind++];
2069
CP_outbasename=argv[optind++];
2071
if(CP_infile=="--help"){
2076
guessFromAndToType(CP_infile, CP_fromtype, nullptr,
2077
CP_outbasename, CP_totype, &CP_outbasename);
2079
// // comfort function: parse fromtype and totype from name of infile, outfile
2080
// // TODO: merge with same in mirabait
2081
// if(CP_fromtype.empty()){
2084
// string dummystem;
2085
// guessFileAndZipType(CP_infile,dummystem,ft,ziptype);
2086
// if(!ft.empty()) CP_fromtype.swap(ft);
2091
// guessFileAndZipType(CP_outbasename,CP_outbasename,ft,ziptype);
2092
// if(!ft.empty()) {
2093
// CP_totype.push_back(ft);
2094
//// string sep=".";
2096
//// auto seppos=CP_outbasename.rfind(sep);
2097
//// if(seppos!=string::npos){
2098
//// CP_outbasename.resize(seppos);
2103
// anything additional on the command line is treated as an additional totype
2104
for(; optind < argc; ++optind){
2105
CP_totype.push_back(argv[optind]);
2108
if(CP_fromtype=="gbk" || CP_fromtype=="gbff"){
2114
CP_totype.erase(unique(CP_totype.begin(),CP_totype.end()),CP_totype.end());
2116
checkTypes(CP_fromtype,CP_totype);
2118
MIRAParameters::setupStdMIRAParameters(CP_Pv);
2119
if(!miraparams.empty()){
2120
cout << "Parsing special MIRA parameters: " << miraparams << endl;
2121
MIRAParameters::parse(miraparams.c_str(),CP_Pv);
2125
const_cast<contig_parameters &>(CP_Pv[0].getContigParams()).con_output_text_cpl=linelen;
2126
const_cast<contig_parameters &>(CP_Pv[0].getContigParams()).con_output_html_cpl=linelen;
2127
const_cast<contig_parameters &>(CP_Pv[0].getContigParams()).con_output_text_gapfill=endgap_fillchar;
2128
const_cast<contig_parameters &>(CP_Pv[0].getContigParams()).con_output_html_gapfill=endgap_fillchar;
2130
ReadPool thepool(&CP_Pv);
2131
thepool.setMissingFASTAQualFileResolveMsg("use -Q");
2133
CP_assemblyinfo.setLargeContigSize(CP_mincontiglength);
2134
CP_assemblyinfo.setLargeTotalCov(CP_mincontigcoverage);
2136
if(CP_recalcconopt=='C'){
2137
for(uint32 i=0; i< CP_Pv.size(); i++){
2138
CP_Pv[i].setContigForceNonIUPAC(true,true);
2142
if(!CP_namefile.empty()){
2143
General::makeSelectionStringSet(CP_namefile);
2146
cout << "Loading from " << CP_fromtype << ", saving to:";
2148
for(list<string>::iterator ttI= CP_totype.begin(); ttI!=CP_totype.end(); ++ttI){
2149
ofstmp=new ofstream;
2150
CP_ofs.push_back(ofstmp);
2151
cout << ' ' << *ttI;
2153
if(!CP_splitcontigs2singlefiles){
2154
CP_ofs.back()->open(createFileNameFromBasePostfixContigAndRead(CP_outbasename,".fasta").c_str(), ios::out);
2156
CP_totype.push_back("fastaqual");
2157
} else if(*ttI=="fastaqual"){
2158
if(!CP_splitcontigs2singlefiles){
2159
CP_ofs.back()->open(createFileNameFromBasePostfixContigAndRead(CP_outbasename,".fasta.qual").c_str(), ios::out);
2161
} else if(*ttI=="maskedfasta"){
2162
if(!CP_splitcontigs2singlefiles){
2163
CP_ofs.back()->open(createFileNameFromBasePostfixContigAndRead(CP_outbasename,".fasta").c_str(), ios::out);
2165
CP_totype.push_back("maskedfastaqual");
2166
} else if(*ttI=="maskedfastaqual"){
2167
if(!CP_splitcontigs2singlefiles){
2168
CP_ofs.back()->open(createFileNameFromBasePostfixContigAndRead(CP_outbasename,".fasta.qual").c_str(), ios::out);
2170
} else if(*ttI=="fastq"){
2171
if(!CP_splitcontigs2singlefiles){
2172
CP_ofs.back()->open(createFileNameFromBasePostfixContigAndRead(CP_outbasename,".fastq").c_str(), ios::out);
2174
} else if(*ttI=="caf" || *ttI=="scaf" ){
2175
if(!CP_splitcontigs2singlefiles){
2176
CP_ofs.back()->open(createFileNameFromBasePostfixContigAndRead(CP_outbasename,".caf").c_str(), ios::out);
2178
} else if(*ttI=="maf"){
2179
if(!CP_splitcontigs2singlefiles){
2180
CP_ofs.back()->open(createFileNameFromBasePostfixContigAndRead(CP_outbasename,".maf").c_str(), ios::out);
2181
Contig::dumpMAF_Head(*(CP_ofs.back()));
2183
} else if(*ttI=="sam" || *ttI=="samnbb"){
2184
if(!CP_splitcontigs2singlefiles){
2185
CP_ofs.back()->open(createFileNameFromBasePostfixContigAndRead(CP_outbasename,".sam").c_str(), ios::out);
2187
} else if(*ttI=="gff3"){
2188
if(!CP_splitcontigs2singlefiles){
2189
CP_gffsave.open(createFileNameFromBasePostfixContigAndRead(CP_outbasename,"").c_str());
2191
} else if(*ttI=="hsnp"){
2192
} else if(*ttI=="asnp"){
2193
} else if(*ttI=="fcov"){
2194
} else if(*ttI=="cstats"){
2195
} else if(*ttI=="crlist"){
2196
} else if(*ttI=="html"){
2197
} else if(*ttI=="text"){
2198
} else if(*ttI=="txt"){
2199
} else if(*ttI=="exp"){
2200
} else if(*ttI=="gbf"){
2201
} else if(*ttI=="gff3"){
2202
} else if(*ttI=="ace"){
2203
} else if(*ttI=="tcs"){
2204
} else if(*ttI=="wiggle" || *ttI=="wig"){
2205
} else if(*ttI=="gcwiggle" || *ttI=="gcwig"){
2206
} else if(*ttI=="null"){
2208
BUGIFTHROW(true,"should never arrive here!");
2213
auto cpofsI=CP_ofs.begin();
2214
for(list<string>::iterator ttI= CP_totype.begin(); ttI!=CP_totype.end(); ++ttI, ++cpofsI){
2215
if(*ttI=="sam" || *ttI=="samnbb"){
2216
if(CP_fromtype=="maf"){
2217
cout << "Collecting basic SAM info from MAF file" << endl;
2218
CP_samcollect.processMAF(CP_infile);
2219
CP_samcollect.createSAMHeader();
2220
*(*cpofsI) << CP_samcollect.SAMC_headerstring;
2221
ReadGroupLib::discard();
2224
cerr << "\n\ncan only convert MAF to SAM for the time being, sorry\n";
2231
if(CP_fromtype=="caf" || CP_fromtype=="maf") {
2232
void (*usecallback)(list<Contig> &, ReadPool &) = cafmafload_callback;
2234
usecallback=nullptr;
2237
if(CP_fromtype=="caf") {
2238
CAF tcaf(&thepool, &CP_clist, &CP_Pv);
2239
vector<uint32> dummy;
2240
tcaf.load(CP_infile.c_str(),
2241
ReadGroupLib::SEQTYPE_SANGER,
2247
}else if(CP_fromtype=="maf") {
2248
MAFParse mafp(&thepool, &CP_clist, &CP_Pv);
2249
vector<uint32> dummy;
2250
mafp.load(CP_infile.c_str(),
2251
ReadGroupLib::SEQTYPE_SANGER,
2259
if(!CP_clist.empty() && usecallback==nullptr){
2260
sortContigsByName(CP_clist);
2261
cafmafload_callback(CP_clist,thepool);
2262
}else if(thepool.size()!=0){
2263
if(CP_sortbyname) sortPoolByName(thepool,CP_namefile);
2264
cafmafload_callback(CP_clist,thepool);
2267
if(CP_fromtype=="fasta"
2268
|| CP_fromtype=="fastq"
2269
|| CP_fromtype=="gbf"
2270
|| CP_fromtype=="gff3"){
2271
cout << "Loading data from " << CP_fromtype << " ...";
2274
if(CP_fromtype=="fasta"){
2275
fn2=CP_infile+".qual";
2276
}else if(CP_fromtype=="fastq"){
2280
ReadGroupLib::ReadGroupID rgid=ReadGroupLib::newReadGroup();
2281
rgid.setSequencingType(ReadGroupLib::SEQTYPE_TEXT);
2282
rgid.setDefaultQual(CP_defaultqual);
2284
string loadtype(CP_fromtype);
2285
if(loadtype=="fasta" && !CP_needsquality){
2286
loadtype="fastanoqual";
2289
bool canusecallback=true;
2291
canusecallback=false;
2294
if(!fqqualoffset.empty()){
2295
dummy=atoi(fqqualoffset.c_str());
2298
cout << "\nFor guessing the FASTQ offset, more RAM will be used as the full file needs to load into memory.";
2299
canusecallback=false;
2304
thepool.loadData_rgid(loadtype, CP_infile, fn2, rgid, false, readpoolload_callback);
2306
thepool.loadData_rgid(loadtype, CP_infile, fn2, rgid, false, nullptr);
2308
sortPoolByName(thepool,CP_namefile);
2310
readpoolload_callback(thepool);
2313
cerr << "\n\n-f " << CP_fromtype << " is not a valid from type! (simple pool)\n";
2321
// Need to close by hand as handleError() will perform a hard exit
2322
closeOpenStreams(CP_ofs);
2323
n.handleError("main");
2326
cerr << "Unexpected exception: Flow()\n";
2331
cerr << "Unknown exception caught, aborting the process.\n\nPlease contact: bach@chevreux.org\n\n";
2335
cout << "\nData conversion process finished, no obvious errors encountered.\n";
2349
static vector<MIRAParameters> MB_Pv;
2351
static string MB_fromtype;
2352
static list<string> MB_totype;
2354
static list<ofstream *> MB_ofs;
2357
static string MB_baitfile;
2358
static string MB_infile;
2359
static string MB_outbasename;
2361
static bool MB_deletestaronlycolumns;
2362
static bool MB_inversehit;
2363
static bool MB_fwdandrev;
2364
static uint32 MB_numbaithits;
2366
static bool MB_mustdeletetargetfiles;
2368
static list<Contig> MB_clist; // needed for CAF conversion (and GBF)
2370
static HashStatistics MB_hashstatistics;
2373
static void usage();
2374
static void checkTypes(string & fromtype,list<string> & totype);
2375
static void putReadsInContigsAndSave(vector<MIRAParameters> & Pv, ReadPool & rp);
2376
static void specialTestCode(list<Contig> & clist, ReadPool & rp);
2378
static void saveReadPool(ReadPool & rp);
2379
static void cafmafload_callback(list<Contig> & clist, ReadPool & rp);
2380
static void readpoolload_callback(ReadPool & rp);
2385
int main(int argc, char ** argv);
2389
vector<MIRAParameters> MiraBait::MB_Pv;
2391
string MiraBait::MB_fromtype;
2392
list<string> MiraBait::MB_totype;
2393
list<ofstream *> MiraBait::MB_ofs;
2395
string MiraBait::MB_infile;
2396
string MiraBait::MB_baitfile;
2397
string MiraBait::MB_outbasename;
2399
bool MiraBait::MB_deletestaronlycolumns=false;
2400
bool MiraBait::MB_mustdeletetargetfiles=true;
2401
bool MiraBait::MB_inversehit=false;
2402
bool MiraBait::MB_fwdandrev=true;
2403
uint32 MiraBait::MB_numbaithits=1;
2405
list<Contig> MiraBait::MB_clist; // needed for CAF conversion (and GBF)
2407
HashStatistics MiraBait::MB_hashstatistics;;
2410
MiraBait::~MiraBait()
2412
ConvPro::closeOpenStreams(MB_ofs);
2415
void MiraBait::usage()
2417
cout << "mirabait\t(MIRALIB version " << MIRALIBVERSION << ")\n";
2418
cout << "Author: Bastien Chevreux\t(bach@chevreux.org)\n\n";
2420
cout << "... baiting ...\n";
2422
//cout << "\tconvert_project [-f <fromtype>] [-t <totype>] [-s strainfile] [-q] infile outfile\n\n";
2423
cout << "mirabait [-f <fromtype>] [-t <totype> [-t <totype> ...]] [-iklor] baitfile infile <basename_for_outfile(s)>\n\n";
2424
cout << "Options:\n";
2425
cout << "\t-f <fromtype>\tload this type of project files, where fromtype is:\n"
2426
"\t caf\t\t sequences from CAF\n"
2427
"\t maf\t\t sequences from MAF\n"
2428
"\t phd\t\t sequences from a PHD\n"
2429
"\t gbf\t\t sequences from a GBF\n"
2430
"\t fasta\t sequences from a FASTA\n"
2431
"\t fastq\t sequences from a FASTQ\n";
2432
cout << "\t-t <totype>\twrite the sequences to this type (multiple mentions\n"
2433
"\t\t\tof -t are allowed):\n"
2434
"\t fasta\t sequences to FASTA\n"
2435
"\t fastq\t sequences to FASTQ\n"
2436
"\t caf\t\t sequences to CAF\n"
2437
"\t maf\t\t sequences to MAF\n"
2438
"\t txt\t\t sequence names to text file\n";
2441
"\t-k\t\tk-mer, length of bait in bases (<32, default=31)\n"
2442
"\t-n\t\tMin. number of k-mer baits needed (default=1)\n"
2443
"\t-i\t\tInverse hit: writes only sequences that do not hit bait\n"
2444
"\t-r\t\tNo checking of reverse complement direction\n";
2447
"\t-o\t\tfastq quality Offset (only for -f = 'fastq')\n"
2448
"\t\t\t Offset of quality values in FASTQ file. Default: 33\n"
2449
"\t\t\t A value of 0 tries to automatically recognise.\n";
2453
// cout << "\t-a <string>\tString with MIRA parameters to be parsed\n"
2454
// "\t\t\t Useful when setting parameters affecting consensus\n"
2455
// "\t\t\t calling like -CO:mrpg etc.\n"
2456
// "\t\t\t E.g.: -a \"454_SETTINGS -CO:mrpg=3\"\n";
2458
cout << "\nExamples:\n"
2464
void MiraBait::checkTypes(string & fromtype,list<string> & totype)
2466
if(fromtype.empty()){
2469
if(fromtype=="gbk" || fromtype=="gbff"){
2472
if(!(fromtype=="caf"
2477
|| fromtype=="fasta"
2478
|| fromtype=="fastq"
2482
cerr << "Unknown or illegal file type '" << fromtype << "' defined as <fromtype>\n";
2485
if(MB_totype.empty()){
2488
|| fromtype=="fasta"
2489
|| fromtype=="fastq"
2491
MB_totype.push_back(fromtype);
2493
MB_totype.push_back("fastq");
2496
for(list<string>::iterator ttI= MB_totype.begin(); ttI!=MB_totype.end(); ++ttI){
2497
if(*ttI=="scaf") *ttI="caf";
2506
cerr << "MiraBait::checkTypes(): Unknown or illegal file type '" << *ttI << "' defined as <totype>\n";
2512
// Note: clears the readpool after saving!
2513
void MiraBait::saveReadPool(ReadPool & rp)
2515
// first, bait all reads. Those who bite, discard.
2516
for(uint32 i=0; i<rp.size(); ++i){
2517
if((MB_hashstatistics.checkBaitHit(rp[i]) >= MB_numbaithits) ^ !MB_inversehit){
2522
// then save the read pool
2523
list<string>::iterator ttI= MB_totype.begin();
2524
list<ofstream *>::iterator ofsI= MB_ofs.begin();
2525
for(; ttI!=MB_totype.end(); ++ttI, ++ofsI){
2527
// double indirection because iterator needs one and it is a list of ofstream pointers ...
2528
rp.dumpAs(*(*ofsI),Read::AS_FASTA,false);
2529
} else if(*ttI=="fastaqual"){
2530
rp.dumpAs(*(*ofsI),Read::AS_FASTAQUAL,false);
2531
} else if(*ttI=="fastq"){
2532
rp.dumpAs(*(*ofsI),Read::AS_FASTQ,false);
2533
} else if(*ttI=="caf" || *ttI=="scaf" ){
2534
rp.dumpAs(*(*ofsI),Read::AS_CAF,false);
2535
} else if(*ttI=="maf"){
2536
rp.dumpAs(*(*ofsI),Read::AS_MAF,false);
2537
} else if(*ttI=="txt"){
2538
rp.dumpAs(*(*ofsI),Read::AS_READNAME,false);
2541
cerr << "\n\n-t " << *ttI << " is not a valid type when the source file does not contain a full assembly!\n";
2549
void MiraBait::cafmafload_callback(list<Contig> & clist, ReadPool & rp)
2551
// TODO: check if needed
2552
Assembly::refreshContigAndReadpoolValuesAfterLoading(rp,clist);
2556
Read::trashReadNameContainer();
2561
void MiraBait::readpoolload_callback(ReadPool & rp)
2563
// TODO: check if needed (slows loading by ~30 to 50%
2564
// rp.makeTemplateIDs(false);
2565
// rp.makeStrainIDs(false);
2569
Read::trashReadNameContainer();
2575
int MiraBait::main(int argc, char ** argv)
2577
//CALLGRIND_STOP_INSTRUMENTATION;
2579
FUNCSTART("int main(int argc, char ** argv)");
2584
extern char *optarg;
2588
string fqqualoffset="33";
2592
splitFullPathAndFileName(argv[0],path,convertprog);
2596
uint8 basesperhash=31;
2599
c = getopt(argc, argv, "hdirf:t:o:a:k:n:");
2612
MB_numbaithits=atoi(optarg);
2616
uint64 bla=atoi(optarg);
2622
MB_totype.push_back(optarg);
2626
fqqualoffset=optarg;
2630
MB_deletestaronlycolumns=true;
2650
if(argc-optind < 1) {
2651
cerr << argv[0] << ": " << "Missing baitfile, infile and out-basename as arguments!\n";
2656
if(argc-optind < 3) {
2657
cerr << argv[0] << ": " << "Missing one of baitfile, infile or out-basename as argument!\n";
2662
if(argc-optind > 3) {
2663
cerr << argv[0] << ": " << "Whoops, found more than baitfile, infile and out-basename as arguments left on the command line!\n";
2664
cerr << "Unparsed command line: ";
2665
for(;optind<argc;optind++) cerr <<argv[optind] << " ";
2671
MB_baitfile=argv[optind++];
2672
MB_infile=argv[optind++];
2673
MB_outbasename=argv[optind];
2675
if(MB_baitfile=="--help"){
2680
ConvPro::guessFromAndToType(MB_infile,MB_fromtype,nullptr,MB_outbasename,MB_totype,nullptr);
2682
if(MB_fromtype=="gbk" || MB_fromtype=="gbff"){
2686
checkTypes(MB_fromtype,MB_totype);
2688
MIRAParameters::setupStdMIRAParameters(MB_Pv);
2689
if(!miraparams.empty()){
2690
cout << "Parsing special MIRA parameters: " << miraparams << endl;
2691
MIRAParameters::parse(miraparams.c_str(),MB_Pv);
2696
ReadPool baitrp(&MB_Pv);
2697
cout << "Loading baits ...";
2698
//baitrp.loadDataFromFASTA(MB_baitfile,1, dummy, false,"",false);
2699
ReadGroupLib::ReadGroupID rgid=ReadGroupLib::newReadGroup();
2700
rgid.setSequencingType(ReadGroupLib::SEQTYPE_TEXT);
2701
baitrp.loadData_rgid("fastanoqual",MB_baitfile,MB_baitfile,rgid,false,nullptr);
2703
cout << "baitrp.size(): " << baitrp.size() << endl;
2706
MB_hashstatistics.prepareHashStatistics(".",baitrp,false,false,true,MB_fwdandrev,1,basesperhash,dummyfn);
2709
ReadPool loadrp(&MB_Pv);
2711
cout << "Loading from " << MB_fromtype << ", saving to:";
2713
for(list<string>::iterator ttI= MB_totype.begin(); ttI!=MB_totype.end(); ++ttI){
2714
cout << ' ' << *ttI;
2715
ofstmp=new ofstream;
2716
MB_ofs.push_back(ofstmp);
2718
MB_ofs.back()->open((MB_outbasename + ".fasta").c_str(), ios::out);
2719
} else if(*ttI=="fastq"){
2720
MB_ofs.back()->open((MB_outbasename + ".fastq").c_str(), ios::out);
2721
} else if(*ttI=="caf" || *ttI=="scaf" ){
2722
MB_ofs.back()->open((MB_outbasename + ".caf").c_str(), ios::out);
2723
} else if(*ttI=="maf"){
2724
MB_ofs.back()->open((MB_outbasename + ".maf").c_str(), ios::out);
2725
} else if(*ttI=="txt"){
2726
MB_ofs.back()->open((MB_outbasename + ".txt").c_str(), ios::out);
2729
cerr << "\n\n-t " << *ttI << " is not a valid type\n";
2737
if(MB_fromtype=="caf") {
2738
CAF tcaf(&loadrp, &MB_clist, &MB_Pv);
2739
vector<uint32> dummy;
2740
tcaf.load(MB_infile.c_str(),
2741
ReadGroupLib::SEQTYPE_SANGER,
2745
cafmafload_callback,
2748
}else if(MB_fromtype=="maf") {
2749
MAFParse mafp(&loadrp, &MB_clist, &MB_Pv);
2750
vector<uint32> dummy;
2751
mafp.load(MB_infile.c_str(),
2752
ReadGroupLib::SEQTYPE_SANGER,
2756
cafmafload_callback,
2762
if(MB_fromtype=="fasta"
2763
|| MB_fromtype=="fastq"
2764
|| MB_fromtype=="gbf"
2765
|| MB_fromtype=="gff3"){
2766
cout << "Loading data from " << MB_fromtype << " ...";
2769
if(MB_fromtype=="fasta"){
2770
fn2=MB_infile+".qual";
2771
}else if(MB_fromtype=="fastq"){
2774
string loadtype(MB_fromtype);
2775
if(loadtype=="fasta"){
2776
loadtype="fastanoqual";
2779
ReadGroupLib::ReadGroupID rgid=ReadGroupLib::newReadGroup();
2780
rgid.setSequencingType(ReadGroupLib::SEQTYPE_TEXT);
2781
loadrp.loadData_rgid(loadtype, MB_infile, fn2, rgid, false, readpoolload_callback);
2783
cerr << "\n\n-f " << MB_fromtype << " is not a valid from type!\n";
2791
// Need to close by hand as handleError() will perform a hard exit
2792
ConvPro::closeOpenStreams(MB_ofs);
2793
n.handleError("main");
2796
cerr << "Unexpected exception: Flow()\n";
2799
cerr << "Unknown exception caught, aborting the process.\n\nPlease contact: bach@chevreux.org\n\n";
2803
cout << "\nBaiting process finished.\n";
2811
int main(int argc, char ** argv)
2813
//CALLGRIND_STOP_INSTRUMENTATION;
2815
FUNCSTART("int main(int argc, char ** argv)");
2818
MemORC::setChecking(true);
2823
splitFullPathAndFileName(argv[0],path,convertprog);
2825
boost::to_lower(convertprog);
2828
if(convertprog=="tagsnp"){
2831
}else if(convertprog=="mirabait"){
2836
cp.main2(argc, argv);
2840
n.handleError("main");
2843
cout << "INTERNAL ERROR: Unexpected exception: Flow()\n";
2846
catch(const std::bad_alloc & e){
2847
cout << "Out of memory detected, exception message is: ";
2848
cout << e.what() << endl;
2850
if(sizeof(size_t) == sizeof(int32)){
2851
cout << "\nYou are running a 32 bit executable. Please note that the maximum"
2852
"\ntheoretical memory a 32 bit programm can use (be it in Linux, Windows or"
2853
"\nother) is 4 GiB, in practice less: between 2.7 and 3.3 GiB. This is valid"
2854
"\neven if your machine has hundreds of GiB."
2855
"\nShould your machine have more that 4 GiB, use a 64 bit OS and a 64 bit"
2856
"\nversion of MIRA.";
2859
cout << "\n\nIf you have questions on why this happened, please send the last 1000"
2860
"\nlines of the output log (or better: the complete file) to the author"
2861
"\ntogether with a short summary of your assembly project.\n\n";
2865
catch(const ios_base::failure & e){
2866
cout << "Failure in IO stream detected, exception message is: "
2868
<< "\nWe perhaps ran out of disk space or hit a disk quota?\n";
2871
catch (exception& e)
2873
cout << "A 'standard' exception occured (that's NOT normal):\n" << e.what() << "\n\nIf the cause is not immediatly obvious, please contact: bach@chevreux.org\n\n";
2877
cout << "Unknown exception caught, aborting the process.\n\nPlease contact: bach@chevreux.org\n\n";
2881
Read::dumpStringContainerStats(cout);