2
* Written by Bastien Chevreux (BaCh)
4
* Copyright (C) 1997-2000 by the German Cancer Research Center (Deutsches
5
* Krebsforschungszentrum, DKFZ Heidelberg) and Bastien Chevreux
6
* Copyright (C) 2000 and later by Bastien Chevreux
10
* This program is free software; you can redistribute it and/or
11
* modify it under the terms of the GNU General Public License
12
* as published by the Free Software Foundation; either version 2
13
* of the License, or (at your option) any later version.
15
* This program is distributed in the hope that it will be useful,
16
* but WITHOUT ANY WARRANTY; without even the implied warranty of
17
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18
* GNU General Public License for more details.
20
* You should have received a copy of the GNU General Public License
21
* along with this program; if not, write to the
22
* Free Software Foundation, Inc.,
23
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
32
#include "modules/mod_memestim.H"
33
#include "modules/misc.H"
36
#include "mira/assembly.H"
43
void mme_askChar(const string & question, const string & possibilities, char & answer, const char defchar)
47
cout << question << ' ';
48
if(!possibilities.empty()){
50
std::copy(possibilities.begin(),
52
ostream_iterator<char> (cout, "/"));
56
cout << "[" << defchar << "] ";
61
// empty input, try to get defchar from possibilities if it exists
62
if(input.empty() && defchar!=0) input=defchar;
64
for(uint32 i=0; i<possibilities.size(); i++){
65
if(input[0] == possibilities[i]) {
72
cout << answer << endl;
75
void mme_askDoubleNP(const string & question, double & answer, const string & defd)
79
cout << question << ' ';
81
cout << "[" << defd << "] ";
86
// empty input, try to get def from possibilities if it exists
87
if(input.empty() && !defd.empty()) {
91
answer=strtod(input.c_str(),&pend);
94
// try to parse kilo, mega, giga
95
while(*pend != 0 && isspace(*pend)) pend++;
96
switch(toupper(*pend)){
111
cout << "Please only use k, m, g as modifiers.\n";
118
void mme_askDouble(const string & question, double & answer, const string & defd)
120
mme_askDoubleNP(question, answer, defd);
121
cout << answer << endl;
124
void mme_askInt(const string & question, int64 & answer, const string & defint)
127
mme_askDoubleNP(question, tmp, defint);
128
answer=static_cast<int64>(tmp);
129
cout << answer << endl;
133
void miraMemEstimate(int argc, char ** argv)
140
c = getopt(argc, argv, "v");
145
cout << MIRAVERSION << endl;
152
cout << "This is MIRA " MIRAVERSION ".\n\n";
154
cout << "Please cite: Chevreux, B., Wetter, T. and Suhai, S. (1999), Genome Sequence\nAssembly Using Trace Signals and Additional Sequence Information.\nComputer Science and Biology: Proceedings of the German Conference on\nBioinformatics (GCB) 99, pp. 45-56.\n\n";
157
cout << "\n\nmiraMEM helps you to estimate the memory needed to assemble a project.\n"
158
"Please answer the questions below.\n\n"
159
"Defaults are give in square brackets and chosen if you just press return.\n"
160
"Hint: you can add k/m/g modifiers to your numbers to say kilo, mega or giga.\n\n";
168
int64 num454gs20reads=0;
169
int64 num454flxreads=0;
170
int64 num454titaniumreads=0;
175
int64 largestcontigexpected=0;
178
int64 totalexpectedbases=0;
180
int64 readsinlargestcontig=0;
181
int64 readbasesinlargestcontig=0;
183
mme_askChar("Is it a genome or transcript (EST/tag/etc.) project?",
188
mme_askInt("Size of genome?",
192
cout << "A sequence size of less than 100 bases is pretty improbable.\n"
193
<< "Did you forget a modifier (k, m or g) to the number you gave?\n";
196
largestcontigexpected=seqsize;
197
if(largestcontigexpected>30*1000*1000){
198
cout << "Looks like a larger eukaryote, guessing largest chromosome size: 30m\nChange if needed!\n";
199
largestcontigexpected=30*1000*1000;
205
ostr << largestcontigexpected;
207
mme_askInt("Size of largest chromosome?",
208
largestcontigexpected,
212
mme_askChar("Is it a denovo or mapping assembly?",
219
mme_askInt("Number of Sanger reads?",
222
mme_askChar("Are there 454 reads?",
227
mme_askInt("Number of 454 GS20 reads?",
230
mme_askInt("Number of 454 FLX reads?",
233
mme_askInt("Number of 454 Titanium reads?",
237
mme_askChar("Are there PacBio reads?",
242
mme_askInt("Number of PacBio reads?",
245
mme_askInt("Average PacBio length?",
249
mme_askChar("Are there Solexa reads?",
254
mme_askInt("Number of Solexa reads?",
257
mme_askInt("Average Solexe length?",
262
totalexpectedbases=numsanreads*1000;
263
totalexpectedbases+=num454gs20reads*120;
264
totalexpectedbases+=num454flxreads*260;
265
totalexpectedbases+=num454titaniumreads*460;
266
totalexpectedbases+=numsxareads*avgsxalen;
267
totalexpectedbases+=numpbsreads*avgpbslen;
269
totalreads=numsanreads;
270
totalreads+=num454gs20reads;
271
totalreads+=num454flxreads;
272
totalreads+=num454titaniumreads;
273
totalreads+=numsxareads;
274
totalreads+=numpbsreads;
277
if(denovomapping=='d'){
278
readsinlargestcontig=totalreads/2;
279
readbasesinlargestcontig=totalexpectedbases/2;
281
largestcontigexpected=seqsize;
282
readsinlargestcontig=totalreads;
283
readbasesinlargestcontig=totalexpectedbases;
285
// if solexa is mapped, there are less reads due to
286
// coverage equivalent mapping and virtual long reads
287
// be conservative, reduce only by 50%
289
readsinlargestcontig-=numsxareads/2;
294
largestcontigexpected=seqsize;
295
readsinlargestcontig=50000;
296
readbasesinlargestcontig=readsinlargestcontig*1000; //10k reads times sanger length
299
// account for gaps with 454 reads
300
if(num454flxreads>0 || num454gs20reads>0){
301
largestcontigexpected+=largestcontigexpected/10;
302
readbasesinlargestcontig+=readbasesinlargestcontig/10;
305
//cout << "totalreads: " << totalreads
306
// << "\nreadsinlargestcontig: " << readsinlargestcontig
307
// << "\ntotalexpectedbases: " << totalexpectedbases
308
// << "\nreadbasesinlargestcontig: " << readbasesinlargestcontig
311
int64 livereads=totalreads+readsinlargestcontig;
312
int64 livebases=totalexpectedbases+readbasesinlargestcontig;
314
//cout << "livereads: " << livereads
315
// << "\nlivebases: " << livebases << endl;
317
double avgcov=static_cast<double>(totalexpectedbases/seqsize);
318
avgcov-=avgcov/8; // in general we have 12% loss of usable data
320
int64 numskimhits=static_cast<int64>(avgcov*850000); // estimate skim hits, very rough
324
// what do the reads need?
326
livereads*sizeof(Read) // class size
327
+livereads*200 // additional strings etc.
328
+livereads*4*sizeof(tag_t) // on average 4 tags per read
329
+livebases*8; // sequences, qualities, adjustments, base flags
331
// new: solexa reads don't have adjustments
332
// yeah, but estimate is already small enough, keep it
333
//memneeded-=(numsxareads*avgsxalen) * 2;
336
// what does a contig need?
337
// (note: the needs for the reads themselves are already
338
// accounted for in the section above)
340
//readsinlargestcontig*sizeof(Contig::contigread_t)
341
//+readsinlargestcontig*sizeof(Contig::out_order_t)
343
readsinlargestcontig*40 // 40 == rough guesstimate for PlacedContigReads
344
+totalreads*9 /* templates, mapping
346
+largestcontigexpected*sizeof(Contig::consensus_counts_t)
347
+largestcontigexpected*10; // adjustments and some reserve
349
int64 memforlargetables=0;
350
// some more overhead by the assembly class
351
memforlargetables+= totalreads*20;
353
// get the skim edges accounted
354
int64 skimhitsmem=numskimhits*2*sizeof(skimedges_t);
355
// since 2.9.40 there's the possibility to cap that memory
357
if(skimhitsmem>1024L*1024*1024){
358
skimhitsmem=2LL*1024*1024*1024;
359
if(numsxareads>0) skimhitsmem*=2;
362
// mem needed for temporary skim need
363
int64 tmpskim=500*1000*1000;
365
memforlargetables+=max(skimhitsmem,tmpskim);
367
// possible vector leftover clip
370
// AS_readhitmiss & AS_readhmcovered
371
memforpvlc=totalexpectedbases*8;
372
// overhead of the structures above
373
memforpvlc+=sizeof(vector<uint32>)*totalreads*2;
376
memforpvlc+=totalreads*4;
379
// ok, 1MB of additional small things
380
int64 memneededfordata=memneeded+(1024*1024);
382
// experience shows that not all has been accounted for
383
// and internal mem caching of memory allocators add another
384
// layer of RAM needs
386
// add 40% to estimates
387
// but not if whe have mapping with Solexas
388
if(denovomapping!='m' && numsxareads==0){
389
memneededfordata+=memneededfordata/100*40;
390
memforlargetables+=memforlargetables/100*40;
393
cout.setf(ios::fixed, ios::floatfield);
394
//cout.setf(ios::showpoint);
397
cout << "\n\n************************* Estimates *************************\n\n";
398
// last, if it's an EST assembly, there is no seqsize
401
cout << "EST assembly, cannot give coverage estimate. Also, estimates"
402
"\nmay be way off for pathological cases.\n";
405
cout << "The contigs will have an average coverage of ~ " << avgcov
407
"\nEstimates may be way off for pathological cases.\n";
410
cout << "\nRAM estimates:"
411
"\n" << setw(40) << "reads+contigs (unavoidable): ";
412
byteToHumanReadableSize(memneededfordata,cout);
413
cout << "\n" << setw(40) << "large tables (tunable): ";
414
byteToHumanReadableSize(memforlargetables,cout);
415
cout << "\n" << setw(40) << "" << "---------"
416
"\n" << setw(40) << "total (peak): ";
417
byteToHumanReadableSize(memforlargetables+memneededfordata,
419
cout << "\n\n" << setw(40) << "add if using -CL:pvlc=yes : ";
420
byteToHumanReadableSize(memforpvlc,cout);
421
if(denovomapping=='m' && numsxareads>0){
422
int64 notusingmerge=memneededfordata/100*40;
423
cout << "\n" << setw(40) << "add if setting -CO:msr=no : ";
424
byteToHumanReadableSize(notusingmerge,cout);
427
"Note that some algorithms might try to grab more memory if"
428
"\nthe need arises and the system has enough RAM. The options"
429
"\nfor automatic memory management control this:"
430
"\n -AS:amm, -AS:kpmf, -AS:mps"
431
"\nFurther switches that might reduce RAM (at cost of run time"
433
"\n -SK:mhim, -SK:mchr (both runtime); -SK:mhpr (accuracy)\n"
434
"*************************************************************\n";