~ubuntu-branches/ubuntu/trusty/mira/trusty-proposed

Viewing changes to src/mira/assembly_misc.C

Committer: Package Import Robot
Author(s): Thorsten Alteholz, Thorsten Alteholz, Andreas Tille
Date: 2014-02-02 22:51:35 UTC
mfrom: (7.1.1 sid)
Revision ID: package-import@ubuntu.com-20140202225135-nesemzj59jjgogh0

Tags: 4.0-1

http://bugs.debian.org/735798

[ Thorsten Alteholz ]
* New upstream version
* debian/rules: add boost dir in auto_configure (Closes: #735798)

[ Andreas Tille ]
* cme fix dpkg-control
* debian/patches/{make.patch,spelling.patch}: applied upstream (thus removed)

files added:
doc/docbook/attic/chap_454_part.xml

doc/docbook/attic/chap_iontor_part.xml

doc/docbook/attic/chap_pacbio_part.xml

doc/docbook/attic/chap_sanger_part.xml

doc/docbook/attic/chap_solexa_part.xml

doc/docbook/bookfigures/results_miraconvert.png

doc/docbook/chap_dataprep_part.xml

doc/docbook/chap_denovo_part.xml

doc/docbook/chap_mapping_part.xml

doc/docbook/chap_seqtechdesc_part.xml

doc/docbook/chap_specialparams_part.xml

doc/docbook/replace_all.sh

m4/ax_boost_base.m4

m4/ax_boost_filesystem.m4

m4/ax_boost_iostreams.m4

m4/ax_boost_regex.m4

m4/ax_boost_system.m4

m4/ax_boost_thread.m4

m4/ax_check_zlib.m4

m4/ax_cxx_have_std.m4

m4/ax_cxx_have_stl.m4

m4/ax_cxx_namespaces.m4

m4/ax_lib_expat.m4

m4/libtool.m4

m4/ltoptions.m4

m4/ltsugar.m4

m4/ltversion.m4

m4/lt~obsolete.m4

src/mira/warnings.C

src/mira/warnings.H

src/modules

src/modules/Makefile.am

src/modules/Makefile.in

src/modules/misc.C

src/modules/misc.H

src/modules/mod_bait.C

src/modules/mod_bait.H

src/modules/mod_convert.C

src/modules/mod_convert.H

src/modules/mod_dbgreplay.C

src/modules/mod_memestim.C

src/modules/mod_memestim.H

src/modules/mod_mira.C

src/modules/mod_mira.H

src/modules/mod_tagsnp.C

src/modules/mod_tagsnp.H

src/progs/quirks.C

src/util/fmttext.C

src/util/fmttext.H

files removed:
.pc/boost-minimal.patch

.pc/boost-minimal.patch/src

.pc/boost-minimal.patch/src/progs

.pc/boost-minimal.patch/src/progs/Makefile.in

config

config/m4

config/m4/ax_boost_base.m4

config/m4/ax_boost_filesystem.m4

config/m4/ax_boost_iostreams.m4

config/m4/ax_boost_regex.m4

config/m4/ax_boost_system.m4

config/m4/ax_boost_thread.m4

config/m4/ax_check_zlib.m4

config/m4/ax_cxx_have_std.m4

config/m4/ax_cxx_have_stl.m4

config/m4/ax_cxx_namespaces.m4

config/m4/ax_lib_expat.m4

config/m4/libtool.m4

config/m4/ltoptions.m4

config/m4/ltsugar.m4

config/m4/ltversion.m4

config/m4/lt~obsolete.m4

doc/docbook/bookfigures/ion_dh10bcovB13_12kb.png

doc/docbook/bookfigures/ion_dh10bcovB13_320kb.png

doc/docbook/bookfigures/results_convert_project.png

doc/docbook/chap_454_part.xml

doc/docbook/chap_iontor_part.xml

doc/docbook/chap_pacbio_part.xml

doc/docbook/chap_sanger_part.xml

doc/docbook/chap_solexa_part.xml

src/progs/convert_project.C

src/progs/dbgreplay.C

src/scripts/fastaselect.tcl

src/scripts/fastqselect.tcl

files modified:
.pc/applied-patches

.tarball-version

.version

HELP_WANTED

INSTALL

Makefile.am

Makefile.in

README

README_build.html

THANKS

aclocal.m4

build-aux/git-version-gen

configure

configure.ac

debian/changelog

debian/control

debian/mira-assembler.lintian-overrides

debian/patches/series

debian/rules

debian/watch

distribution/Makefile

distribution/Makefile.am

distribution/Makefile.in

distribution/README

doc/Makefile

doc/Makefile.in

doc/docbook/Makefile

doc/docbook/Makefile.am

doc/docbook/Makefile.in

doc/docbook/attic/README.txt

doc/docbook/book_3rdparty.xml

doc/docbook/book_definitiveguide.xml

doc/docbook/chap_3p_ghbambus_part.xml

doc/docbook/chap_bitsandpieces_part.xml

doc/docbook/chap_est_part.xml

doc/docbook/chap_faq_part.xml

doc/docbook/chap_hard_part.xml

doc/docbook/chap_installation_part.xml

doc/docbook/chap_intro_part.xml

doc/docbook/chap_logfiles_part.xml

doc/docbook/chap_maf_part.xml

doc/docbook/chap_mirautils_part.xml

doc/docbook/chap_preface_part.xml

doc/docbook/chap_reference_part.xml

doc/docbook/chap_results_part.xml

doc/docbook/chap_seqadvice_part.xml

doc/docbook/copyrightfile

doc/docbook/warning_frontofchapter.xml

minidemo/bbdemo2/runme.sh

minidemo/estdemo2/README

src/Makefile.am

src/Makefile.in

src/caf/Makefile.in

src/caf/caf.C

src/debuggersupport/Makefile.in

src/debuggersupport/dbgsupport.C

src/errorhandling/Makefile.in

src/errorhandling/errorhandling.C

src/errorhandling/errorhandling.H

src/io/Makefile.in

src/io/fasta.C

src/io/fasta.H

src/io/fastq-lh.H

src/io/gap4_ft_so_map.xxd

src/io/ncbiinfoxml.C

src/io/ncbiinfoxml.H

src/io/phd.H

src/io/scf.C

src/io/scf.H

src/memorc/Makefile.in

src/memorc/main.C

src/memorc/memorc.C

src/mira/CHANGES.txt

src/mira/Makefile.am

src/mira/Makefile.in

src/mira/TODO

src/mira/adaptorsforclip.454.xxd

src/mira/ads.C

src/mira/align.C

src/mira/align.H

src/mira/assembly.C

src/mira/assembly.H

src/mira/assembly_info.C

src/mira/assembly_info.H

src/mira/assembly_io.C

src/mira/assembly_misc.C

src/mira/assembly_output.C

src/mira/assembly_output.H

src/mira/assembly_reduceskimhits.C

src/mira/assembly_swalign.C

src/mira/contig.C

src/mira/contig.H

src/mira/contig_analysis.C

src/mira/contig_consensus.C

src/mira/contig_edit.C

src/mira/contig_featureinfo.C

src/mira/contig_output.C

src/mira/contig_pairconsistency.C

src/mira/dataprocessing.C

src/mira/dataprocessing.H

src/mira/dynamic.C

src/mira/dynamic.H

src/mira/gbf_parse.H

src/mira/gff_parse.C

src/mira/hashstats.C

src/mira/hashstats.H

src/mira/hdeque.H

src/mira/maf_parse.C

src/mira/manifest.C

src/mira/manifest.H

src/mira/parameters.C

src/mira/parameters.H

src/mira/parameters_flexer.cc

src/mira/parameters_flexer.ll

src/mira/parameters_tokens.h

src/mira/pcrcontainer.C

src/mira/pcrcontainer.H

src/mira/ppathfinder.C

src/mira/ppathfinder.H

src/mira/preventinitfiasco.C

src/mira/read.C

src/mira/read.H

src/mira/readgrouplib.C

src/mira/readgrouplib.H

src/mira/readpool.C

src/mira/readpool.H

src/mira/sam_collect.C

src/mira/skim.C

src/mira/skim.H

src/mira/skim_farc.C

src/mira/skim_lowbph.C

src/mira/structs.H

src/progs/Makefile.am

src/progs/Makefile.in

src/progs/mira.C

src/progs/miramer.C

src/progs/quirks.H

src/scripts/Makefile.am

src/scripts/Makefile.in

src/scripts/fasta2frag.tcl

src/stdinc/Makefile.in

src/stdinc/defines.H

src/stdinc/stlincludes.H

src/stdinc/types.H

src/support/GTAGDB

src/support/Makefile.in

src/util/Makefile.am

src/util/Makefile.in

src/util/dptools.H

src/util/fileanddisk.C

src/util/fileanddisk.H

src/util/misc.C

src/version.H

src/version.stub

Show diffs side-by-side

added added

removed removed

src/mira/assembly_misc.C

#include "boost/unordered_map.hpp"

#include <boost/regex.hpp>

#include <boost/filesystem.hpp>

#include "assembly.H"

#include "dataprocessing.H"

#include "hashstats.H"

#include <ctype.h>

#include <boost/algorithm/string.hpp>

#include <boost/lexical_cast.hpp>

#include "mira/assembly.H"

#include "mira/dataprocessing.H"

#include "mira/hashstats.H"

using namespace std;

122

124

true,

123

125

124

126

basesperhash,

127

hs_params.hs_million_hashes_per_buffer,

128

hs_params.hs_rare_kmer_early_kill,

125

129

filenameforhs

126

130

);

127

131

}

128

132

s3.showHashStatisticsInfo();

133

134

// very rough estimator of coverage for contigs

135

if(basesperhash>=15){

136

AS_hashstat_avghashfreq=s3.getAvgHashFreqRaw();

137

cout << "Estimator of average coverage: " << AS_hashstat_avghashfreq << endl;

138

AS_assemblyinfo.setLargeContigCovForStats(AS_hashstat_avghashfreq/3);

139

}

140

129

141

cout << "Assigning statistics values:\n";

130

142

if(AS_miraparams[0].getAssemblyParams().as_dateoutput) dateStamp(cout);

131

s3.assignReadBaseStatistics_MultiThread(skim_params.sk_numthreads, nastyrepeatratio>0,minkmer);

143

{

144

uint32 mincountkmerforks=2;

145

if(AS_miraparams[0].getPathfinderParams().paf_use_genomic_algorithms){

146

mincountkmerforks=AS_hashstat_avghashfreq/6; // /6 to conservative??

147

if(mincountkmerforks<2) mincountkmerforks=2;

148

}

149

s3.assignReadBaseStatistics_MultiThread(skim_params.sk_numthreads, nastyrepeatratio>0,minkmer,mincountkmerforks);

150

}

132

151

if(AS_miraparams[0].getAssemblyParams().as_dateoutput) dateStamp(cout);

133

152

134

153

if(AS_miraparams[0].getAssemblyParams().as_buntify_reads){

135

154

AS_dataprocessing.buntifyReadsByHashFreq_Pool(AS_readpool,basesperhash);

155

AS_dataprocessing.addKMerForkTags_Pool(AS_readpool,basesperhash);

136

156

}

137

157

}

138

158

229

249

cout << howmanys << " sequences with " << howmanyt << " masked stretches." << endl;

230

250

}

231

251

252

// quick check for estimated coverage in genome data

253

if(!AS_donequickdenovocoveragecheck){

254

// atm only for de-novo, think about doing it for mapping

255

// (though there may be good reasons for high coverage in mappings)

256

if(basesperhash>=17

257

&& warnAtHighCoverages(AS_hashstat_avghashfreq)

258

&& AS_miraparams[0].getNagAndWarnParams().nw_check_coverage==NWSTOP){

259

MIRANOTIFY(Notify::FATAL,"High average coverage detected, see output log above respectively the 'WARNING' files in the info directory for more information. In case you wish to force MIRA to disregard this safety check, consider using '-NW:cac=warn' or '-NW:cac=no'");

260

}

261

AS_donequickdenovocoveragecheck=true;

262

}

263

232

264

if(dorarekmermask) {

233

265

string tmpfname;

234

266

tmpfname=buildFileName(version,"","",

235

267

as_fixparams.as_tmpf_clippings,

236

".txt");

268

".txt","",false);

237

269

AS_dataprocessing.startLogging(tmpfname,true);

238

270

239

271

AS_dataprocessing.performRareKMERMasking_Pool(AS_readpool,"pre-assembly");

240

272

AS_dataprocessing.stopLogging();

241

273

}

242

274

275

if(0){

276

AS_dataprocessing.performKMERRepeatTagging_Pool(AS_readpool,basesperhash);

277

}

278

279

if(0){

280

Read::setCoutType(Read::AS_TEXT);

281

cout << "AFTER2\n";

282

for(uint32 actid=0; actid<AS_readpool.size(); ++actid){

283

Read & r=AS_readpool.getRead(actid);

284

cout << r;

285

}

286

exit(999);

287

}

288

243

289

if(hs_params.hs_masknastyrepeats && hs_params.hs_apply_digitalnormalisation){

244

290

if(!fileExists(stathsfn)){

245

291

//cout << "Renaming / moving static hash statistics " << filenameforhs << " to " << stathsfn << " ... "; cout.flush();

256

302

257

303

258

304

if(AS_logflag_dumphashanalysis){

259

string logfilename=AS_miraparams[0].getDirectoryParams().dir_tmp+"/elog.dp.hashanalysis.lst";

305

string logfilename=buildFileName(version, "", "",

306

"elog.dp.hashanalysis_pass",

307

".lst");

308

//string logfilename=AS_miraparams[0].getDirectoryParams().dir_tmp+"/elog.dp.hashanalysis.lst";

260

309

261

310

cout << "elog hashan: " << logfilename << endl;

262

311

ofstream logfout;

331

380

true,

332

381

AS_miraparams[0].getAssemblyParams().as_clip_pec_mkfr,

333

382

basesperhash,

383

hs_params.hs_million_hashes_per_buffer,

384

hs_params.hs_rare_kmer_early_kill,

334

385

filenameforhs

335

386

);

336

387

s3.showHashStatisticsInfo();

337

388

cout << "Assigning statistics values:\n";

338

389

if(AS_miraparams[0].getAssemblyParams().as_dateoutput) dateStamp(cout);

339

s3.assignReadBaseStatistics_MultiThread(skim_params.sk_numthreads, false, dummy);

390

s3.assignReadBaseStatistics_MultiThread(skim_params.sk_numthreads, false, dummy,0);

340

391

if(AS_miraparams[0].getAssemblyParams().as_dateoutput) dateStamp(cout);

341

392

342

393

350

401

const_cast<assembly_parameters &>(AS_miraparams[0].getAssemblyParams()).as_clip_pec_mkfr=2;

351

402

}

352

403

}

404

405

// quick check for estimated coverage in genome data

406

if(!AS_donequickdenovocoveragecheck){

407

// atm only for de-novo, think about doing it for mapping

408

// (though there may be good reasons for high coverage in mappings)

409

if(basesperhash>=17

410

&& warnAtHighCoverages(avgcov)

411

&& AS_miraparams[0].getNagAndWarnParams().nw_check_coverage==NWSTOP){

412

413

}

414

// Nope, do not set that here. The hash statistics of the main loop have the final word!

415

// AS_donequickdenovocoveragecheck=true;

416

}

417

353

418

}

354

419

355

420

ofstream logfout;

391

456

int32 lpos=r.getLeftClipoff();

392

457

vector<Read::bposhashstat_t>::const_iterator bhsI=r.getBPosHashStats().begin();

393

458

advance(bhsI,lpos);

394

for(; lpos<static_cast<int32>(r.getLenSeq()); ++lpos, ++bhsI) {

459

for(int32 lend=static_cast<int32>(r.getLenClippedSeq()); lpos<lend; ++lpos, ++bhsI) {

395

460

if(AS_miraparams[r.getSequencingType()].getAssemblyParams().as_clip_pec_ffreq >0

396

461

&& (bhsI->fwd.getFrequency() > AS_miraparams[r.getSequencingType()].getAssemblyParams().as_clip_pec_ffreq

397

462

|| bhsI->rev.getFrequency() > AS_miraparams[r.getSequencingType()].getAssemblyParams().as_clip_pec_ffreq)) {

443

508

vector<Read::bposhashstat_t>::const_iterator bhsI=r.getBPosHashStats().begin();

444

509

advance(bhsI,rpos);

445

510

446

for(; rpos >0; --rpos){

511

for(int32 rend=r.getLeftClipoff(); rpos >rend; --rpos){

447

512

--bhsI;

448

513

if(AS_miraparams[r.getSequencingType()].getAssemblyParams().as_clip_pec_bfreq

449

514

&& (bhsI->fwd.getFrequency() > AS_miraparams[r.getSequencingType()].getAssemblyParams().as_clip_pec_bfreq

1469

1534

1470

1535

1471

1536

1472

1473

1474

1475

/////////////////////////////////////////////////////////////////////////

1476

/////////////////////////////////////////////////////////////////////////

1477

/////////////////////////////////////////////////////////////////////////

1478

/////////////////////////////////////////////////////////////////////////

1479

/////////////////////////////////////////////////////////////////////////

1480

/////////////////////////////////////////////////////////////////////////

1481

///////////////////////// Obsolete ///////////////////////

1482

/////////////////////////////////////////////////////////////////////////

1483

/////////////////////////////////////////////////////////////////////////

1484

/////////////////////////////////////////////////////////////////////////

1485

/////////////////////////////////////////////////////////////////////////

1486

/////////////////////////////////////////////////////////////////////////

1487

/////////////////////////////////////////////////////////////////////////

1488

/////////////////////////////////////////////////////////////////////////

1489

1490

1491

/*************************************************************************

1492

1493

* expects reads to have baseflags set (by performHashAnalysis())

1494

1495

* doesn't seem to be a good idea

1496

1497

*************************************************************************/

1537

/*************************************************************************

1538

1539

* sorter to sort Contig::templateguessinfo_t from low to high

1540

* first on readgroup id

1541

* then segment_placement

1542

* then on template size

1543

1544

*************************************************************************/

1545

1546

inline bool Assembly__sortTemplateGuessInfo_(const Contig::templateguessinfo_t & a,

1547

const Contig::templateguessinfo_t & b);

1548

inline bool Assembly__sortTemplateGuessInfo_(const Contig::templateguessinfo_t & a,

1549

const Contig::templateguessinfo_t & b)

1550

{

1551

if(a.rgid==b.rgid){

1552

if(a.splace_seen==b.splace_seen){

1553

return a.tsize_seen<b.tsize_seen;

1554

}

1555

return a.splace_seen<b.splace_seen;

1556

}

1557

return a.rgid<b.rgid;

1558

}

1559

1560

/*************************************************************************

1561

1562

* Destroys AS_templateguesses by sorting it, therefore cleared at end of func

1563

1564

1565

*************************************************************************/

1566

1567

struct atg_tmp_t {

1568

uint32 count;

1569

Contig::templateguessinfo_t tg;

1570

double mean;

1571

double stdev;

1572

double skewness;

1573

int32 deduced_min;

1574

int32 deduced_max;

1575

1576

friend std::ostream & operator<<(std::ostream &ostr, const atg_tmp_t & atgt){

1577

ostr << "rgid: " << atgt.tg.rgid.getLibId()

1578

<< "\tc: " << atgt.count

1579

<< "\tsp: " << static_cast<int16>(atgt.tg.splace_seen)

1580

<< "\tm: " << atgt.mean

1581

<< "\td: " << atgt.stdev

1582

<< "\ts: " << atgt.skewness

1583

<< "\t-: " << atgt.deduced_min

1584

<< "\t+: " << atgt.deduced_max;

1585

return ostr;

1586

}

1587

};

1588

1498

1589

1499

1590

//#define CEBUG(bla) {cout << bla; cout.flush();}

1500

1501

1502

1503

void Assembly::performHashEditing()

1591

void Assembly::analyseTemplateGuesses()

1504

1592

{

1505

FUNCSTART("void Assembly::performHashEditing()");

1506

1507

cout << "Hash analysis for editing:";

1508

1509

skim_parameters const & skim_params= AS_miraparams[0].getSkimParams();

1510

assembly_parameters const & as_fixparams= AS_miraparams[0].getAssemblyParams();

1511

1512

uint32 basesperhash=as_fixparams.as_clip_pec_basesperhash;

1513

if(sizeof(uint64) < 8 && basesperhash > 15) basesperhash=15;

1593

// cout << "tgs: " << AS_templateguesses.size() << endl;

1594

if(AS_templateguesses.empty()) return;

1595

sort(AS_templateguesses.begin(),AS_templateguesses.end(),Assembly__sortTemplateGuessInfo_);

1596

1597

1514

1598

{

1515

1516

Skim s3;

1517

1518

s3.setHashFrequencyRatios(skim_params.hs_freqest_minnormal,

1519

skim_params.hs_freqest_maxnormal,

1520

skim_params.hs_freqest_repeat,

1521

skim_params.hs_freqest_heavyrepeat,

1522

skim_params.hs_freqest_crazyrepeat,

1523

skim_params.hs_nastyrepeatratio);

1524

1525

s3.analyseHashes(AS_miraparams[0].getDirectoryParams().dir_tmp,

1526

AS_readpool,

1527

true,

1528

false,

1529

false,

1530

true,

1531

1532

basesperhash,

1533

1534

false);

1599

uint32 ind=0;

1600

for(auto & tge : AS_templateguesses){

1601

CEBUG(ind

1602

<< "\trgid: " << tge.rgid.getLibId()

1603

<< "\tspl: " << static_cast<int16>(tge.splace_seen)

1604

<< "\tts: " << tge.tsize_seen

1605

<< '\n'; ++ind);

1606

}

1535

1607

}

1536

1608

1537

if(as_fixparams.as_dateoutput) dateStamp(cout);

1538

cout << '\n';

1539

1540

cout << "Looking for proposed edits ... "; cout.flush();

1541

1542

vector<uint8> maxhf;

1543

maxhf.reserve(10000);

1544

1545

uint64 numbaseschanged=0;

1546

uint64 numreadschanged=0;

1547

1548

for(uint32 actid=0; actid<AS_readpool.size(); actid++){

1549

Read & r=AS_readpool.getRead(actid);

1550

1551

if(r.hasValidData()

1552

&& r.hasBaseHashStats()

1553

&& !(r.isBackbone()

1554

|| r.isRail())){

1555

1556

maxhf.clear();

1557

maxhf.resize(r.getLenSeq(),0);

1558

1559

bool wasedited=false;

1560

1561

{

1562

int32 lpos=r.getLeftClipoff();

1563

vector<Read::bposhashstat_t>::const_iterator bhsI=r.getBPosHashStats().begin();

1564

vector<uint8>::iterator mhfI=maxhf.begin();

1565

advance(bhsI,lpos);

1566

advance(mhfI,lpos);

1567

1568

uint32 counter=basesperhash;

1569

for(; lpos<static_cast<int32>(r.getLenSeq()); lpos++, bhsI++, mhfI++) {

1570

*mhfI=(bhsI->fwd.getFrequency())>1;

1571

if(*mhfI) counter=basesperhash;

1572

if(counter) {

1573

*mhfI=4;

1574

--counter;

1575

}

1576

}

1577

1578

lpos=r.getLeftClipoff();

1579

mhfI=maxhf.begin();

1580

advance(mhfI,lpos);

1581

1582

//for(; lpos<static_cast<int32>(r.getLenSeq()); lpos++) {

1583

// cout << (uint16) maxhf[lpos] << ' ';

1584

//}

1585

//cout << endl;

1586

//lpos=r.getLeftClipoff();

1587

//for(; lpos<static_cast<int32>(r.getLenSeq()); lpos++) {

1588

// cout << r.getBaseInSequence(lpos) << ' ';

1589

//}

1590

//cout << endl;

1591

//Read::setCoutType(Read::AS_TEXT);

1592

//cout << r;

1593

1594

lpos=r.getLeftClipoff();

1595

for(; lpos<static_cast<int32>(r.getLenSeq()); lpos++, mhfI++) {

1596

if(*mhfI) break;

1597

}

1598

1599

int32 editstart=-1;

1600

for(; lpos<static_cast<int32>(r.getLenSeq()); lpos++, mhfI++) {

1601

if(editstart<0){

1602

if(*mhfI==0) {

1603

editstart=lpos;

1604

}

1605

}else{

1606

if(*mhfI) {

1607

for(int32 ii=editstart; ii<lpos; ii++) {

1608

//editpositions.push_back(ii);

1609

r.changeBaseInSequence('n',0,ii);

1610

numbaseschanged++;

1611

wasedited=true;

1612

}

1613

editstart=-1;

1614

}

1615

}

1616

}

1617

1609

1610

vector<atg_tmp_t> atgpred;

1611

1612

// collapse AS_templateguesses into predictions grouped by rgid and segment_placement

1613

auto tgI=AS_templateguesses.cbegin();

1614

for(; tgI!=AS_templateguesses.cend() && tgI->rgid.isDefaultNonValidReadGroupID(); ++tgI) {}

1615

if(tgI!=AS_templateguesses.cend()){

1616

while(tgI!=AS_templateguesses.cend()){

1617

auto tgS=tgI;

1618

// search end iterator for that grouping

1619

for(; tgI!=AS_templateguesses.cend() && tgI->rgid==tgS->rgid && tgI->splace_seen == tgS->splace_seen; ++tgI) {};

1620

auto tgE=tgI;

1621

auto numvals=tgI-tgS;

1622

if(numvals>=100){

1623

atgpred.resize(atgpred.size()+1);

1624

atgpred.back().count=numvals;

1625

atgpred.back().tg=*tgS;

1626

1627

uint64 sum=0;

1628

for(tgI=tgS; tgI!=tgE; ++tgI) sum+=tgI->tsize_seen;

1629

double mean=static_cast<double>(sum)/static_cast<double>(numvals);

1630

double sp2sum=0.0d;

1631

for(tgI=tgS; tgI!=tgE; ++tgI) {

1632

auto m1=static_cast<double>(tgI->tsize_seen - mean);

1633

sp2sum+=m1*m1; // for stdev

1634

}

1635

double stdev=sqrt(sp2sum/(numvals-1));

1636

// skewness, see

1637

// http://www.itl.nist.gov/div898/handbook/eda/section3/eda35b.htm

1638

// http://www.tc3.edu/instruct/sbrown/stat/shape.htm

1639

1640

// skewness >0 left skew

1641

// skewness <0 right skew

1642

// abs skewness >= 1 very skewed

1643

// skewness >= .5 mildly skewed

1644

// skewness >= .25 slightly skewed (own definition as we're working with higher number of measurements

1645

1646

// but for large data sets with tens or hundreds of thousands of measurement, even a few outliers (<1%) on one side

1647

// (like happens in misassembled contigs) can induce a skew >1

1648

// Solution: recalc skew on a subset of values which takes only values at 3*stdev

1649

// also recalc new stdev on this subset

1650

1651

sp2sum=0.0d;

1652

double sp3sum=0.0d;

1653

double sxstdev=stdev*3;

1654

uint64 numvals2=0;

1655

for(tgI=tgS; tgI!=tgE; ++tgI) {

1656

auto m1=static_cast<double>(tgI->tsize_seen - mean);

1657

if(abs(m1)<=sxstdev){

1658

sp2sum+=m1*m1; // for stdev

1659

sp3sum+=m1*m1*m1; // for skewness

1660

++numvals2;

1661

}

1662

}

1663

stdev=sqrt(sp2sum/(numvals2-1));

1664

double skewness=0;

1665

if(stdev>0) skewness=sp3sum/((numvals2-1)*stdev*stdev*stdev);

1666

1667

// homebrew

1668

double lefttailfactor=2.0d;

1669

double righttailfactor=2.0d;

1670

if(skewness<=-1.0d){

1671

lefttailfactor=2.4d;

1672

righttailfactor=1.6d;

1673

}else if(skewness<=-0.5d){

1674

lefttailfactor=2.2d;

1675

righttailfactor=1.8d;

1676

}else if(skewness<=-0.25d){

1677

lefttailfactor=2.1d;

1678

righttailfactor=1.9d;

1679

}else if(skewness>=1.0d){

1680

lefttailfactor=1.6d;

1681

righttailfactor=2.4d;

1682

}else if(skewness>=0.5d){

1683

lefttailfactor=1.8d;

1684

righttailfactor=2.2d;

1685

}else if(skewness>=0.25d){

1686

lefttailfactor=1.9dd;

1687

righttailfactor=2.1d;

1688

}

1689

1690

atgpred.back().count=numvals2;

1691

atgpred.back().mean=mean;

1692

atgpred.back().stdev=stdev;

1693

atgpred.back().skewness=skewness;

1694

if(stdev>0){

1695

atgpred.back().deduced_min=(mean-lefttailfactor*stdev > 0) ? (mean-lefttailfactor*stdev) : 0;

1696

atgpred.back().deduced_max=mean+righttailfactor*stdev;

1697

}else{

1698

atgpred.back().deduced_min=mean-mean/10;

1699

atgpred.back().deduced_max=mean+mean/10;

1700

}

1618

1701

}

1619

if(wasedited) numreadschanged++;

1620

1621

//if(editpositions.size()){

1622

// cout << r.getName() << ": wants to edit " << editpositions.size() << " positions\n";

1623

//}

1624

}

1625

}

1626

1627

cout << "changed " << numbaseschanged << " bases to 'n' in " << numreadschanged << " reads.\n";

1628

1629

FUNCEND();

1630

1631

return;

1702

}

1703

}

1704

1705

cout << "ATG PREDICTIONS\n";

1706

cout << std::fixed << std::setprecision(10);

1707

for(auto & ae : atgpred){

1708

cout << ae << endl;

1709

}

1710

1711

// per readgroup, search the prediction with the highest count: that's going to be our final prediction

1712

auto atgpI=atgpred.cbegin();

1713

while(atgpI!=atgpred.cend()){

1714

auto best=*atgpI;

1715

for(; atgpI!=atgpred.cend() && atgpI->tg.rgid==best.tg.rgid; ++atgpI){

1716

if(atgpI->count >best.count) best=*atgpI;

1717

}

1718

cout << "Final prediction: " << best << endl;

1719

if(best.tg.rgid.wantSegmentPlacementEstimate()){

1720

best.tg.rgid.setSegmentPlacementCode(best.tg.splace_seen);

1721

best.tg.rgid.setWantSegmentPlacementEstimate(false);

1722

cout << "Set segment placement code.\n";

1723

AS_guessedtemplatevalues=true;

1724

} else if(best.tg.rgid.wantTemplateInfoEstimate()){

1725

best.tg.rgid.setInsizeFrom(best.deduced_min);

1726

best.tg.rgid.setInsizeTo(best.deduced_max);

1727

best.tg.rgid.setWantTemplateSizeEstimate(false);

1728

cout << "Set template size.\n";

1729

AS_guessedtemplatevalues=true;

1730

}

1731

}

1732

1733

AS_templateguesses.empty();

1632

1734

}

1633

1735

//#define CEBUG(bla)

1634

1635

1636

1637

/*************************************************************************

1638

1639

* BaCh 31.12.2012: Errrrm ... what's that function for? Was probably a

1640

* quick hack for something or I planed some "extra" file for very

1641

* special cases. Should probably be removed.

1642

1643

* REMOVEME!

1644

1645

* ugly and slow, but works and is fast enough

1646

1647

*************************************************************************/

1648

//#define CEBUG(bla) {cout << bla; cout.flush(); }

1649

1650

void Assembly::mergeTemplateInfo(const string & tifile, const string & logname, const string & logprefix)

1651

{

1652

FUNCSTART("void Assembly::mergeTemplateInfo(const string & tifile, const string & logname, const string & logprefix)");

1653

1654

cout << "Merging template info from " << tifile << ":\n";

1655

1656

CEBUG("Building hash table ... "); cout.flush();

1657

1658

typedef boost::unordered_map<std::string, int32> strmap;

1659

strmap rnmap;

1660

strmap::iterator rnI;

1661

1662

for(uint32 i=0; i<AS_readpool.size();i++){

1663

if(!AS_readpool[i].getName().empty()) {

1664

rnmap[AS_readpool[i].getName()]=i;

1665

}

1666

}

1667

CEBUG("done." << endl);

1668

1669

ofstream logfout;

1670

if(!logname.empty()){

1671

logfout.open(logname.c_str(), ios::out|ios::app);

1672

if(!logfout){

1673

MIRANOTIFY(Notify::FATAL, "Could not open log for appending: " << logname << "\nPossible causes: Disk full? Changed permissions? Directory deleted?");

1674

}

1675

}

1676

1677

ifstream tifin;

1678

tifin.open(tifile.c_str(), ios::in|ios::ate);

1679

if(!tifin){

1680

MIRANOTIFY(Notify::FATAL, "File not found: " << tifile);

1681

}

1682

streampos tifsize=tifin.tellg();

1683

tifin.seekg(0, ios::beg);

1684

1685

ProgressIndicator<streamsize> P (0, tifsize,1000);

1686

1687

string token;

1688

1689

while(!tifin.eof()){

1690

tifin >> token;

1691

if(tifin.eof()) break;

1692

if(P.delaytrigger()) P.progress(tifin.tellg());

1693

1694

//tifin >> sd_score >> sd_readname;

1695

1696

if(tifin.eof()) break;

1697

1698

if(token[0]=='+'){

1699

// new lib

1736

1737

1738

bool Assembly::warnAtSmileCoverage()

1739

{

1740

string wstr;

1741

if(AS_assemblyinfo.huntForSmileCoverage(wstr)){

1742

AS_warnings.setWarning("CONCOV_SUSPICIOUS_DISTRIBUTION",0,"Suspicious distribution of contig coverages",wstr);

1743

}

1744

return !wstr.empty();

1745

}

1746

1747

bool Assembly::warnAtHighCoverages(uint32 measuredcov)

1748

{

1749

bool retval=false;

1750

if(AS_miraparams[0].getNagAndWarnParams().nw_check_coverage!=NWNONE

1751

&& AS_miraparams[0].getPathfinderParams().paf_use_genomic_algorithms

1752

&& measuredcov>AS_miraparams[0].getNagAndWarnParams().nw_check_covvalue){

1753

string wstr("You are running a genome ");

1754

if(AS_miraparams[0].getAssemblyParams().as_assemblyjob_mapping){

1755

wstr+="mapping";

1700

1756

}else{

1701

// existing name

1702

bool foundname=false;

1703

rnI=rnmap.find(token);

1704

if(rnI==rnmap.end()) {

1705

CEBUG("Not found: " << token << endl);

1706

continue;

1757

wstr+="de-novo";

1758

}

1759

wstr+=" assembly and the current best estimation for average coverage is "

1760

+boost::lexical_cast<string>(measuredcov)

1761

+"x (note that this number can be +/- 20% off the real value). This is ";

1762

if(measuredcov>=80) wstr+="a pretty high coverage,";

1763

wstr+="higher than the current warning threshold of "

1764

+boost::lexical_cast<string>(AS_miraparams[0].getNagAndWarnParams().nw_check_covvalue)

1765

+"x."

1766

"\n\nYou should try to get the average coverage not higher than, say, 60x to 100x for Illumina data or 40x to 60x for 454 and Ion Torrent data. Hybrid assemblies should target a total coverage of 80x to 100x as upper bound. For that, please downsample your input data."

1767

"\n\nThis warning has two major reasons:"

1768

"\n- for MIRA and other overlap based assemblers, the runtime and memory requirements for ultra-high coverage projects grow exponentially, so reducing the data helps you there"

1769

"\n- for all assemblers, the contiguity of an assembly can also suffer if the coverage is too high, i.e. you get more contigs than you would otherwise. Causes for this effect can be non-random sequencing errors or low frequency sub-populations with SNPs which become strong enough to be mistaken for possible repeats.";

1770

if(measuredcov>=150){

1771

if(measuredcov>=300){

1772

wstr+="\nA coverage of >300x ... no really, are you kidding me? *sigh*";

1707

1773

}

1708

uint32 foundreadid=rnI->second;

1709

if(!AS_readpool[foundreadid].hasValidData()) continue;

1710

1711

Read actread(AS_readpool[foundreadid]);

1712

assembly_parameters const & as_params= AS_miraparams[actread.getSequencingType()].getAssemblyParams();

1774

wstr+="\nWith the coverage you currently have, you *really* should downsample your data. You. Have. Been. Warned!";

1713

1775

}

1714

}

1715

P.finishAtOnce();

1716

1717

tifin.close();

1718

1719

if(!logname.empty()){

1720

logfout.close();

1721

}

1722

1723

cout << "\nDone." << endl;

1724

1725

1726

FUNCEND();

1727

return;

1776

AS_warnings.setWarning("ASCOV_VERY_HIGH",1,"Very high average coverage",wstr);

1777

retval=true;

1778

}

1779

return retval;

1728

1780

}

1729

//#define CEBUG(bla)

1730

1731

1732

1781

1733

1782

1734

1783

1735

1784

/*************************************************************************

1736

1785

1737

* splits a sequence into overlapping subsequences

1738

1739

* AND

1740

1741

* saves pre-computed adsfacts file into log directory for later

1742

* later reading

1743

* number of generated adsfacts is put in AS_numADSFacts_fromshreds

1744

1745

1746

* This saves enormous amount of time, but is not the "real" thing:

1747

* matches between shreds that are non-overlapping from the start on are

1748

* not made

1786

* mean function, with side-effect and definetely to be used only where

1787

* it is currently called from

1788

1789

* Used to guess which inserts in a contigs are wrong after a first

1790

* mapping round and makes sure they get removed

1791

1792

* E.g.:

1793

* bb cagtcatga***ctgcatgca

1794

* r1 cag*catga***ctgcatgca

1795

* r2 cag*catga***ctgcatgca

1796

* r3 cag*catga***ctgcatgca

1797

* ...

1798

* rX cag*catgaTTTctgcatgca

1799

1800

* with rX being some weird read (maybe low frequency variant, or sequencing

1801

* error, or ...)

1802

1803

* Normally, the two stage mapping would calculate an intermediate new backbone

1804

* to be

1805

1806

* bbi cag*catga***ctgcatgca

1807

1808

* but that is obviously not really the best guess and currently leads to

1809

* misalignments with reads ending in this area. E.g.:

1810

1811

* bbi cag*catga***ctgcatgca

1812

* rY actgcatgca

1813

1814

* instead of

1815

1816

* bbi cag*catga***ctgcatgca

1817

* rY a***ctgcatgca

1818

1819

* This function will fake the contig CON_counts structure to enable

1820

* Contig::deleteStarOnlyColumns() to remove the probably wrong inserts, so

1821

* that the new intermediate contig looks like this:

1822

1823

* bbi cag*catgactgcatgca

1824

1825

* Side-effects:

1826

* - CON_counts of contig is probably not reflecting truth anymore

1827

* - reads of the contig also get edited, but this routine

1828

* is currently used just before all the reads are discarded anyway.

1749

1829

1750

1830

*************************************************************************/

1751

1831

1752

1753

void Assembly::shredReadsIntoReadPool(ReadPool & sourcepool, uint32 shredlen, uint32 shredoffsetinc, uint8 shredreadtype, const string & shredstrain)

1832

void Assembly::priv_removePotentiallyWrongBaseInserts(Contig & con)

1754

1833

{

1755

FUNCSTART("void Assembly::shredReadsIntoReadPool(ReadPool & sourcepool, uint32 shredlen, uint32 shredoffsetinc, uint8 shredreadtype, const string & shredstrain)");

1756

1757

AS_numADSFacts_fromshreds=0;

1758

string adsfshredsfilename=AS_miraparams[0].getDirectoryParams().dir_tmp+"/shred.adsfacts";

1759

ofstream adsfout;

1760

adsfout.open((adsfshredsfilename+".adsfacts").c_str(), ios::out|ios::trunc);

1761

1762

deque<uint32> overlapfifo;

1763

1764

string shredseq;

1765

shredseq.reserve(shredlen);

1766

vector<base_quality_t> shredqual;

1767

shredqual.reserve(shredlen+10);

1768

string shredname;

1769

1770

for(uint32 actsourceid=0; actsourceid < sourcepool.size(); actsourceid++){

1771

Read & sourceread = sourcepool.getRead(actsourceid);

1772

if(!sourceread.hasValidData()) continue;

1773

if(sourceread.getLenSeq() < shredlen) continue;

1774

1775

uint32 actoffset=0;

1776

uint32 shredcounter=0;

1777

for(bool doloop=true; doloop; actoffset+=shredoffsetinc){

1778

uint32 fromi=actoffset;

1779

uint32 toi=actoffset+shredlen;

1780

if(toi>=sourceread.getLenSeq()) {

1781

toi=sourceread.getLenSeq();

1782

doloop=false;

1783

}

1784

shredseq.clear();

1785

shredqual.clear();

1786

for(; fromi<toi; fromi++){

1787

shredseq+=sourceread.getBaseInSequence(fromi);

1788

shredqual.push_back(sourceread.getQualityInSequence(fromi));

1789

}

1790

1791

// if wished: lower quals to max as_cap454consensusqual

1792

if(AS_miraparams[0].getAssemblyParams().as_cap454consensusqual>0){

1793

vector<base_quality_t>::iterator qI=shredqual.begin();

1794

base_quality_t maxqual=AS_miraparams[0].getAssemblyParams().as_cap454consensusqual;

1795

for(;qI != shredqual.end(); qI++){

1796

if(*qI>maxqual) *qI=maxqual;

1797

}

1798

}

1799

1800

ostringstream ostr;

1801

ostr << "shred_" << shredcounter << "_" << sourceread.getName();

1802

shredname=ostr.str();

1803

1804

AS_readpool.addNewEmptyRead();

1805

uint32 newreadid=AS_readpool.size()-1;

1806

Read & newread=AS_readpool.getRead(newreadid);

1807

newread.setName(shredname);

1808

newread.setSequenceFromString(shredseq);

1809

newread.setQualities(shredqual);

1810

newread.setStrain(shredstrain.c_str());

1811

newread.setSequencingType(shredreadtype);

1812

1813

//cout << "\n----------------------------------------\nAdded " << shredname << '\n';

1814

// now insert the weights

1815

{

1816

overlapfifo.push_front(newreadid);

1817

deque<uint32>::iterator OFI=overlapfifo.begin();

1818

OFI++;

1819

int32 overlaplen=shredlen-shredoffsetinc;

1820

int32 totalshredoffset=shredoffsetinc;

1821

uint32 numelements=1;

1822

while(OFI != overlapfifo.end()) {

1823

if(overlaplen<=0) break;

1824

1825

AlignedDualSeqFacts tmpadsf;

1826

tmpadsf.publicinit(

1827

*OFI,

1828

newreadid,

1829

static_cast<uint16>(totalshredoffset),

1830

static_cast<uint16>(totalshredoffset

1831

-(AS_readpool.getRead(*OFI).getLenSeq()

1832

-AS_readpool.getRead(newreadid).getLenSeq())),

1833

1834

static_cast<uint16>((AS_readpool.getRead(*OFI).getLenSeq()+

1835

AS_readpool.getRead(newreadid).getLenSeq()-overlaplen)),

1836

1837

1838

100);

1839

1840

// output of the ADSfacts to file

1841

// TODO: real ouput

1842

// first weight and direction

1843

// TODO: reduce weight to favorise real reads in assembly???

1844

adsfout << overlaplen*10000 << "\t1\t";

1845

tmpadsf.serialiseOut(adsfout);

1846

adsfout << '\n';

1847

1848

AS_numADSFacts_fromshreds++;

1849

1850

OFI++;

1851

overlaplen-=shredoffsetinc;

1852

totalshredoffset+=shredoffsetinc;

1853

numelements++;

1854

}

1855

if(overlapfifo.size()>numelements) overlapfifo.resize(numelements);

1856

}

1857

shredcounter++;

1858

}

1859

cout << "Shredded " << sourceread.getName() << " into " << shredcounter << " pieces.\n";

1860

}

1861

1862

adsfout.close();

1863

1864

FUNCEND();

1834

FUNCSTART("void Assembly::priv_removePotentiallyWrongBaseInserts(Contig & con)");

1835

1836

// try to look for strains which are only not in backbone

1837

vector<string> straincons(ReadGroupLib::getNumOfStrains());

1838

vector<base_quality_t> dummyqual;

1839

1840

for(uint32 sid=0; sid<ReadGroupLib::getNumOfStrains(); ++sid){

1841

bool takesid=false;

1842

for(uint32 rgid=1; rgid<ReadGroupLib::getNumReadGroups();++rgid){

1843

auto rg=ReadGroupLib::getReadGroupID(rgid);

1844

if(rg.getStrainID()==sid

1845

&& !rg.isBackbone()

1846

&& !rg.isRail()){

1847

takesid=true;

1848

break;

1849

}

1850

}

1851

if(takesid){

1852

con.newConsensusGet(straincons[sid],dummyqual,sid);

1853

}

1854

}

1855

1856

// look if we have *any* consensus made (meaning we'd have same strain backbone AND mapped reads).

1857

// If not, recalc consensi for all strains

1858

{

1859

bool needrecalc=true;

1860

for(auto & s : straincons){

1861

if(!s.empty()) {

1862

needrecalc=false;

1863

break;

1864

}

1865

}

1866

if(needrecalc){

1867

for(uint32 sid=0; sid<ReadGroupLib::getNumOfStrains(); ++sid){

1868

con.newConsensusGet(straincons[sid],dummyqual,sid);

1869

}

1870

}

1871

}

1872

1873

{

1874

bool stillnocons=true;

1875

for(auto & s : straincons){

1876

if(!s.empty()) {

1877

stillnocons=false;

1878

break;

1879

}

1880

}

1881

BUGIFTHROW(stillnocons,"Ummmm ... no cons built???");

1882

}

1883

1884

// fill up empty consensi

1885

for(auto & s : straincons){

1886

if(s.empty()) s.resize(con.getContigLength(),'*');

1887

}

1888

1889

Contig::cccontainer_t & cc = const_cast<Contig::cccontainer_t &>(con.getConsensusCounts());

1890

1891

auto ccI=cc.begin();

1892

for(uint32 actcontigpos=0; actcontigpos<straincons[0].size(); ++actcontigpos, ++ccI){

1893

bool maydelete=true;

1894

for(auto & s : straincons){

1895

if(s[actcontigpos]!='*'){

1896

maydelete=false;

1897

break;

1898

}

1899

}

1900

if(maydelete){

1901

ccI->total_cov=65535;

1902

ccI->star=65535;

1903

}

1904

}

1905

con.deleteStarOnlyColumns(0,con.getContigLength(),false,65535);

1865

1906

}

1866

Older »