1475
/////////////////////////////////////////////////////////////////////////
1476
/////////////////////////////////////////////////////////////////////////
1477
/////////////////////////////////////////////////////////////////////////
1478
/////////////////////////////////////////////////////////////////////////
1479
/////////////////////////////////////////////////////////////////////////
1480
/////////////////////////////////////////////////////////////////////////
1481
///////////////////////// Obsolete ///////////////////////
1482
/////////////////////////////////////////////////////////////////////////
1483
/////////////////////////////////////////////////////////////////////////
1484
/////////////////////////////////////////////////////////////////////////
1485
/////////////////////////////////////////////////////////////////////////
1486
/////////////////////////////////////////////////////////////////////////
1487
/////////////////////////////////////////////////////////////////////////
1488
/////////////////////////////////////////////////////////////////////////
1491
/*************************************************************************
1493
* expects reads to have baseflags set (by performHashAnalysis())
1495
* doesn't seem to be a good idea
1497
*************************************************************************/
1537
/*************************************************************************
1539
* sorter to sort Contig::templateguessinfo_t from low to high
1540
* first on readgroup id
1541
* then segment_placement
1542
* then on template size
1544
*************************************************************************/
1546
inline bool Assembly__sortTemplateGuessInfo_(const Contig::templateguessinfo_t & a,
1547
const Contig::templateguessinfo_t & b);
1548
inline bool Assembly__sortTemplateGuessInfo_(const Contig::templateguessinfo_t & a,
1549
const Contig::templateguessinfo_t & b)
1552
if(a.splace_seen==b.splace_seen){
1553
return a.tsize_seen<b.tsize_seen;
1555
return a.splace_seen<b.splace_seen;
1557
return a.rgid<b.rgid;
1560
/*************************************************************************
1562
* Destroys AS_templateguesses by sorting it, therefore cleared at end of func
1565
*************************************************************************/
1569
Contig::templateguessinfo_t tg;
1576
friend std::ostream & operator<<(std::ostream &ostr, const atg_tmp_t & atgt){
1577
ostr << "rgid: " << atgt.tg.rgid.getLibId()
1578
<< "\tc: " << atgt.count
1579
<< "\tsp: " << static_cast<int16>(atgt.tg.splace_seen)
1580
<< "\tm: " << atgt.mean
1581
<< "\td: " << atgt.stdev
1582
<< "\ts: " << atgt.skewness
1583
<< "\t-: " << atgt.deduced_min
1584
<< "\t+: " << atgt.deduced_max;
1499
1590
//#define CEBUG(bla) {cout << bla; cout.flush();}
1503
void Assembly::performHashEditing()
1591
void Assembly::analyseTemplateGuesses()
1505
FUNCSTART("void Assembly::performHashEditing()");
1507
cout << "Hash analysis for editing:";
1509
skim_parameters const & skim_params= AS_miraparams[0].getSkimParams();
1510
assembly_parameters const & as_fixparams= AS_miraparams[0].getAssemblyParams();
1512
uint32 basesperhash=as_fixparams.as_clip_pec_basesperhash;
1513
if(sizeof(uint64) < 8 && basesperhash > 15) basesperhash=15;
1593
// cout << "tgs: " << AS_templateguesses.size() << endl;
1594
if(AS_templateguesses.empty()) return;
1595
sort(AS_templateguesses.begin(),AS_templateguesses.end(),Assembly__sortTemplateGuessInfo_);
1518
s3.setHashFrequencyRatios(skim_params.hs_freqest_minnormal,
1519
skim_params.hs_freqest_maxnormal,
1520
skim_params.hs_freqest_repeat,
1521
skim_params.hs_freqest_heavyrepeat,
1522
skim_params.hs_freqest_crazyrepeat,
1523
skim_params.hs_nastyrepeatratio);
1525
s3.analyseHashes(AS_miraparams[0].getDirectoryParams().dir_tmp,
1600
for(auto & tge : AS_templateguesses){
1602
<< "\trgid: " << tge.rgid.getLibId()
1603
<< "\tspl: " << static_cast<int16>(tge.splace_seen)
1604
<< "\tts: " << tge.tsize_seen
1537
if(as_fixparams.as_dateoutput) dateStamp(cout);
1540
cout << "Looking for proposed edits ... "; cout.flush();
1542
vector<uint8> maxhf;
1543
maxhf.reserve(10000);
1545
uint64 numbaseschanged=0;
1546
uint64 numreadschanged=0;
1548
for(uint32 actid=0; actid<AS_readpool.size(); actid++){
1549
Read & r=AS_readpool.getRead(actid);
1552
&& r.hasBaseHashStats()
1557
maxhf.resize(r.getLenSeq(),0);
1559
bool wasedited=false;
1562
int32 lpos=r.getLeftClipoff();
1563
vector<Read::bposhashstat_t>::const_iterator bhsI=r.getBPosHashStats().begin();
1564
vector<uint8>::iterator mhfI=maxhf.begin();
1568
uint32 counter=basesperhash;
1569
for(; lpos<static_cast<int32>(r.getLenSeq()); lpos++, bhsI++, mhfI++) {
1570
*mhfI=(bhsI->fwd.getFrequency())>1;
1571
if(*mhfI) counter=basesperhash;
1578
lpos=r.getLeftClipoff();
1582
//for(; lpos<static_cast<int32>(r.getLenSeq()); lpos++) {
1583
// cout << (uint16) maxhf[lpos] << ' ';
1586
//lpos=r.getLeftClipoff();
1587
//for(; lpos<static_cast<int32>(r.getLenSeq()); lpos++) {
1588
// cout << r.getBaseInSequence(lpos) << ' ';
1591
//Read::setCoutType(Read::AS_TEXT);
1594
lpos=r.getLeftClipoff();
1595
for(; lpos<static_cast<int32>(r.getLenSeq()); lpos++, mhfI++) {
1600
for(; lpos<static_cast<int32>(r.getLenSeq()); lpos++, mhfI++) {
1607
for(int32 ii=editstart; ii<lpos; ii++) {
1608
//editpositions.push_back(ii);
1609
r.changeBaseInSequence('n',0,ii);
1610
vector<atg_tmp_t> atgpred;
1612
// collapse AS_templateguesses into predictions grouped by rgid and segment_placement
1613
auto tgI=AS_templateguesses.cbegin();
1614
for(; tgI!=AS_templateguesses.cend() && tgI->rgid.isDefaultNonValidReadGroupID(); ++tgI) {}
1615
if(tgI!=AS_templateguesses.cend()){
1616
while(tgI!=AS_templateguesses.cend()){
1618
// search end iterator for that grouping
1619
for(; tgI!=AS_templateguesses.cend() && tgI->rgid==tgS->rgid && tgI->splace_seen == tgS->splace_seen; ++tgI) {};
1621
auto numvals=tgI-tgS;
1623
atgpred.resize(atgpred.size()+1);
1624
atgpred.back().count=numvals;
1625
atgpred.back().tg=*tgS;
1628
for(tgI=tgS; tgI!=tgE; ++tgI) sum+=tgI->tsize_seen;
1629
double mean=static_cast<double>(sum)/static_cast<double>(numvals);
1631
for(tgI=tgS; tgI!=tgE; ++tgI) {
1632
auto m1=static_cast<double>(tgI->tsize_seen - mean);
1633
sp2sum+=m1*m1; // for stdev
1635
double stdev=sqrt(sp2sum/(numvals-1));
1637
// http://www.itl.nist.gov/div898/handbook/eda/section3/eda35b.htm
1638
// http://www.tc3.edu/instruct/sbrown/stat/shape.htm
1640
// skewness >0 left skew
1641
// skewness <0 right skew
1642
// abs skewness >= 1 very skewed
1643
// skewness >= .5 mildly skewed
1644
// skewness >= .25 slightly skewed (own definition as we're working with higher number of measurements
1646
// but for large data sets with tens or hundreds of thousands of measurement, even a few outliers (<1%) on one side
1647
// (like happens in misassembled contigs) can induce a skew >1
1648
// Solution: recalc skew on a subset of values which takes only values at 3*stdev
1649
// also recalc new stdev on this subset
1653
double sxstdev=stdev*3;
1655
for(tgI=tgS; tgI!=tgE; ++tgI) {
1656
auto m1=static_cast<double>(tgI->tsize_seen - mean);
1657
if(abs(m1)<=sxstdev){
1658
sp2sum+=m1*m1; // for stdev
1659
sp3sum+=m1*m1*m1; // for skewness
1663
stdev=sqrt(sp2sum/(numvals2-1));
1665
if(stdev>0) skewness=sp3sum/((numvals2-1)*stdev*stdev*stdev);
1668
double lefttailfactor=2.0d;
1669
double righttailfactor=2.0d;
1670
if(skewness<=-1.0d){
1671
lefttailfactor=2.4d;
1672
righttailfactor=1.6d;
1673
}else if(skewness<=-0.5d){
1674
lefttailfactor=2.2d;
1675
righttailfactor=1.8d;
1676
}else if(skewness<=-0.25d){
1677
lefttailfactor=2.1d;
1678
righttailfactor=1.9d;
1679
}else if(skewness>=1.0d){
1680
lefttailfactor=1.6d;
1681
righttailfactor=2.4d;
1682
}else if(skewness>=0.5d){
1683
lefttailfactor=1.8d;
1684
righttailfactor=2.2d;
1685
}else if(skewness>=0.25d){
1686
lefttailfactor=1.9dd;
1687
righttailfactor=2.1d;
1690
atgpred.back().count=numvals2;
1691
atgpred.back().mean=mean;
1692
atgpred.back().stdev=stdev;
1693
atgpred.back().skewness=skewness;
1695
atgpred.back().deduced_min=(mean-lefttailfactor*stdev > 0) ? (mean-lefttailfactor*stdev) : 0;
1696
atgpred.back().deduced_max=mean+righttailfactor*stdev;
1698
atgpred.back().deduced_min=mean-mean/10;
1699
atgpred.back().deduced_max=mean+mean/10;
1619
if(wasedited) numreadschanged++;
1621
//if(editpositions.size()){
1622
// cout << r.getName() << ": wants to edit " << editpositions.size() << " positions\n";
1627
cout << "changed " << numbaseschanged << " bases to 'n' in " << numreadschanged << " reads.\n";
1705
cout << "ATG PREDICTIONS\n";
1706
cout << std::fixed << std::setprecision(10);
1707
for(auto & ae : atgpred){
1711
// per readgroup, search the prediction with the highest count: that's going to be our final prediction
1712
auto atgpI=atgpred.cbegin();
1713
while(atgpI!=atgpred.cend()){
1715
for(; atgpI!=atgpred.cend() && atgpI->tg.rgid==best.tg.rgid; ++atgpI){
1716
if(atgpI->count >best.count) best=*atgpI;
1718
cout << "Final prediction: " << best << endl;
1719
if(best.tg.rgid.wantSegmentPlacementEstimate()){
1720
best.tg.rgid.setSegmentPlacementCode(best.tg.splace_seen);
1721
best.tg.rgid.setWantSegmentPlacementEstimate(false);
1722
cout << "Set segment placement code.\n";
1723
AS_guessedtemplatevalues=true;
1724
} else if(best.tg.rgid.wantTemplateInfoEstimate()){
1725
best.tg.rgid.setInsizeFrom(best.deduced_min);
1726
best.tg.rgid.setInsizeTo(best.deduced_max);
1727
best.tg.rgid.setWantTemplateSizeEstimate(false);
1728
cout << "Set template size.\n";
1729
AS_guessedtemplatevalues=true;
1733
AS_templateguesses.empty();
1633
1735
//#define CEBUG(bla)
1637
/*************************************************************************
1639
* BaCh 31.12.2012: Errrrm ... what's that function for? Was probably a
1640
* quick hack for something or I planed some "extra" file for very
1641
* special cases. Should probably be removed.
1645
* ugly and slow, but works and is fast enough
1647
*************************************************************************/
1648
//#define CEBUG(bla) {cout << bla; cout.flush(); }
1650
void Assembly::mergeTemplateInfo(const string & tifile, const string & logname, const string & logprefix)
1652
FUNCSTART("void Assembly::mergeTemplateInfo(const string & tifile, const string & logname, const string & logprefix)");
1654
cout << "Merging template info from " << tifile << ":\n";
1656
CEBUG("Building hash table ... "); cout.flush();
1658
typedef boost::unordered_map<std::string, int32> strmap;
1660
strmap::iterator rnI;
1662
for(uint32 i=0; i<AS_readpool.size();i++){
1663
if(!AS_readpool[i].getName().empty()) {
1664
rnmap[AS_readpool[i].getName()]=i;
1667
CEBUG("done." << endl);
1670
if(!logname.empty()){
1671
logfout.open(logname.c_str(), ios::out|ios::app);
1673
MIRANOTIFY(Notify::FATAL, "Could not open log for appending: " << logname << "\nPossible causes: Disk full? Changed permissions? Directory deleted?");
1678
tifin.open(tifile.c_str(), ios::in|ios::ate);
1680
MIRANOTIFY(Notify::FATAL, "File not found: " << tifile);
1682
streampos tifsize=tifin.tellg();
1683
tifin.seekg(0, ios::beg);
1685
ProgressIndicator<streamsize> P (0, tifsize,1000);
1689
while(!tifin.eof()){
1691
if(tifin.eof()) break;
1692
if(P.delaytrigger()) P.progress(tifin.tellg());
1694
//tifin >> sd_score >> sd_readname;
1696
if(tifin.eof()) break;
1738
bool Assembly::warnAtSmileCoverage()
1741
if(AS_assemblyinfo.huntForSmileCoverage(wstr)){
1742
AS_warnings.setWarning("CONCOV_SUSPICIOUS_DISTRIBUTION",0,"Suspicious distribution of contig coverages",wstr);
1744
return !wstr.empty();
1747
bool Assembly::warnAtHighCoverages(uint32 measuredcov)
1750
if(AS_miraparams[0].getNagAndWarnParams().nw_check_coverage!=NWNONE
1751
&& AS_miraparams[0].getPathfinderParams().paf_use_genomic_algorithms
1752
&& measuredcov>AS_miraparams[0].getNagAndWarnParams().nw_check_covvalue){
1753
string wstr("You are running a genome ");
1754
if(AS_miraparams[0].getAssemblyParams().as_assemblyjob_mapping){
1702
bool foundname=false;
1703
rnI=rnmap.find(token);
1704
if(rnI==rnmap.end()) {
1705
CEBUG("Not found: " << token << endl);
1759
wstr+=" assembly and the current best estimation for average coverage is "
1760
+boost::lexical_cast<string>(measuredcov)
1761
+"x (note that this number can be +/- 20% off the real value). This is ";
1762
if(measuredcov>=80) wstr+="a pretty high coverage,";
1763
wstr+="higher than the current warning threshold of "
1764
+boost::lexical_cast<string>(AS_miraparams[0].getNagAndWarnParams().nw_check_covvalue)
1766
"\n\nYou should try to get the average coverage not higher than, say, 60x to 100x for Illumina data or 40x to 60x for 454 and Ion Torrent data. Hybrid assemblies should target a total coverage of 80x to 100x as upper bound. For that, please downsample your input data."
1767
"\n\nThis warning has two major reasons:"
1768
"\n- for MIRA and other overlap based assemblers, the runtime and memory requirements for ultra-high coverage projects grow exponentially, so reducing the data helps you there"
1769
"\n- for all assemblers, the contiguity of an assembly can also suffer if the coverage is too high, i.e. you get more contigs than you would otherwise. Causes for this effect can be non-random sequencing errors or low frequency sub-populations with SNPs which become strong enough to be mistaken for possible repeats.";
1770
if(measuredcov>=150){
1771
if(measuredcov>=300){
1772
wstr+="\nA coverage of >300x ... no really, are you kidding me? *sigh*";
1708
uint32 foundreadid=rnI->second;
1709
if(!AS_readpool[foundreadid].hasValidData()) continue;
1711
Read actread(AS_readpool[foundreadid]);
1712
assembly_parameters const & as_params= AS_miraparams[actread.getSequencingType()].getAssemblyParams();
1774
wstr+="\nWith the coverage you currently have, you *really* should downsample your data. You. Have. Been. Warned!";
1719
if(!logname.empty()){
1723
cout << "\nDone." << endl;
1776
AS_warnings.setWarning("ASCOV_VERY_HIGH",1,"Very high average coverage",wstr);
1729
//#define CEBUG(bla)
1735
1784
/*************************************************************************
1737
* splits a sequence into overlapping subsequences
1741
* saves pre-computed adsfacts file into log directory for later
1743
* number of generated adsfacts is put in AS_numADSFacts_fromshreds
1746
* This saves enormous amount of time, but is not the "real" thing:
1747
* matches between shreds that are non-overlapping from the start on are
1786
* mean function, with side-effect and definetely to be used only where
1787
* it is currently called from
1789
* Used to guess which inserts in a contigs are wrong after a first
1790
* mapping round and makes sure they get removed
1793
* bb cagtcatga***ctgcatgca
1794
* r1 cag*catga***ctgcatgca
1795
* r2 cag*catga***ctgcatgca
1796
* r3 cag*catga***ctgcatgca
1798
* rX cag*catgaTTTctgcatgca
1800
* with rX being some weird read (maybe low frequency variant, or sequencing
1803
* Normally, the two stage mapping would calculate an intermediate new backbone
1806
* bbi cag*catga***ctgcatgca
1808
* but that is obviously not really the best guess and currently leads to
1809
* misalignments with reads ending in this area. E.g.:
1811
* bbi cag*catga***ctgcatgca
1816
* bbi cag*catga***ctgcatgca
1819
* This function will fake the contig CON_counts structure to enable
1820
* Contig::deleteStarOnlyColumns() to remove the probably wrong inserts, so
1821
* that the new intermediate contig looks like this:
1823
* bbi cag*catgactgcatgca
1826
* - CON_counts of contig is probably not reflecting truth anymore
1827
* - reads of the contig also get edited, but this routine
1828
* is currently used just before all the reads are discarded anyway.
1750
1830
*************************************************************************/
1753
void Assembly::shredReadsIntoReadPool(ReadPool & sourcepool, uint32 shredlen, uint32 shredoffsetinc, uint8 shredreadtype, const string & shredstrain)
1832
void Assembly::priv_removePotentiallyWrongBaseInserts(Contig & con)
1755
FUNCSTART("void Assembly::shredReadsIntoReadPool(ReadPool & sourcepool, uint32 shredlen, uint32 shredoffsetinc, uint8 shredreadtype, const string & shredstrain)");
1757
AS_numADSFacts_fromshreds=0;
1758
string adsfshredsfilename=AS_miraparams[0].getDirectoryParams().dir_tmp+"/shred.adsfacts";
1760
adsfout.open((adsfshredsfilename+".adsfacts").c_str(), ios::out|ios::trunc);
1762
deque<uint32> overlapfifo;
1765
shredseq.reserve(shredlen);
1766
vector<base_quality_t> shredqual;
1767
shredqual.reserve(shredlen+10);
1770
for(uint32 actsourceid=0; actsourceid < sourcepool.size(); actsourceid++){
1771
Read & sourceread = sourcepool.getRead(actsourceid);
1772
if(!sourceread.hasValidData()) continue;
1773
if(sourceread.getLenSeq() < shredlen) continue;
1776
uint32 shredcounter=0;
1777
for(bool doloop=true; doloop; actoffset+=shredoffsetinc){
1778
uint32 fromi=actoffset;
1779
uint32 toi=actoffset+shredlen;
1780
if(toi>=sourceread.getLenSeq()) {
1781
toi=sourceread.getLenSeq();
1786
for(; fromi<toi; fromi++){
1787
shredseq+=sourceread.getBaseInSequence(fromi);
1788
shredqual.push_back(sourceread.getQualityInSequence(fromi));
1791
// if wished: lower quals to max as_cap454consensusqual
1792
if(AS_miraparams[0].getAssemblyParams().as_cap454consensusqual>0){
1793
vector<base_quality_t>::iterator qI=shredqual.begin();
1794
base_quality_t maxqual=AS_miraparams[0].getAssemblyParams().as_cap454consensusqual;
1795
for(;qI != shredqual.end(); qI++){
1796
if(*qI>maxqual) *qI=maxqual;
1801
ostr << "shred_" << shredcounter << "_" << sourceread.getName();
1802
shredname=ostr.str();
1804
AS_readpool.addNewEmptyRead();
1805
uint32 newreadid=AS_readpool.size()-1;
1806
Read & newread=AS_readpool.getRead(newreadid);
1807
newread.setName(shredname);
1808
newread.setSequenceFromString(shredseq);
1809
newread.setQualities(shredqual);
1810
newread.setStrain(shredstrain.c_str());
1811
newread.setSequencingType(shredreadtype);
1813
//cout << "\n----------------------------------------\nAdded " << shredname << '\n';
1814
// now insert the weights
1816
overlapfifo.push_front(newreadid);
1817
deque<uint32>::iterator OFI=overlapfifo.begin();
1819
int32 overlaplen=shredlen-shredoffsetinc;
1820
int32 totalshredoffset=shredoffsetinc;
1821
uint32 numelements=1;
1822
while(OFI != overlapfifo.end()) {
1823
if(overlaplen<=0) break;
1825
AlignedDualSeqFacts tmpadsf;
1829
static_cast<uint16>(totalshredoffset),
1830
static_cast<uint16>(totalshredoffset
1831
-(AS_readpool.getRead(*OFI).getLenSeq()
1832
-AS_readpool.getRead(newreadid).getLenSeq())),
1834
static_cast<uint16>((AS_readpool.getRead(*OFI).getLenSeq()+
1835
AS_readpool.getRead(newreadid).getLenSeq()-overlaplen)),
1840
// output of the ADSfacts to file
1842
// first weight and direction
1843
// TODO: reduce weight to favorise real reads in assembly???
1844
adsfout << overlaplen*10000 << "\t1\t";
1845
tmpadsf.serialiseOut(adsfout);
1848
AS_numADSFacts_fromshreds++;
1851
overlaplen-=shredoffsetinc;
1852
totalshredoffset+=shredoffsetinc;
1855
if(overlapfifo.size()>numelements) overlapfifo.resize(numelements);
1859
cout << "Shredded " << sourceread.getName() << " into " << shredcounter << " pieces.\n";
1834
FUNCSTART("void Assembly::priv_removePotentiallyWrongBaseInserts(Contig & con)");
1836
// try to look for strains which are only not in backbone
1837
vector<string> straincons(ReadGroupLib::getNumOfStrains());
1838
vector<base_quality_t> dummyqual;
1840
for(uint32 sid=0; sid<ReadGroupLib::getNumOfStrains(); ++sid){
1842
for(uint32 rgid=1; rgid<ReadGroupLib::getNumReadGroups();++rgid){
1843
auto rg=ReadGroupLib::getReadGroupID(rgid);
1844
if(rg.getStrainID()==sid
1852
con.newConsensusGet(straincons[sid],dummyqual,sid);
1856
// look if we have *any* consensus made (meaning we'd have same strain backbone AND mapped reads).
1857
// If not, recalc consensi for all strains
1859
bool needrecalc=true;
1860
for(auto & s : straincons){
1867
for(uint32 sid=0; sid<ReadGroupLib::getNumOfStrains(); ++sid){
1868
con.newConsensusGet(straincons[sid],dummyqual,sid);
1874
bool stillnocons=true;
1875
for(auto & s : straincons){
1881
BUGIFTHROW(stillnocons,"Ummmm ... no cons built???");
1884
// fill up empty consensi
1885
for(auto & s : straincons){
1886
if(s.empty()) s.resize(con.getContigLength(),'*');
1889
Contig::cccontainer_t & cc = const_cast<Contig::cccontainer_t &>(con.getConsensusCounts());
1891
auto ccI=cc.begin();
1892
for(uint32 actcontigpos=0; actcontigpos<straincons[0].size(); ++actcontigpos, ++ccI){
1893
bool maydelete=true;
1894
for(auto & s : straincons){
1895
if(s[actcontigpos]!='*'){
1901
ccI->total_cov=65535;
1905
con.deleteStarOnlyColumns(0,con.getContigLength(),false,65535);