713
708
CON_stats.max_coverage=max(CON_stats.max_coverage,
714
709
static_cast<uint32>(ccI->total_cov));
715
for(uint32 i=0; i<ReadGroupLib::SEQTYPE_END; i++){
710
for(uint32 i=0; i<ReadGroupLib::SEQTYPE_END; ++i){
711
CON_stats.totalbasesperst[i]+=static_cast<uint64>(ccI->seqtype_cov[i]);
716
712
CON_stats.max_covperst[i]=max(CON_stats.max_covperst[i],
717
713
static_cast<uint32>(ccI->seqtype_cov[i]));
720
// 100% mapped Solexa/SOLiD reads must be accounted for a bit differently
722
actcovsxa=static_cast<uint64>(ccI->seqtype_cov[ReadGroupLib::SEQTYPE_SOLEXA]);
723
sumcovsxa+=actcovsxa;
725
CON_stats.max_covperst[ReadGroupLib::SEQTYPE_SOLEXA]=
726
max(CON_stats.max_covperst[ReadGroupLib::SEQTYPE_SOLEXA],static_cast<uint32>(actcovsxa));
728
actcovsid=static_cast<uint64>(ccI->seqtype_cov[ReadGroupLib::SEQTYPE_ABISOLID]);
729
sumcovsid+=actcovsid;
731
CON_stats.max_covperst[ReadGroupLib::SEQTYPE_ABISOLID]=
732
max(CON_stats.max_covperst[ReadGroupLib::SEQTYPE_ABISOLID],static_cast<uint32>(actcovsid));
734
716
CON_stats.starInR+=ccI->star;
735
717
CON_stats.NinR+=ccI->N;
738
720
// ok, in the section above, rail reads would have been also counted in
739
721
// (in .coverage)
740
// compute reads numbers
722
// compute reads statistics and remove rail bases from base statistics
743
725
auto cI=CON_reads.begin();
744
726
for(;cI!=CON_reads.end();++cI){
728
CON_stats.totalbasesperst[cI->getSequencingType()]-=cI->getLenClippedSeq();
747
731
// Nono, backbones must be counted in!
748
732
// Scenario: assembly at 6x used as backbone, assembled further 1x
752
736
// && !cI->read.isBackbone()) {
754
CON_stats.total_reads++;
738
++CON_stats.total_reads;
755
739
CON_stats.readsperst[cI->getSequencingType()]++;
756
if(cI.getORPID()>=0){
757
CON_stats.readlenperst[cI->getSequencingType()]+=CON_readpool->getRead(cI.getORPID()).getLenClippedSeq();
758
if(cI->hasQuality()){
759
CON_stats.numreads_withqual++;
761
CON_stats.numreads_noqual++;
740
if(cI->hasQuality()){
741
CON_stats.numreads_withqual++;
764
CON_stats.readlenperst[cI->getSequencingType()]+=cI->getLenClippedSeq();
743
CON_stats.numreads_noqual++;
770
CON_stats.readlenperst[ReadGroupLib::SEQTYPE_SOLEXA]=static_cast<uint32>(sumcovsxa);
771
CON_stats.readlenperst[ReadGroupLib::SEQTYPE_ABISOLID]=static_cast<uint32>(sumcovsid);
749
CON_stats.avg_coverage=0.0;
773
750
if(!CON_counts.empty()){
774
CON_stats.avg_coverage=0.0;
775
for(uint32 i=0; i<ReadGroupLib::SEQTYPE_END; i++){
776
if(CON_stats.readlenperst[i]>0.0){
777
// subtracting the number of uncovered bases in the following firmula is a bad, bad idea and
778
// gives totally wrong numbers.
779
//CON_stats.avg_covperst[i]=static_cast<double>(CON_stats.readlenperst[i])/(CON_counts.size()- CON_stats.numnocoverage);
780
CON_stats.avg_covperst[i]=static_cast<double>(CON_stats.readlenperst[i])/(CON_counts.size());
781
CON_stats.avg_coverage+=CON_stats.avg_covperst[i];
752
for(uint32 sti=0; sti<ReadGroupLib::SEQTYPE_END; ++sti){
753
CON_stats.avg_covperst[sti]=static_cast<double>(CON_stats.totalbasesperst[sti])/CON_counts.size();
754
totalbases+=CON_stats.totalbasesperst[sti];
756
CON_stats.avg_coverage=static_cast<double>(totalbases)/CON_counts.size();
787
vector<base_quality_t>::const_iterator I=consqual.begin();
789
while(I!=consqual.end()){
790
qual+=static_cast<double>(*I);
761
for(auto & cqe : consqual) qual+=static_cast<double>(cqe);
793
762
CON_stats.avg_conqual=static_cast<uint16>((qual/consseq.size())+.5);
1141
1118
//#define BUGHUNT
1143
void Contig::addRead(vector<Align> & aligncache, const AlignedDualSeqFacts * initialadsf, int32 refid, int32 newid, int32 direction_frnid, bool newid_ismulticopy, int32 forcegrow, errorstatus_t & errstat)
1120
void Contig::addRead(vector<Align> & aligncache, const AlignedDualSeqFacts * initialadsf, int32 refid, int32 newid, int32 direction_frnid, bool newid_ismulticopy, int32 forcegrow, templateguessinfo_t & templateguess, errorstatus_t & errstat)
1145
FUNCSTART("void Contig::addRead(vector<Align> & aligncache, const AlignedDualSeqFacts * initialadsf, int32 refid, int32 newid, int32 direction_frnid, errorstatus_t & errstat)");
1122
FUNCSTART("void Contig::addRead(vector<Align> & aligncache, const AlignedDualSeqFacts * initialadsf, int32 refid, int32 newid, int32 direction_frnid, bool newid_ismulticopy, int32 forcegrow, templateguess_t & templateguess, errorstatus_t & errstat)");
1147
1124
//setCEBUGFlag(newid,refid);
1149
1126
paranoiaBUGIF(aligncache.size()!=ReadGroupLib::SEQTYPE_END,
1150
1127
MIRANOTIFY(Notify::INTERNAL, "aligncache is not size of available sequencing types???"));
1129
templateguess.rgid.resetLibId();
1152
1131
// this is ugly: save the align params because
1153
1132
// addRead_wrapped() might change some *sigh*
1154
1133
align_parameters oldalignparams=
1245
1230
#define CEBUG(bla)
1247
void Contig::addRead_wrapped(vector<Align> & aligncache, const AlignedDualSeqFacts * initialadsf, int32 refid, int32 newid, int32 direction_frnid, bool newid_ismulticopy, int32 forcegrow, errorstatus_t & errstat)
1232
void Contig::addRead_wrapped(vector<Align> & aligncache, const AlignedDualSeqFacts * initialadsf, int32 refid, int32 newid, int32 direction_frnid, bool newid_ismulticopy, int32 forcegrow, templateguessinfo_t & templateguess, errorstatus_t & errstat)
1249
1234
FUNCSTART("void Contig::addRead_wrapped(vector<Align> & aligncache, const AlignedDualSeqFacts * initialadsf, int32 refid, int32 newid, int32 direction_frnid, bool newid_ismulticopy, int32 forcegrow, errorstatus_t & errstat)");
1840
1825
bool havematchingtemplatepartner=false;
1842
if(CON_readpool->getRead(newid).hasTemplateInfo() > 0){
1827
if(CON_readpool->getRead(newid).getTemplatePartnerID() != -1){
1843
1828
// 1st: direction
1844
1829
#ifndef PUBLICQUIET
1845
1830
cout << " tmplhand1 ";
1846
1831
cout << CON_readpool->getRead(newid).getTemplatePartnerID() << " ";
1849
auto pcrI=CON_reads.getIteratorOfReadpoolID(CON_readpool->getRead(newid).getTemplatePartnerID());
1834
auto tppcrI=CON_reads.getIteratorOfReadpoolID(CON_readpool->getRead(newid).getTemplatePartnerID());
1850
1835
// is partner already in contig?
1851
if(pcrI==CON_reads.end()){
1836
if(tppcrI==CON_reads.end()){
1852
1837
#ifndef PUBLICQUIET
1839
<< CON_readpool->getRead(newid).getTemplatePartnerID();
1840
if(CON_readpool->getRead(newid).getTemplatePartnerID()>=0){
1841
cout << " - " << CON_readpool->getRead(CON_readpool->getRead(newid).getTemplatePartnerID()).getName();
1892
// for segment placement and template size check, we need the positions in the contig
1894
bool newreadisleft=true;
1896
// Note: play it safe while adding right extend ... take only
1900
if(direction_newid_incontig>0) {
1901
leftry+=CON_readpool->getRead(newid).getRightExtend()*2/3;
1903
leftrx-=CON_readpool->getRead(newid).getRightExtend()*2/3;
1906
int32 rightrx=tppcrI.getReadStartOffset();
1907
int32 rightry=tppcrI.getReadStartOffset()+tppcrI->getLenClippedSeq();
1908
if(direction_refid>0) {
1909
rightry+=tppcrI->getRightExtend()*2/3;
1911
rightrx-=tppcrI->getRightExtend()*2/3;
1914
if(rightrx<leftrx) {
1915
swap(rightrx,leftrx);
1916
swap(rightry,leftry);
1917
newreadisleft=false; // well, yeah, it is on the right
1920
int32 actinsertsize=rightry - leftrx;
1921
if(rightry<leftry) actinsertsize=leftry - leftrx;
1923
// we now have all info needed for storing measured template info
1924
bool guesstemplatesegplace=true;
1925
templateguess.tsize_seen=0; // just as default
1926
templateguess.splace_seen=ReadGroupLib::SPLACE_UNKNOWN; // just as default
1928
// Due to clipping at ends of reads,
1936
// one should only guess on disjunct (non-overlapping) reads
1937
// or if overlapping, the right read must start >= 10 bp
1938
// later than the left read
1941
if(leftrx+10>=rightrx){
1942
// and start of reads <= 10bp
1943
guesstemplatesegplace=false;
1947
if(guesstemplatesegplace){
1948
templateguess.rgid=CON_readpool->getRead(newid).getReadGroupID(); // this validates the entry (else its rgid: 0)
1949
templateguess.tsize_seen=actinsertsize;
1950
// placement code is hardest
1951
if(direction_newid_incontig*tppcrI.getReadDirection() < 0){
1954
if(direction_newid_incontig>0){
1955
templateguess.splace_seen=ReadGroupLib::SPLACE_FR;
1957
templateguess.splace_seen=ReadGroupLib::SPLACE_RF;
1960
if(direction_newid_incontig>0){
1961
templateguess.splace_seen=ReadGroupLib::SPLACE_RF;
1963
templateguess.splace_seen=ReadGroupLib::SPLACE_FR;
1967
// same direction ...
1969
// I'm sure that some boolean logic could get through this, but I'm not inclined to think about it now
1970
templateguess.splace_seen=ReadGroupLib::SPLACE_SU; // just as default
1971
if((direction_newid_incontig>0 && newreadisleft==true)
1972
|| (direction_newid_incontig<0 && newreadisleft==false)){
1973
if(CON_readpool->getRead(newid).getTemplateSegment()==1){
1974
templateguess.splace_seen=ReadGroupLib::SPLACE_SF;
1976
templateguess.splace_seen=ReadGroupLib::SPLACE_SB;
1979
if(CON_readpool->getRead(newid).getTemplateSegment()==1){
1980
templateguess.splace_seen=ReadGroupLib::SPLACE_SB;
1982
templateguess.splace_seen=ReadGroupLib::SPLACE_SF;
1988
cout << " nril " << newreadisleft
1989
<< " " << leftrx << "-" << leftry
1990
<< " " << rightrx << "-" << rightry
1991
<< " " << templateguess;
1902
1996
// ** Check for segment placement **
1903
if(CON_readpool->getRead(newid).getReadGroupID().getSegmentPlacementCode() == ReadGroupLib::SPLACE_SF
1904
|| CON_readpool->getRead(newid).getReadGroupID().getSegmentPlacementCode() == ReadGroupLib::SPLACE_SB){
1997
// unknown does obviously need no check, "samedir unknown" already checked implicitly by direction check above
1999
auto spc=CON_readpool->getRead(newid).getReadGroupID().getSegmentPlacementCode();
2000
if(spc != ReadGroupLib::SPLACE_UNKNOWN
2001
&& spc != ReadGroupLib::SPLACE_SU
2002
&& templateguess.splace_seen != ReadGroupLib::SPLACE_UNKNOWN
2003
&& templateguess.splace_seen != ReadGroupLib::SPLACE_SU){
2008
// if reads overlap, stay cool and don't check as it could be that sequencing errors and clipping
2009
// muddy the picture
2010
// That is: the +10 bp criterion of above is not taken into account here, only the non-overlapping part
2012
bool hasplacementerror=false;
2018
if(spc!=templateguess.splace_seen) hasplacementerror=true;
2021
if(hasplacementerror){
2022
errstat.code=ESEGMENTPLACEMENT;
2025
if(static_cast<uint32>(ycut)>CON_counts.size()){
2026
ycut=CON_counts.size();
2028
gettimeofday(&us_start,nullptr);
2029
if(CON_readpool->getRead(refid).isRail()) {
2030
getRailsAsReadsAffected(refid, errstat.reads_affected, xcut, ycut);
2032
getReadORPIDsAtContigPosition(errstat.reads_affected, xcut, ycut);
2034
CON_us_steps[USCLO_GRACP]+=diffsuseconds(us_start);
1911
2041
// ** Check for template size **
1919
// Note: play it safe while adding right extend ... take only
1923
if(direction_newid_incontig>0) {
1924
leftry+=CON_readpool->getRead(newid).getRightExtend()*3/2;
1926
leftrx-=CON_readpool->getRead(newid).getRightExtend()*3/2;
1929
int32 rightrx=pcrI.getReadStartOffset();
1930
int32 rightry=pcrI.getReadStartOffset()+pcrI->getLenClippedSeq();
1931
if(direction_refid>0) {
1932
rightry+=pcrI->getRightExtend()*3/2;
1934
rightrx-=pcrI->getRightExtend()*3/2;
1937
if(rightrx<leftrx) {
1938
swap(rightrx,leftrx);
1939
swap(rightry,leftry);
1942
int32 actinsertsize=rightry - leftrx;
1943
if(rightry<leftry) actinsertsize=leftry - leftrx;
1945
2049
// allow 15% error in calculated insert size
1946
2050
int32 aisp10=actinsertsize+actinsertsize*15/100;
1947
2051
int32 aism10=actinsertsize-actinsertsize*15/100;
1949
int32 tif=pcrI->getInsizeFrom();
1950
int32 tit=pcrI->getInsizeTo();
2053
int32 tif=tppcrI->getInsizeFrom();
2054
int32 tit=tppcrI->getInsizeTo();
1952
if(pcrI->getInsizeFrom() >=0 && aisp10 < tif){
2056
if(tppcrI->getInsizeFrom() >=0 && aisp10 < tif){
1953
2057
// distance too small
1954
2058
#ifndef PUBLICQUIET
1955
cout << "templ too small: " << tif << " min allowed, got " << aisp10;
2059
cout << "templ too small: " << tif << " min allowed, got " << aism10 << "-" << aisp10;
1957
2061
havematchingtemplatepartner=false;
1958
2062
if(!CON_readpool->getRead(newid).getReadGroupID().getTSInfoOnly()){
1959
errstat.code=ETEMPLATESIZE;
2063
errstat.code=ETEMPLATESIZELT;
1962
if(errstat.code != ETEMPLATESIZE && pcrI->getInsizeTo() >=0 && aism10 > tit){
2066
if(errstat.code == ENOERROR && tppcrI->getInsizeTo() >=0 && aism10 > tit){
1963
2067
// distance too big
1964
2068
#ifndef PUBLICQUIET
1965
cout << "templ too big: " << tit << " max allowed, got " << aism10;
2069
cout << "templ too big: " << tit << " max allowed, got " << aism10 << "-" << aisp10;
1967
2071
havematchingtemplatepartner=false;
1968
2072
if(!CON_readpool->getRead(newid).getReadGroupID().getTSInfoOnly()){
1969
errstat.code=ETEMPLATESIZE;
2073
errstat.code=ETEMPLATESIZEGT;
1972
if(errstat.code == ETEMPLATESIZE){
2076
if(errstat.code != ENOERROR){
1973
2077
if(xcut<0) xcut=0;
1974
2078
if(static_cast<uint32>(ycut)>CON_counts.size()){
1975
2079
ycut=CON_counts.size();
2788
2894
dist=CON_readpool->getRead(newid).getInsizeTo();
2791
// cout << "mpl: " << CON_markerpositions[CON_mpindex_msrkceu_left]
2792
// << "\tmpr: " << CON_markerpositions[CON_mpindex_msrkceu_right]
2793
// << "\nxcut: " << xcut
2794
// << "\tycut: " << ycut
2795
// << "\tdist: " << dist
2796
// << "\nl: " << dist+CON_markerpositions[CON_mpindex_msrkceu_left]
2797
// << "\tl: " << CON_markerpositions[CON_mpindex_msrkceu_right]-dist
2897
//cout << "mpl: " << CON_markerpositions[CON_mpindex_msrkceu_left]
2898
// << "\tmpr: " << CON_markerpositions[CON_mpindex_msrkceu_right]
2899
// << "\nxcut: " << xcut
2900
// << "\tycut: " << ycut
2901
// << "\tdist: " << dist
2902
// << "\nl: " << dist+CON_markerpositions[CON_mpindex_msrkceu_left]
2903
// << "\tl: " << CON_markerpositions[CON_mpindex_msrkceu_right]-dist
2800
2906
// if no distance or when read has no paired-end partner, we could map
2801
2907
// anyway and would not need to check further
2803
2909
// check whether we are in boundaries, if yes, do not map
2804
2910
if(xcut <= dist+CON_markerpositions[CON_mpindex_msrkceu_left]
2805
|| ycut >= CON_markerpositions[CON_mpindex_msrkceu_left]-dist){
2911
|| ycut >= CON_markerpositions[CON_mpindex_msrkceu_right]-dist){
2810
//cout << "\ncanmap: " << canmap << endl;
2916
//cout << "\ncanmap2: " << canmap << endl;
2813
2919
// map if possible, else the read will be normally added
4570
4676
if(CON_us_steps_drfc.size()) CON_us_steps_drfc[USCLODRFC_SDT]+=diffsuseconds(us_tv);
4571
4677
// push down the offsets of the other reads
4572
gettimeofday(&us_tv,nullptr);
4573
CON_reads.shiftReads(1,-frontdeletions);
4574
if(CON_us_steps_drfc.size()) CON_us_steps_drfc[USCLODRFC_SR]+=diffsuseconds(us_tv);
4679
gettimeofday(&us_tv,nullptr);
4680
CON_reads.shiftReads(1,-frontdeletions);
4681
if(CON_us_steps_drfc.size()) CON_us_steps_drfc[USCLODRFC_SR]+=diffsuseconds(us_tv);
4575
4683
// same thing for contig marker positions
4576
4684
gettimeofday(&us_tv,nullptr);
4577
4685
shiftMarkerPositions(1,-frontdeletions);
4653
4761
#define CEBUG(bla)
4764
/*************************************************************************
4769
*************************************************************************/
4770
//#define CEBUG(bla) {cout << bla; cout.flush();}
4771
void Contig::trimMapOverhang()
4773
FUNCSTART("void Contig::trimMapOverhang()");
4775
//cout << "MMMMMMMMMM\n";
4776
//CON_reads.debugDump(false);
4778
if(getNumBackbones()==0) return;
4780
int32 firstknownbbpos=-1;
4781
int32 lastknownbbpos=-1;
4782
for(auto crI= CON_reads.begin(); crI!=CON_reads.end(); ++crI){
4783
if(crI->isBackbone()){
4784
if(static_cast<int32>(crI.getReadStartOffset()+crI->getLenClippedSeq()) > lastknownbbpos){
4785
lastknownbbpos=crI.getReadStartOffset()+crI->getLenClippedSeq();
4787
if(firstknownbbpos<0){
4788
firstknownbbpos=crI.getReadStartOffset();
4793
CEBUG("fkbb: " << firstknownbbpos << endl);
4794
CEBUG("lkbb: " << lastknownbbpos << endl);
4795
CEBUG("gcl: " << getContigLength() << endl);
4797
BUGIFTHROW(firstknownbbpos<0,"firstknownbbpos " << firstknownbbpos << " <0 ???");
4798
BUGIFTHROW(lastknownbbpos<0,"lastknownbbpos " << lastknownbbpos << " <0 ???");
4800
// check: are there reads completely outside the backbone? if yes: no-can-do
4801
for(auto crI= CON_reads.begin(); crI!=CON_reads.end(); ++crI){
4802
if(!crI->isBackbone()){
4803
if(crI.getReadStartOffset() < firstknownbbpos
4804
&& crI.getReadStartOffset()+crI->getLenClippedSeq() <= firstknownbbpos){
4805
// oooops, impossible at front
4808
if(crI.getReadStartOffset() >= lastknownbbpos){
4809
// oooops, impossible at end
4815
bool docorrect=false;
4816
// we're fine, now get things rolling
4818
// first, the back of the contig if needed
4819
if(lastknownbbpos < getContigLength()){
4821
// zero out CON_counts
4822
auto ccI=CON_counts.end(); // cannot use rbegin, not implemented in HDeque
4823
CEBUG("numdelsteps: " << getContigLength()-lastknownbbpos << endl);
4824
for(auto numdelsteps=getContigLength()-lastknownbbpos; numdelsteps!=0; --numdelsteps){
4826
*ccI=CON_concounts_zero;
4829
// shorten overhanging reads
4830
for(auto crI= CON_reads.begin(); crI!=CON_reads.end(); ++crI){
4831
int32 shorten=static_cast<int32>(crI.getReadStartOffset()+crI->getLenClippedSeq())-lastknownbbpos;
4832
CEBUG(crI->getName() << " shortenb " << shorten << endl);
4834
// very hacky, but we're forcing our hand on the reads of the PCR container here
4835
Read & ncr=const_cast<Read &>(*crI);
4836
if(crI.getReadDirection()>0){
4837
ncr.setRQClipoff(crI->getRightClipoff()-shorten);
4839
ncr.setLQClipoff(crI->getLeftClipoff()+shorten);
4845
// now the front of the contig if needed
4846
if(firstknownbbpos>0){
4848
// zero out CON_counts
4849
auto ccI=CON_counts.begin();
4850
for(auto numdelsteps=firstknownbbpos; numdelsteps!=0; --numdelsteps, ++ccI){
4851
*ccI=CON_concounts_zero;
4854
// shorten overhanging reads
4855
for(auto crI= CON_reads.begin(); crI!=CON_reads.end(); ++crI){
4856
int32 shorten=firstknownbbpos-static_cast<int32>(crI.getReadStartOffset());
4857
CEBUG(crI->getName() << "\tfkbbp: " << firstknownbbpos << "\trso: " << crI.getReadStartOffset() <<"\tshortenf " << shorten << endl);
4859
// very hacky, but we're forcing our hand on the reads of the PCR container here
4860
Read & ncr=const_cast<Read &>(*crI);
4861
if(crI.getReadDirection()>0){
4862
ncr.setLQClipoff(crI->getLeftClipoff()+shorten);
4864
ncr.setRQClipoff(crI->getRightClipoff()-shorten);
4873
if(firstknownbbpos>0){
4875
chompFront(-1,false);
4877
CON_reads.shiftReadsBounceZero(0,-firstknownbbpos);
4881
//#define CEBUG(bla)
4657
4884
/*************************************************************************