~ubuntu-branches/ubuntu/trusty/mira/trusty-proposed

« back to all changes in this revision

Viewing changes to src/mira/hashstats.C

  • Committer: Package Import Robot
  • Author(s): Thorsten Alteholz, Thorsten Alteholz, Andreas Tille
  • Date: 2014-02-02 22:51:35 UTC
  • mfrom: (7.1.1 sid)
  • Revision ID: package-import@ubuntu.com-20140202225135-nesemzj59jjgogh0
Tags: 4.0-1
[ Thorsten Alteholz ]
* New upstream version 
* debian/rules: add boost dir in auto_configure (Closes: #735798)

[ Andreas Tille ]
* cme fix dpkg-control
* debian/patches/{make.patch,spelling.patch}: applied upstream (thus removed)

Show diffs side-by-side

added added

removed removed

Lines of Context:
24
24
#include <boost/thread/thread.hpp>
25
25
#include <boost/bind.hpp>
26
26
 
27
 
#include <iostream>
28
 
#include <math.h>
29
 
 
30
27
#include <boost/filesystem.hpp>
31
28
 
32
29
#include "errorhandling/errorhandling.H"
116
113
 *************************************************************************/
117
114
 
118
115
//#define CEBUG(bla)   {cout << bla; cout.flush();}
119
 
void HashStatistics::prepareHashStatistics(const string & directory, ReadPool & rp, bool checkusedinassembly, bool onlyagainstrails, bool alsosavesinglehashes, bool fwdandrev, uint32 fwdrevmin, uint8  basesperhash, string & hashstatfilename)
 
116
void HashStatistics::prepareHashStatistics(const string & directory, ReadPool & rp, bool checkusedinassembly, bool onlyagainstrails, bool alsosavesinglehashes, bool fwdandrev, uint32 fwdrevmin, uint8  basesperhash, uint32 millionhashesperbuffer, bool rarekmerearlykill, string & hashstatfilename)
120
117
{
121
118
  HS_readpoolptr=&rp;
122
119
  HS_hs_basesperhash=basesperhash;
129
126
  cout << "Writing temporary hstat files:\n";
130
127
  hashes2disk(hashfilenames,elementsperfile,
131
128
              rp,checkusedinassembly,fwdandrev,fwdrevmin,
132
 
              basesperhash,directory);
 
129
              basesperhash,
 
130
              millionhashesperbuffer,
 
131
              rarekmerearlykill,
 
132
              directory);
133
133
 
134
134
  dateStamp(cout);
135
135
 
191
191
 
192
192
//#define CEBUG(bla)   {cout << bla; cout.flush();}
193
193
 
194
 
void HashStatistics::hashes2disk(vector<string> & hashfilenames, vector<size_t> & elementsperfile, ReadPool & rp, bool checkusedinassembly, bool fwdandrev, uint32 fwdrevmin, uint8 basesperhash, const string & directory)
 
194
void HashStatistics::hashes2disk(vector<string> & hashfilenames, vector<size_t> & elementsperfile, ReadPool & rp, bool checkusedinassembly, bool fwdandrev, uint32 fwdrevmin, uint8 basesperhash, uint32 millionhashesperbuffer, bool rarekmerearlykill, const string & directory)
195
195
{
196
196
  FUNCSTART("void HashStatistics::hashes2disk(uint32 basesperhash)");
197
197
 
213
213
  // define how many elements to reserve
214
214
  if(HS_numelementsperbuffer==0){
215
215
    // default fallback values
216
 
    HS_numelementsperbuffer=1048576*16;      // 16m elements, 16b each, 16 buffers == 4 GiB
 
216
    HS_numelementsperbuffer=millionhashesperbuffer*1048576;    // 16m elements, 16b each, 16 buffers == 4 GiB
217
217
    if(sizeof(void *)==4){
218
218
      // on 32 bit systems, be careful with memory
219
219
      HS_numelementsperbuffer=1048576/2;
331
331
        elementsperfile[hashfilesindex]+=writeCompressedHFB(hashfilebuffer[hashfilesindex],
332
332
                                                            fwdrevmin,
333
333
                                                            hashfiles[hashfilesindex],
334
 
                                                            false);
 
334
                                                            false,
 
335
                                                            rarekmerearlykill);
335
336
#ifdef CLOCKSTEPS
336
337
        timeval after,diff;
337
338
        gettimeofday(&after,nullptr);
372
373
          elementsperfile[hashfilesindex]+=writeCompressedHFB(hashfilebuffer[hashfilesindex],
373
374
                                                              fwdrevmin,
374
375
                                                              hashfiles[hashfilesindex],
375
 
                                                              false);
 
376
                                                              false,
 
377
                                                              rarekmerearlykill);
376
378
#ifdef CLOCKSTEPS
377
379
          timeval after,diff;
378
380
          gettimeofday(&after,nullptr);
405
407
    elementsperfile[i]+=writeCompressedHFB(hashfilebuffer[i],
406
408
                                           fwdrevmin,
407
409
                                           hashfiles[i],
408
 
                                           true);
 
410
                                           true,
 
411
                                           rarekmerearlykill);
409
412
    fclose(hashfiles[i]);
410
413
  }
411
414
  P.finishAtOnce();
425
428
 *
426
429
 *************************************************************************/
427
430
 
428
 
size_t HashStatistics::writeCompressedHFB(vector<hashstat_t> & hfb, uint32 fwdrevmin, FILE * fileptr, bool force)
 
431
size_t HashStatistics::writeCompressedHFB(vector<hashstat_t> & hfb, uint32 fwdrevmin, FILE * fileptr, bool force, bool rarekmerearlykill)
429
432
{
430
433
  FUNCSTART("size_t HashStatistics::writeCompressedHFB(vector<hashstat_t> & hfb, FILE * fileptr)");
431
434
  size_t retvalue=0;
432
435
  if(hfb.size()){
433
 
    compressHashStatBufferInPlace(hfb, fwdrevmin, true);
 
436
    compressHashStatBufferInPlace(hfb, fwdrevmin, !rarekmerearlykill);
434
437
    if(force || hfb.size()>=hfb.capacity()*2/3){
435
438
      CEBUG("Write buffer " << &hfb << " " << 100*hfb.size()/hfb.capacity() << endl);
436
439
      if(myFWrite(&(hfb[0]),sizeof(hashstat_t),hfb.size(),fileptr) != hfb.size()){
468
471
 *
469
472
 *************************************************************************/
470
473
 
471
 
inline bool Skim__sortDiskNewHashComparator_(const hashstat_t & a,
 
474
inline bool HashStat__sortDiskNewHashComparator_(const hashstat_t & a,
472
475
                                             const hashstat_t & b);
473
 
inline bool Skim__sortDiskNewHashComparator_(const hashstat_t & a,
 
476
inline bool HashStat__sortDiskNewHashComparator_(const hashstat_t & a,
474
477
                                          const hashstat_t & b)
475
478
{
476
479
  if(a.vhash==b.vhash){
485
488
 *
486
489
 *************************************************************************/
487
490
 
488
 
inline bool Skim__sortHashStatComparatorByCount_(const hashstat_t & a,
 
491
inline bool HashStat__sortHashStatComparatorByCount_(const hashstat_t & a,
489
492
                                                 const hashstat_t & b);
490
 
inline bool Skim__sortHashStatComparatorByCount_(const hashstat_t & a,
 
493
inline bool HashStat__sortHashStatComparatorByCount_(const hashstat_t & a,
491
494
                                                 const hashstat_t & b)
492
495
{
493
496
  return a.count < b.count;
514
517
  tvtotal=tv;
515
518
#endif
516
519
 
517
 
  sort(hsb.begin(), hsb.end(), Skim__sortDiskNewHashComparator_);
 
520
  sort(hsb.begin(), hsb.end(), HashStat__sortDiskNewHashComparator_);
518
521
  CEBUG("done.\n");
519
522
  TEBUG("\nTiming sort HFB: " << diffsuseconds(tv) << endl);
520
523
 
774
777
    }
775
778
  }
776
779
 
777
 
  sort(HS_hs_hashstats.begin(),HS_hs_hashstats.end(),Skim__sortHashStatComparatorByCount_);
 
780
  sort(HS_hs_hashstats.begin(),HS_hs_hashstats.end(),HashStat__sortHashStatComparatorByCount_);
778
781
 
779
782
  if(HS_logflag_hashcount){
780
783
    string logfile=hashstatfilename+".shouldneverbeseen.hashcount.sort";
791
794
  calcAvgHashFreq();
792
795
 
793
796
  makeHashStatArrayShortcuts();
794
 
 
795
797
}
796
798
 
797
799
 
993
995
 *
994
996
 *************************************************************************/
995
997
 
996
 
inline bool Skim__sortHashStatComparatorByLow24bit_(const hashstat_t & a,
997
 
                                                    const hashstat_t & b);
998
 
inline bool Skim__sortHashStatComparatorByLow24bit_(const hashstat_t & a,
 
998
inline bool HashStat__sortHashStatComparatorByLow24bit_(const hashstat_t & a,
999
999
                                                    const hashstat_t & b)
1000
1000
{
1001
1001
  if((a.vhash & SKIM3_MAXVHASHMASK) != (b.vhash & SKIM3_MAXVHASHMASK)) {
1004
1004
  return a.vhash < b.vhash;
1005
1005
}
1006
1006
 
 
1007
/*************************************************************************
 
1008
 *
 
1009
 * sorter to sort from low to high, using a definable mask
 
1010
 *
 
1011
 *
 
1012
 *************************************************************************/
 
1013
 
 
1014
vhash_t HashStatistics__vhashmask;
 
1015
 
 
1016
inline bool HashStatistics__sortHashStatComparatorByMask_(const hashstat_t & a,
 
1017
                                                          const hashstat_t & b)
 
1018
{
 
1019
  if((a.vhash & HashStatistics__vhashmask) != (b.vhash & HashStatistics__vhashmask)) {
 
1020
    return (a.vhash & HashStatistics__vhashmask) < (b.vhash & HashStatistics__vhashmask);
 
1021
  }
 
1022
  return a.vhash < b.vhash;
 
1023
}
 
1024
 
 
1025
/*************************************************************************
 
1026
 *
 
1027
 * ideally, instead of using mincount, it should be mincount per
 
1028
 *  direction. NHashStatistics will do that ...
 
1029
 *
 
1030
 *************************************************************************/
 
1031
 
 
1032
string laberbla;
 
1033
 
 
1034
void HashStatistics::calcKMerForks(uint32 mincount)
 
1035
{
 
1036
  if(HS_hs_hashstats.empty()) return;
 
1037
 
 
1038
  for(auto & hse : HS_hs_hashstats) hse.iskmerfork=false;
 
1039
 
 
1040
  if(HS_hs_basesperhash<17) return;
 
1041
 
 
1042
  auto rollbases=HS_hs_basesperhash-1;
 
1043
  HashStatistics__vhashmask=1;
 
1044
  /* *grml* undefined behaviour of left shift for 64 shifts in a 64 bit type makes this cludge necessary */
 
1045
  /* the same for 32 shift in 32 bit types etc.pp */
 
1046
  if(HS_hs_basesperhash>=sizeof(vhash_t)*4){
 
1047
    HashStatistics__vhashmask=0;
 
1048
  }else{
 
1049
    HashStatistics__vhashmask<<=(rollbases*2);
 
1050
  }
 
1051
 
 
1052
  // vhashmask is now, e.g. for bph=31, 00010000000....
 
1053
  --HashStatistics__vhashmask;
 
1054
  // vhashmask is now, e.g. for bph=31, 000011111....
 
1055
 
 
1056
  // calc the status on ?..............
 
1057
  sort(HS_hs_hashstats.begin(), HS_hs_hashstats.end(), HashStatistics__sortHashStatComparatorByMask_);
 
1058
  laberbla="fwd ";
 
1059
  ckmf_helper(HashStatistics__vhashmask,mincount);
 
1060
 
 
1061
  // calc the status on ..............?
 
1062
  HashStatistics__vhashmask<<=2;
 
1063
  // vhashmask is now, e.g. for bph=31, 0011111....00
 
1064
  sort(HS_hs_hashstats.begin(), HS_hs_hashstats.end(),  HashStatistics__sortHashStatComparatorByMask_);
 
1065
 
 
1066
  laberbla="rev ";
 
1067
  ckmf_helper(HashStatistics__vhashmask,mincount);
 
1068
 
 
1069
  sort(HS_hs_hashstats.begin(), HS_hs_hashstats.end(), HashStat__sortHashStatComparatorByLow24bit_);
 
1070
}
 
1071
 
 
1072
//#define CEBUG(bla)   {cout << bla; cout.flush();}
 
1073
void HashStatistics::ckmf_helper(vhash_t hashmask, uint32 mincount)
 
1074
{
 
1075
  auto hsI=HS_hs_hashstats.begin();
 
1076
  auto hsJ=hsI+1;
 
1077
 
 
1078
  CEBUG("KMER DUMP:\n");
 
1079
 
 
1080
  for(; hsJ!=HS_hs_hashstats.end(); ++hsI,++hsJ){
 
1081
    if(hsI->hasfwdrevthresholdok
 
1082
       && hsJ->hasfwdrevthresholdok
 
1083
       && (hsI->vhash&hashmask) == (hsJ->vhash&hashmask)
 
1084
       && hsI->vhash != hsJ->vhash
 
1085
       && hsI->count >= mincount
 
1086
       && hsJ->count >= mincount){
 
1087
      hsI->iskmerfork=true;
 
1088
      hsJ->iskmerfork=true;
 
1089
    }
 
1090
    CEBUG(laberbla << NHashStatistics::hash2string(hsI->vhash,HS_hs_basesperhash) << ' ' << *hsI << '\n');
 
1091
    CEBUG(laberbla << NHashStatistics::hash2string(hsJ->vhash,HS_hs_basesperhash) << ' ' << *hsJ << '\n');
 
1092
  }
 
1093
}
 
1094
//#define CEBUG(bla)
 
1095
 
1007
1096
 
1008
1097
/*************************************************************************
1009
1098
 *
1032
1121
    CEBUG(HS_hs_hashstats[i] << '\n');
1033
1122
  }
1034
1123
 
1035
 
  sort(HS_hs_hashstats.begin(), HS_hs_hashstats.end(), Skim__sortHashStatComparatorByLow24bit_);
 
1124
  sort(HS_hs_hashstats.begin(), HS_hs_hashstats.end(), HashStat__sortHashStatComparatorByLow24bit_);
1036
1125
 
1037
1126
  HS_hs_hsshortcuts.clear();
1038
1127
  {
1075
1164
 *
1076
1165
 *************************************************************************/
1077
1166
 
1078
 
inline bool Skim__compareHashStatHashElem_(const hashstat_t & a,
1079
 
                                           const hashstat_t & b);
1080
 
inline bool Skim__compareHashStatHashElem_(const hashstat_t & a,
1081
 
                                           const hashstat_t & b)
 
1167
inline bool HashStat__compareHashStatHashElem_(const hashstat_t & a,
 
1168
                                               const hashstat_t & b);
 
1169
inline bool HashStat__compareHashStatHashElem_(const hashstat_t & a,
 
1170
                                               const hashstat_t & b)
1082
1171
{
1083
1172
  return a.vhash < b.vhash;
1084
1173
}
1085
1174
 
1086
1175
//#define CEBUG(bla)   {cout << bla; cout.flush();}
1087
1176
 
1088
 
/*************************************************************************
1089
 
 *
1090
 
 *
1091
 
 *
1092
 
 *
1093
 
 *
1094
 
 *************************************************************************/
1095
 
 
1096
 
/*
1097
 
void HashStatistics::assignReadBaseStatistics(ReadPool & rp, size_t avghashcov, vector<hashstat_t> & hashstats, const uint8 basesperhash, vector<vector<hashstat_t>::const_iterator > & hsshortcuts_begin, vector<vector<hashstat_t>::const_iterator > & hsshortcuts_end, bool masknastyrepeats, vector<uint32> & rarekmermasking)
1098
 
{
1099
 
  FUNCSTART("void HashStatistics::assignReadBaseStatistics(ReadPool & rp, size_t avghashcov, vector<hashstat_t> & hashstats, const uint8 basesperhash, vector<vector<hashstat_t>::const_iterator > & hsshortcuts_begin, vector<vector<hashstat_t>::const_iterator > & hsshortcuts_end)");
1100
 
 
1101
 
  if(rarekmermasking.size() != ReadGroupLib::getNumSequencingTypes()){
1102
 
    rarekmermasking.clear();
1103
 
    rarekmermasking.resize(ReadGroupLib::getNumSequencingTypes(),0);
1104
 
  }
1105
 
 
1106
 
  uint32 minnormalhashcov=static_cast<uint32>(static_cast<double>(avghashcov)*HS_freqest_minnormal);
1107
 
  uint32 maxnormalhashcov=static_cast<uint32>(static_cast<double>(avghashcov)*HS_freqest_maxnormal);
1108
 
  uint32 repeathashcov=static_cast<uint32>(static_cast<double>(avghashcov)*HS_freqest_repeat);
1109
 
  uint32 heavyrepthashcov=static_cast<uint32>(static_cast<double>(avghashcov)*HS_freqest_heavyrepeat);
1110
 
  uint32 crazyrepthashcov=static_cast<uint32>(static_cast<double>(avghashcov)*HS_freqest_crazyrepeat);
1111
 
  uint32 maskhashcov=static_cast<uint32>(static_cast<double>(avghashcov)*HS_nastyrepeatratio);
1112
 
 
1113
 
  CEBUG("minnormalhashcov: " << minnormalhashcov << endl);
1114
 
  CEBUG("maxnormalhashcov: " << maxnormalhashcov << endl);
1115
 
  CEBUG("repeathashcov: " << repeathashcov << endl);
1116
 
 
1117
 
  vector<vhrap_t> singlereadvhraparray;
1118
 
  singlereadvhraparray.reserve(10000);
1119
 
 
1120
 
  // we will not use a mask, but
1121
 
  //  we need to supply an empty one anyway
1122
 
  vector<uint8> tagmaskvector;
1123
 
 
1124
 
  // stores in each read whether the given hash frequency was seen
1125
 
  vector<uint8> hasfrequency(8);
1126
 
 
1127
 
  vector<uint8> mcmask;
1128
 
  mcmask.reserve(10000);
1129
 
 
1130
 
  ProgressIndicator<int32> P(0, rp.size());
1131
 
 
1132
 
  multitag_t tmpmt(Read::REA_defaulttag_MNRr);
1133
 
 
1134
 
  for(uint32 actreadid=0; actreadid<rp.size(); actreadid++){
1135
 
    P.progress(actreadid);
1136
 
 
1137
 
    //if(actreadid>100) return;
1138
 
 
1139
 
    Read & actread= rp.getRead(actreadid);
1140
 
    if(!actread.hasValidData()
1141
 
      || !actread.isUsedInAssembly()) continue;
1142
 
 
1143
 
    // get rid of old values
1144
 
    actread.clearAllBPosHashStats();
1145
 
 
1146
 
//#define CEBUG(bla)   {if(cebugok) cout << bla; cout.flush();}
1147
 
//    bool cebugok=false;
1148
 
//    if(actread.getName()=="E0K6C4E01CTNQI") cebugok=true;
1149
 
 
1150
 
    uint32 slen=actread.getLenClippedSeq();
1151
 
 
1152
 
    if(slen<basesperhash) continue;
1153
 
 
1154
 
    mcmask.clear();
1155
 
    mcmask.resize(actread.getLenSeq(),0);
1156
 
 
1157
 
    hasfrequency.clear();
1158
 
    hasfrequency.resize(8,0);
1159
 
 
1160
 
    CEBUG("name: " << actread.getName() << '\n');
1161
 
 
1162
 
    //cout << "Before ...\n";
1163
 
    //Read::setCoutType(Read::AS_TEXT);
1164
 
    //cout << actread;
1165
 
 
1166
 
    singlereadvhraparray.resize(slen);
1167
 
    tagmaskvector.resize(slen,0);
1168
 
 
1169
 
    vector<vhrap_t>::iterator srvaI=singlereadvhraparray.begin();
1170
 
 
1171
 
    vector<Read::bposhashstat_t> & bposhashstats=const_cast<vector<Read::bposhashstat_t> &>(actread.getBPosHashStats());
1172
 
    uint32 hashesmade;
1173
 
 
1174
 
    {
1175
 
      int32 bfpos=actread.calcClippedPos2RawPos(0);
1176
 
      int32 bfposinc=1;
1177
 
 
1178
 
      hashesmade=Skim::transformSeqToVariableHash(
1179
 
        actreadid,
1180
 
        actread,
1181
 
        actread.getClippedSeqAsChar(),
1182
 
        slen,
1183
 
        basesperhash,
1184
 
        srvaI,
1185
 
        false,
1186
 
        1,
1187
 
        tagmaskvector,
1188
 
        bposhashstats,
1189
 
        bfpos,
1190
 
        bfposinc
1191
 
        );
1192
 
    }
1193
 
    singlereadvhraparray.resize(hashesmade);
1194
 
 
1195
 
    CEBUG("hashesmade: " << hashesmade << endl);
1196
 
 
1197
 
    vector<hashstat_t>::const_iterator lowerbound;
1198
 
    vector<hashstat_t>::const_iterator upperbound;
1199
 
 
1200
 
    vector<hashstat_t>::const_iterator hssearchI;
1201
 
    srvaI=singlereadvhraparray.begin();
1202
 
 
1203
 
    uint32 rarekmercount=rarekmermasking[actread.getSequencingType()];
1204
 
    //uint32 rarekmercount=30;
1205
 
 
1206
 
    int32 bfpos1,bfpos2;
1207
 
    hashstat_t hstmp;
1208
 
    bool foundit;
1209
 
    for(; srvaI != singlereadvhraparray.end(); srvaI++){
1210
 
      CEBUG(*srvaI << '\n');
1211
 
 
1212
 
      foundit=false;
1213
 
      lowerbound=hsshortcuts_begin[srvaI->vhash & SKIM3_MAXVHASHMASK];
1214
 
      upperbound=hsshortcuts_end[srvaI->vhash & SKIM3_MAXVHASHMASK];
1215
 
 
1216
 
      // "HS_empty_vector_hashstat_t.end()" is the "nullptr" replacement
1217
 
      if(hashstats.end() != lowerbound){
1218
 
        if(basesperhash>12){
1219
 
          // with more than 12 bases in a hash, the array is subdivided
1220
 
          hstmp.vhash=srvaI->vhash;
1221
 
          hssearchI=lower_bound(lowerbound,
1222
 
                                upperbound,
1223
 
                                hstmp,
1224
 
                                Skim__compareHashStatHashElem_);
1225
 
          if(hssearchI != hashstats.end()
1226
 
             && hssearchI->vhash == srvaI->vhash) foundit=true;
1227
 
        }else{
1228
 
          hssearchI=lowerbound;
1229
 
          foundit=true;
1230
 
        }
1231
 
      }else{
1232
 
        CEBUG("---------- NO LB HIT??? -------\n");
1233
 
      }
1234
 
 
1235
 
      if(foundit) {
1236
 
        CEBUG("VHRAP: " << *srvaI << '\n');
1237
 
        CEBUG("HashStat: " << *hssearchI << '\n');
1238
 
        CEBUG("srvaI->hashpos: " << srvaI->hashpos << '\n');
1239
 
 
1240
 
        bfpos1=actread.calcClippedPos2RawPos(srvaI->hashpos-(basesperhash-1));
1241
 
        bfpos2=bfpos1+basesperhash-1;
1242
 
 
1243
 
        CEBUG("b bfpos1: " << bfpos1 << '\t' << bposhashstats[bfpos1] << endl);
1244
 
        CEBUG("b bfpos2: " << bfpos2 << '\t' << bposhashstats[bfpos2] << endl);
1245
 
 
1246
 
        bposhashstats[bfpos1].fwd.setValid();
1247
 
        bposhashstats[bfpos2].rev.setValid();
1248
 
 
1249
 
        if(hssearchI->hasfwdrevthresholdok) {
1250
 
          //bhs|=Read::BFLAGS_CONFIRMED_FWDREV;
1251
 
          CEBUG("Set ConfFWDREV\n");
1252
 
          bposhashstats[bfpos1].fwd.setConfirmedFwdRev();
1253
 
          bposhashstats[bfpos2].rev.setConfirmedFwdRev();
1254
 
        }
1255
 
        if(hssearchI->lowpos<=4){
1256
 
          //bhs|=Read::BFLAGS_SEENATLOWPOS;
1257
 
          CEBUG("Set SeenAtLowPos\n");
1258
 
          bposhashstats[bfpos1].fwd.setSeenAtLowPos();
1259
 
          bposhashstats[bfpos2].rev.setSeenAtLowPos();
1260
 
        }
1261
 
        if(hssearchI->hasmultipleseqtype){
1262
 
          //bhs|=Read::BFLAGS_CONFIRMED_MULTIPLESEQTYPE;
1263
 
          CEBUG("Set ConfMultSeqType\n");
1264
 
          bposhashstats[bfpos1].fwd.setConfirmedMultipleSeqType();
1265
 
          bposhashstats[bfpos2].rev.setConfirmedMultipleSeqType();
1266
 
        }
1267
 
        uint8 frequency=2;
1268
 
        if(hssearchI->count == 1){
1269
 
          frequency=1;
1270
 
        }else if(hssearchI->count <= rarekmercount ){
1271
 
          // maybe additional checks ... ?
1272
 
          frequency=1;
1273
 
        }else if(hssearchI->count<minnormalhashcov) {
1274
 
          frequency=2;
1275
 
        }else if(hssearchI->count>=minnormalhashcov
1276
 
           && hssearchI->count<=maxnormalhashcov) {
1277
 
          frequency=3;
1278
 
          //}else if(hssearchI->count > minnormalhashcov*20){
1279
 
        }else if(hssearchI->count > crazyrepthashcov){
1280
 
          frequency=7;
1281
 
        }else if(hssearchI->count > heavyrepthashcov){
1282
 
          frequency=6;
1283
 
        }else if(hssearchI->count>=repeathashcov){
1284
 
          frequency=5;
1285
 
        }else{
1286
 
          frequency=4;
1287
 
        }
1288
 
        CEBUG("Set frequency: " << static_cast<uint16>(frequency) << endl);
1289
 
 
1290
 
        if(maskhashcov>0 && hssearchI->count>=maskhashcov){
1291
 
          for(uint32 j=0; j<basesperhash; j++){
1292
 
            mcmask[bfpos1+j]=1;
1293
 
          }
1294
 
        }
1295
 
 
1296
 
        CEBUG("a1 bfpos1: " << bfpos1 << '\t' << bposhashstats[bfpos1] << endl);
1297
 
        CEBUG("a1 bfpos2: " << bfpos2 << '\t' << bposhashstats[bfpos2] << endl);
1298
 
 
1299
 
        bposhashstats[bfpos1].fwd.setFrequency(frequency);
1300
 
        bposhashstats[bfpos2].rev.setFrequency(frequency);
1301
 
 
1302
 
        CEBUG("a2 bfpos1: " << bfpos1 << '\t' << bposhashstats[bfpos1] << endl);
1303
 
        CEBUG("a2 bfpos2: " << bfpos2 << '\t' << bposhashstats[bfpos2] << endl);
1304
 
 
1305
 
        hasfrequency[frequency]=1;
1306
 
 
1307
 
        //cout.flush();
1308
 
        //actread.setBaseFlagsInClippedSequence(bhs,
1309
 
        //                                    srvaI->hashpos-(basesperhash-1),
1310
 
        //                                    basesperhash);
1311
 
        //actread.setHasBaseFlags(true);
1312
 
      }
1313
 
    }
1314
 
 
1315
 
    actread.setHasFreqAvg(false);
1316
 
    actread.setHasFreqRept(false);
1317
 
 
1318
 
    if(hasfrequency[3]){
1319
 
      actread.setHasFreqAvg(true);
1320
 
    }
1321
 
    if(hasfrequency[5] || hasfrequency[6] || hasfrequency[7]){
1322
 
      actread.setHasFreqRept(true);
1323
 
    }
1324
 
 
1325
 
    actread.setHasBaseHashStats(true);
1326
 
 
1327
 
    //cout << "After ...\n";
1328
 
    //Read::setCoutType(Read::AS_TEXT);
1329
 
    //cout << actread;
1330
 
 
1331
 
 
1332
 
    // BaCh 07.04.2009 Bad Idea!!!
1333
 
    // BaCh 12.07.2009 Why? Forgot ... :-(
1334
 
    //// the fwd/rev of a read now looks like this (e.g.)
1335
 
    //// (for better viewing dot == 0)
1336
 
    ////
1337
 
    //// f   ..........2222222233333....355555....................
1338
 
    //// r   ................2222222....33333355555...............
1339
 
    ////
1340
 
    //// in dubio pro reo and to allow for potential matches,
1341
 
    //// do this:
1342
 
    ////
1343
 
    //// f   ..........2222222233333....355555->..................
1344
 
    //// r   ..............<-2222222....33333355555...............
1345
 
    ////
1346
 
    //// so that this
1347
 
    ////
1348
 
    //// f   ..........2222222233333....35555555555...............
1349
 
    //// r   ..........2222222222222....33333355555...............
1350
 
    ////
1351
 
    //// is generated
1352
 
    ////
1353
 
    ////
1354
 
    //
1355
 
    //{
1356
 
    //  uint32 bfposi=0;
1357
 
    //  for(; bfposi<bposhashstats.size() && bposhashstats[bfposi].fwd.getFrequency()==0; bfposi++) {};
1358
 
    //  uint32 bfpose=bfposi;
1359
 
    //  for(; bfpose<bposhashstats.size() && bposhashstats[bfpose].rev.getFrequency()==0; bfpose++) {};
1360
 
    //  if(bfposi<bposhashstats.size() && bfpose<bposhashstats.size()){
1361
 
    //  for(uint32 i=bfposi; i<bfpose; i++){
1362
 
    //    bposhashstats[i].fwd=bposhashstats[bfpose].rev;
1363
 
    //  }
1364
 
    //  }
1365
 
    //
1366
 
    //  bfposi=bposhashstats.size()-1;
1367
 
    //  for(; bfposi>0 && bposhashstats[bfposi].rev.getFrequency()==0; bfposi--) {};
1368
 
    //  bfpose=bfposi;
1369
 
    //  for(; bfpose>0 && bposhashstats[bfpose].fwd.getFrequency()==0; bfpose--) {};
1370
 
    //  if(bfposi>0){
1371
 
    //  for(uint32 i=bfposi; i>bfpose; i--){
1372
 
    //    bposhashstats[i].fwd=bposhashstats[bfpose].rev;
1373
 
    //  }
1374
 
    //  }
1375
 
    //}
1376
 
 
1377
 
 
1378
 
    // go through multicopy array and set MNRr tags for
1379
 
    //  consecutive positions in read tagged as multicopy
1380
 
    if(masknastyrepeats){
1381
 
      bool inrun=false;
1382
 
      uint32 runstart=0;
1383
 
      uint32 pos=0;
1384
 
      for(; pos<mcmask.size(); pos++){
1385
 
        CEBUG("pos: " << pos << '\t' << static_cast<uint16>(mcmask[pos]) << '\t' << inrun << '\n');
1386
 
        if(mcmask[pos]){
1387
 
          if(!inrun){
1388
 
            runstart=pos;
1389
 
            inrun=true;
1390
 
          }
1391
 
        }else{
1392
 
          if(inrun){
1393
 
            CEBUG("reprun " << actread.getName() << '\t' << runstart << '\t' << pos-1 << endl);
1394
 
            tmpmt.from=runstart;
1395
 
            tmpmt.to=pos-1;
1396
 
            actread.addTagO(tmpmt);
1397
 
            inrun=false;
1398
 
          }
1399
 
        }
1400
 
      }
1401
 
      if(inrun){
1402
 
        CEBUG("reprun " << actread.getName() << '\t' << runstart << '\t' << pos-1 << endl);
1403
 
        tmpmt.from=runstart;
1404
 
        tmpmt.to=pos-1;
1405
 
        actread.addTagO(tmpmt);
1406
 
      }
1407
 
    }
1408
 
 
1409
 
 
1410
 
 
1411
 
  }
1412
 
 
1413
 
  P.finishAtOnce();
1414
 
  cout << '\n';
1415
 
 
1416
 
  //cout << "\nskim Needs redo!\n";
1417
 
  //exit(0);
1418
 
 
1419
 
  FUNCEND();
1420
 
}
1421
 
*/
1422
 
//#define CEBUG(bla)
1423
 
 
1424
1177
 
1425
1178
 
1426
1179
/*************************************************************************
1433
1186
 
1434
1187
 //#define CEBUG(bla)   {cout << bla; cout.flush();}
1435
1188
 //void HashStatistics::assignReadBaseStatistics_MultiThread(uint32 numthreads, ReadPool & rp, size_t avghashcov, vector<hashstat_t> & hashstats, const uint8 basesperhash, vector<vector<hashstat_t>::const_iterator > & hsshortcuts_begin, vector<vector<hashstat_t>::const_iterator > & hsshortcuts_end, bool masknastyrepeats, vector<uint32> & rarekmermasking)
1436
 
void HashStatistics::assignReadBaseStatistics_MultiThread(uint32 numthreads, bool masknastyrepeats, vector<uint32> & rarekmermasking)
 
1189
void HashStatistics::assignReadBaseStatistics_MultiThread(uint32 numthreads, bool masknastyrepeats, vector<uint32> & rarekmermasking, uint32 mincountkmerforks)
1437
1190
{
1438
1191
  FUNCSTART("void HashStatistics::assignReadBaseStatistics_MultiThread(uint32 numthreads, bool masknastyrepeats, vector<uint32> & rarekmermasking)");
1439
1192
 
 
1193
  if(mincountkmerforks>0) calcKMerForks(mincountkmerforks);
 
1194
 
1440
1195
  if(rarekmermasking.size() != ReadGroupLib::getNumSequencingTypes()){
1441
1196
    rarekmermasking.clear();
1442
1197
    rarekmermasking.resize(ReadGroupLib::getNumSequencingTypes(),0);
1571
1326
    actread.clearAllBPosHashStats();
1572
1327
    actread.setHasFreqAvg(false);
1573
1328
    actread.setHasFreqRept(false);
 
1329
    actread.setHasKMerFork(false);
1574
1330
    actread.deleteTag(tmpmt.identifier);
1575
1331
 
1576
1332
    // whatever happens: this read was looked upon by this routine, so technically we "have" base hashstats
1639
1395
 
1640
1396
    int32 bfpos1,bfpos2;
1641
1397
    hashstat_t hstmp;
1642
 
    bool foundit;
 
1398
    bool foundit=false;
 
1399
    bool haskmerfork=false;
1643
1400
    for(; srvaI != singlereadvhraparray.end(); srvaI++){
1644
1401
      CEBUG(*srvaI << '\n');
1645
1402
 
1654
1411
          hssearchI=lower_bound(lowerbound,
1655
1412
                                hsshortcuts[srvaI->vhash & SKIM3_MAXVHASHMASK].e,
1656
1413
                                hstmp,
1657
 
                                Skim__compareHashStatHashElem_);
 
1414
                                HashStat__compareHashStatHashElem_);
1658
1415
          if(hssearchI != hashstats.end()
1659
1416
             && hssearchI->vhash == srvaI->vhash) foundit=true;
1660
1417
        }else{
1685
1442
          bposhashstats[bfpos1].fwd.setConfirmedFwdRev();
1686
1443
          bposhashstats[bfpos2].rev.setConfirmedFwdRev();
1687
1444
        }
 
1445
        if(hssearchI->iskmerfork) {
 
1446
          haskmerfork=true;
 
1447
          bposhashstats[bfpos1].fwd.setKMerFork();
 
1448
          bposhashstats[bfpos2].rev.setKMerFork();
 
1449
        }
1688
1450
        if(hssearchI->lowpos<=4){
1689
1451
          //bhs|=Read::BFLAGS_SEENATLOWPOS;
1690
1452
          CEBUG("Set SeenAtLowPos\n");
1752
1514
    if(hasfrequency[5] || hasfrequency[6] || hasfrequency[7]){
1753
1515
      actread.setHasFreqRept(true);
1754
1516
    }
 
1517
    actread.setHasKMerFork(haskmerfork);
1755
1518
 
1756
1519
    //Read::setCoutType(Read::AS_TEXT);
1757
1520
    //CEBUG("### After ...\n" << actread << endl);
3102
2865
 *
3103
2866
 *************************************************************************/
3104
2867
 
3105
 
void NHashStatistics::hash2string(vhash_t hash, std::string & str)
 
2868
void NHashStatistics::hash2string(vhash_t hash, uint8 basesperhash, std::string & str)
3106
2869
{
3107
2870
  static char acgtc[4]={'A','C','G','T'};
3108
2871
 
3109
2872
  str.clear();
3110
 
  str.resize(HSN_basesperhash,' ');
 
2873
  str.resize(basesperhash,' ');
3111
2874
  auto srI=str.rbegin();
3112
 
  for(auto ci=0; ci<HSN_basesperhash; ++ci, ++srI){
 
2875
  for(auto ci=0; ci<basesperhash; ++ci, ++srI){
3113
2876
    *srI=acgtc[hash&3];
3114
2877
    hash>>=2;
3115
2878
  }
3127
2890
 
3128
2891
  string tmpstr;
3129
2892
  for(auto & hse : HSN_hsv_hashstats){
3130
 
    hash2string(hse.vhash,tmpstr);
 
2893
    hash2string(hse.vhash,HSN_basesperhash,tmpstr);
3131
2894
    cout << tmpstr
3132
2895
         << '\t' << hse.hsc.fcount
3133
2896
         << '\t' << hse.hsc.rcount