1004
1004
return a.vhash < b.vhash;
1007
/*************************************************************************
1009
* sorter to sort from low to high, using a definable mask
1012
*************************************************************************/
1014
vhash_t HashStatistics__vhashmask;
1016
inline bool HashStatistics__sortHashStatComparatorByMask_(const hashstat_t & a,
1017
const hashstat_t & b)
1019
if((a.vhash & HashStatistics__vhashmask) != (b.vhash & HashStatistics__vhashmask)) {
1020
return (a.vhash & HashStatistics__vhashmask) < (b.vhash & HashStatistics__vhashmask);
1022
return a.vhash < b.vhash;
1025
/*************************************************************************
1027
* ideally, instead of using mincount, it should be mincount per
1028
* direction. NHashStatistics will do that ...
1030
*************************************************************************/
1034
void HashStatistics::calcKMerForks(uint32 mincount)
1036
if(HS_hs_hashstats.empty()) return;
1038
for(auto & hse : HS_hs_hashstats) hse.iskmerfork=false;
1040
if(HS_hs_basesperhash<17) return;
1042
auto rollbases=HS_hs_basesperhash-1;
1043
HashStatistics__vhashmask=1;
1044
/* *grml* undefined behaviour of left shift for 64 shifts in a 64 bit type makes this cludge necessary */
1045
/* the same for 32 shift in 32 bit types etc.pp */
1046
if(HS_hs_basesperhash>=sizeof(vhash_t)*4){
1047
HashStatistics__vhashmask=0;
1049
HashStatistics__vhashmask<<=(rollbases*2);
1052
// vhashmask is now, e.g. for bph=31, 00010000000....
1053
--HashStatistics__vhashmask;
1054
// vhashmask is now, e.g. for bph=31, 000011111....
1056
// calc the status on ?..............
1057
sort(HS_hs_hashstats.begin(), HS_hs_hashstats.end(), HashStatistics__sortHashStatComparatorByMask_);
1059
ckmf_helper(HashStatistics__vhashmask,mincount);
1061
// calc the status on ..............?
1062
HashStatistics__vhashmask<<=2;
1063
// vhashmask is now, e.g. for bph=31, 0011111....00
1064
sort(HS_hs_hashstats.begin(), HS_hs_hashstats.end(), HashStatistics__sortHashStatComparatorByMask_);
1067
ckmf_helper(HashStatistics__vhashmask,mincount);
1069
sort(HS_hs_hashstats.begin(), HS_hs_hashstats.end(), HashStat__sortHashStatComparatorByLow24bit_);
1072
//#define CEBUG(bla) {cout << bla; cout.flush();}
1073
void HashStatistics::ckmf_helper(vhash_t hashmask, uint32 mincount)
1075
auto hsI=HS_hs_hashstats.begin();
1078
CEBUG("KMER DUMP:\n");
1080
for(; hsJ!=HS_hs_hashstats.end(); ++hsI,++hsJ){
1081
if(hsI->hasfwdrevthresholdok
1082
&& hsJ->hasfwdrevthresholdok
1083
&& (hsI->vhash&hashmask) == (hsJ->vhash&hashmask)
1084
&& hsI->vhash != hsJ->vhash
1085
&& hsI->count >= mincount
1086
&& hsJ->count >= mincount){
1087
hsI->iskmerfork=true;
1088
hsJ->iskmerfork=true;
1090
CEBUG(laberbla << NHashStatistics::hash2string(hsI->vhash,HS_hs_basesperhash) << ' ' << *hsI << '\n');
1091
CEBUG(laberbla << NHashStatistics::hash2string(hsJ->vhash,HS_hs_basesperhash) << ' ' << *hsJ << '\n');
1094
//#define CEBUG(bla)
1008
1097
/*************************************************************************
1076
1165
*************************************************************************/
1078
inline bool Skim__compareHashStatHashElem_(const hashstat_t & a,
1079
const hashstat_t & b);
1080
inline bool Skim__compareHashStatHashElem_(const hashstat_t & a,
1081
const hashstat_t & b)
1167
inline bool HashStat__compareHashStatHashElem_(const hashstat_t & a,
1168
const hashstat_t & b);
1169
inline bool HashStat__compareHashStatHashElem_(const hashstat_t & a,
1170
const hashstat_t & b)
1083
1172
return a.vhash < b.vhash;
1086
1175
//#define CEBUG(bla) {cout << bla; cout.flush();}
1088
/*************************************************************************
1094
*************************************************************************/
1097
void HashStatistics::assignReadBaseStatistics(ReadPool & rp, size_t avghashcov, vector<hashstat_t> & hashstats, const uint8 basesperhash, vector<vector<hashstat_t>::const_iterator > & hsshortcuts_begin, vector<vector<hashstat_t>::const_iterator > & hsshortcuts_end, bool masknastyrepeats, vector<uint32> & rarekmermasking)
1099
FUNCSTART("void HashStatistics::assignReadBaseStatistics(ReadPool & rp, size_t avghashcov, vector<hashstat_t> & hashstats, const uint8 basesperhash, vector<vector<hashstat_t>::const_iterator > & hsshortcuts_begin, vector<vector<hashstat_t>::const_iterator > & hsshortcuts_end)");
1101
if(rarekmermasking.size() != ReadGroupLib::getNumSequencingTypes()){
1102
rarekmermasking.clear();
1103
rarekmermasking.resize(ReadGroupLib::getNumSequencingTypes(),0);
1106
uint32 minnormalhashcov=static_cast<uint32>(static_cast<double>(avghashcov)*HS_freqest_minnormal);
1107
uint32 maxnormalhashcov=static_cast<uint32>(static_cast<double>(avghashcov)*HS_freqest_maxnormal);
1108
uint32 repeathashcov=static_cast<uint32>(static_cast<double>(avghashcov)*HS_freqest_repeat);
1109
uint32 heavyrepthashcov=static_cast<uint32>(static_cast<double>(avghashcov)*HS_freqest_heavyrepeat);
1110
uint32 crazyrepthashcov=static_cast<uint32>(static_cast<double>(avghashcov)*HS_freqest_crazyrepeat);
1111
uint32 maskhashcov=static_cast<uint32>(static_cast<double>(avghashcov)*HS_nastyrepeatratio);
1113
CEBUG("minnormalhashcov: " << minnormalhashcov << endl);
1114
CEBUG("maxnormalhashcov: " << maxnormalhashcov << endl);
1115
CEBUG("repeathashcov: " << repeathashcov << endl);
1117
vector<vhrap_t> singlereadvhraparray;
1118
singlereadvhraparray.reserve(10000);
1120
// we will not use a mask, but
1121
// we need to supply an empty one anyway
1122
vector<uint8> tagmaskvector;
1124
// stores in each read whether the given hash frequency was seen
1125
vector<uint8> hasfrequency(8);
1127
vector<uint8> mcmask;
1128
mcmask.reserve(10000);
1130
ProgressIndicator<int32> P(0, rp.size());
1132
multitag_t tmpmt(Read::REA_defaulttag_MNRr);
1134
for(uint32 actreadid=0; actreadid<rp.size(); actreadid++){
1135
P.progress(actreadid);
1137
//if(actreadid>100) return;
1139
Read & actread= rp.getRead(actreadid);
1140
if(!actread.hasValidData()
1141
|| !actread.isUsedInAssembly()) continue;
1143
// get rid of old values
1144
actread.clearAllBPosHashStats();
1146
//#define CEBUG(bla) {if(cebugok) cout << bla; cout.flush();}
1147
// bool cebugok=false;
1148
// if(actread.getName()=="E0K6C4E01CTNQI") cebugok=true;
1150
uint32 slen=actread.getLenClippedSeq();
1152
if(slen<basesperhash) continue;
1155
mcmask.resize(actread.getLenSeq(),0);
1157
hasfrequency.clear();
1158
hasfrequency.resize(8,0);
1160
CEBUG("name: " << actread.getName() << '\n');
1162
//cout << "Before ...\n";
1163
//Read::setCoutType(Read::AS_TEXT);
1166
singlereadvhraparray.resize(slen);
1167
tagmaskvector.resize(slen,0);
1169
vector<vhrap_t>::iterator srvaI=singlereadvhraparray.begin();
1171
vector<Read::bposhashstat_t> & bposhashstats=const_cast<vector<Read::bposhashstat_t> &>(actread.getBPosHashStats());
1175
int32 bfpos=actread.calcClippedPos2RawPos(0);
1178
hashesmade=Skim::transformSeqToVariableHash(
1181
actread.getClippedSeqAsChar(),
1193
singlereadvhraparray.resize(hashesmade);
1195
CEBUG("hashesmade: " << hashesmade << endl);
1197
vector<hashstat_t>::const_iterator lowerbound;
1198
vector<hashstat_t>::const_iterator upperbound;
1200
vector<hashstat_t>::const_iterator hssearchI;
1201
srvaI=singlereadvhraparray.begin();
1203
uint32 rarekmercount=rarekmermasking[actread.getSequencingType()];
1204
//uint32 rarekmercount=30;
1206
int32 bfpos1,bfpos2;
1209
for(; srvaI != singlereadvhraparray.end(); srvaI++){
1210
CEBUG(*srvaI << '\n');
1213
lowerbound=hsshortcuts_begin[srvaI->vhash & SKIM3_MAXVHASHMASK];
1214
upperbound=hsshortcuts_end[srvaI->vhash & SKIM3_MAXVHASHMASK];
1216
// "HS_empty_vector_hashstat_t.end()" is the "nullptr" replacement
1217
if(hashstats.end() != lowerbound){
1218
if(basesperhash>12){
1219
// with more than 12 bases in a hash, the array is subdivided
1220
hstmp.vhash=srvaI->vhash;
1221
hssearchI=lower_bound(lowerbound,
1224
Skim__compareHashStatHashElem_);
1225
if(hssearchI != hashstats.end()
1226
&& hssearchI->vhash == srvaI->vhash) foundit=true;
1228
hssearchI=lowerbound;
1232
CEBUG("---------- NO LB HIT??? -------\n");
1236
CEBUG("VHRAP: " << *srvaI << '\n');
1237
CEBUG("HashStat: " << *hssearchI << '\n');
1238
CEBUG("srvaI->hashpos: " << srvaI->hashpos << '\n');
1240
bfpos1=actread.calcClippedPos2RawPos(srvaI->hashpos-(basesperhash-1));
1241
bfpos2=bfpos1+basesperhash-1;
1243
CEBUG("b bfpos1: " << bfpos1 << '\t' << bposhashstats[bfpos1] << endl);
1244
CEBUG("b bfpos2: " << bfpos2 << '\t' << bposhashstats[bfpos2] << endl);
1246
bposhashstats[bfpos1].fwd.setValid();
1247
bposhashstats[bfpos2].rev.setValid();
1249
if(hssearchI->hasfwdrevthresholdok) {
1250
//bhs|=Read::BFLAGS_CONFIRMED_FWDREV;
1251
CEBUG("Set ConfFWDREV\n");
1252
bposhashstats[bfpos1].fwd.setConfirmedFwdRev();
1253
bposhashstats[bfpos2].rev.setConfirmedFwdRev();
1255
if(hssearchI->lowpos<=4){
1256
//bhs|=Read::BFLAGS_SEENATLOWPOS;
1257
CEBUG("Set SeenAtLowPos\n");
1258
bposhashstats[bfpos1].fwd.setSeenAtLowPos();
1259
bposhashstats[bfpos2].rev.setSeenAtLowPos();
1261
if(hssearchI->hasmultipleseqtype){
1262
//bhs|=Read::BFLAGS_CONFIRMED_MULTIPLESEQTYPE;
1263
CEBUG("Set ConfMultSeqType\n");
1264
bposhashstats[bfpos1].fwd.setConfirmedMultipleSeqType();
1265
bposhashstats[bfpos2].rev.setConfirmedMultipleSeqType();
1268
if(hssearchI->count == 1){
1270
}else if(hssearchI->count <= rarekmercount ){
1271
// maybe additional checks ... ?
1273
}else if(hssearchI->count<minnormalhashcov) {
1275
}else if(hssearchI->count>=minnormalhashcov
1276
&& hssearchI->count<=maxnormalhashcov) {
1278
//}else if(hssearchI->count > minnormalhashcov*20){
1279
}else if(hssearchI->count > crazyrepthashcov){
1281
}else if(hssearchI->count > heavyrepthashcov){
1283
}else if(hssearchI->count>=repeathashcov){
1288
CEBUG("Set frequency: " << static_cast<uint16>(frequency) << endl);
1290
if(maskhashcov>0 && hssearchI->count>=maskhashcov){
1291
for(uint32 j=0; j<basesperhash; j++){
1296
CEBUG("a1 bfpos1: " << bfpos1 << '\t' << bposhashstats[bfpos1] << endl);
1297
CEBUG("a1 bfpos2: " << bfpos2 << '\t' << bposhashstats[bfpos2] << endl);
1299
bposhashstats[bfpos1].fwd.setFrequency(frequency);
1300
bposhashstats[bfpos2].rev.setFrequency(frequency);
1302
CEBUG("a2 bfpos1: " << bfpos1 << '\t' << bposhashstats[bfpos1] << endl);
1303
CEBUG("a2 bfpos2: " << bfpos2 << '\t' << bposhashstats[bfpos2] << endl);
1305
hasfrequency[frequency]=1;
1308
//actread.setBaseFlagsInClippedSequence(bhs,
1309
// srvaI->hashpos-(basesperhash-1),
1311
//actread.setHasBaseFlags(true);
1315
actread.setHasFreqAvg(false);
1316
actread.setHasFreqRept(false);
1318
if(hasfrequency[3]){
1319
actread.setHasFreqAvg(true);
1321
if(hasfrequency[5] || hasfrequency[6] || hasfrequency[7]){
1322
actread.setHasFreqRept(true);
1325
actread.setHasBaseHashStats(true);
1327
//cout << "After ...\n";
1328
//Read::setCoutType(Read::AS_TEXT);
1332
// BaCh 07.04.2009 Bad Idea!!!
1333
// BaCh 12.07.2009 Why? Forgot ... :-(
1334
//// the fwd/rev of a read now looks like this (e.g.)
1335
//// (for better viewing dot == 0)
1337
//// f ..........2222222233333....355555....................
1338
//// r ................2222222....33333355555...............
1340
//// in dubio pro reo and to allow for potential matches,
1343
//// f ..........2222222233333....355555->..................
1344
//// r ..............<-2222222....33333355555...............
1348
//// f ..........2222222233333....35555555555...............
1349
//// r ..........2222222222222....33333355555...............
1357
// for(; bfposi<bposhashstats.size() && bposhashstats[bfposi].fwd.getFrequency()==0; bfposi++) {};
1358
// uint32 bfpose=bfposi;
1359
// for(; bfpose<bposhashstats.size() && bposhashstats[bfpose].rev.getFrequency()==0; bfpose++) {};
1360
// if(bfposi<bposhashstats.size() && bfpose<bposhashstats.size()){
1361
// for(uint32 i=bfposi; i<bfpose; i++){
1362
// bposhashstats[i].fwd=bposhashstats[bfpose].rev;
1366
// bfposi=bposhashstats.size()-1;
1367
// for(; bfposi>0 && bposhashstats[bfposi].rev.getFrequency()==0; bfposi--) {};
1369
// for(; bfpose>0 && bposhashstats[bfpose].fwd.getFrequency()==0; bfpose--) {};
1371
// for(uint32 i=bfposi; i>bfpose; i--){
1372
// bposhashstats[i].fwd=bposhashstats[bfpose].rev;
1378
// go through multicopy array and set MNRr tags for
1379
// consecutive positions in read tagged as multicopy
1380
if(masknastyrepeats){
1384
for(; pos<mcmask.size(); pos++){
1385
CEBUG("pos: " << pos << '\t' << static_cast<uint16>(mcmask[pos]) << '\t' << inrun << '\n');
1393
CEBUG("reprun " << actread.getName() << '\t' << runstart << '\t' << pos-1 << endl);
1394
tmpmt.from=runstart;
1396
actread.addTagO(tmpmt);
1402
CEBUG("reprun " << actread.getName() << '\t' << runstart << '\t' << pos-1 << endl);
1403
tmpmt.from=runstart;
1405
actread.addTagO(tmpmt);
1416
//cout << "\nskim Needs redo!\n";
1422
//#define CEBUG(bla)
1426
1179
/*************************************************************************