139
while (( i=*(probe++) )) {
140
if (i == PT_G || i == PT_C) t+=4.0;else t+= 2.0;
360
while ((i=*(probe++))) {
361
if (i == PT_G || i == PT_C) t+=4.0; else t += 2.0;
146
int ptnd_check_pure(char *probe)
149
while (( i=*(probe++) )) {
150
if ( i < PT_A || i > PT_T) return 1;
156
static void ptnd_calc_quality(PT_pdc *pdc) {
160
for (tprobe = pdc->tprobes;
162
tprobe = tprobe->next)
164
for (i=0; i< PERC_SIZE-1; i++) {
165
if (tprobe->perc[i] > tprobe->mishit) break;
167
tprobe->quality = ((double)tprobe->groupsize * i) + 1000.0/(1000.0 + tprobe->perc[i]) ;
170
/***********************************************************************
171
check the bond val for a probe
172
***********************************************************************/
173
static double ptnd_check_max_bond( PT_pdc *pdc, char base)
175
int complement = PT_complement(base);
176
return pdc->bond[ (complement-(int)PT_A)*4 + base-(int)PT_A].val;
179
double ptnd_check_split(PT_pdc *pdc, char *probe, int pos, char ref) {
180
int base = probe[pos];
181
int complement = PT_complement(base);
182
double max_bind = pdc->bond[ (complement-(int)PT_A)*4 + base-(int)PT_A].val;
183
double new_bind = pdc->bond[ (complement-(int)PT_A)*4 + ref-(int)PT_A].val;
185
if ( new_bind < pdc->split)
186
return new_bind-max_bind; /* negative values indicate split */
187
/* this mismatch splits the probe in several domains */
188
return (max_bind - new_bind);
192
/***********************************************************************
193
primary search for probes
194
***********************************************************************/
195
/* count all mishits for a probe */
197
struct ptnd_chain_count_mishits {
198
int operator()(int name, int apos, int rpos) {
199
char *probe = psg.probe;
200
psg.abs_pos.announce(apos);
201
if (psg.data[name].is_group) return 0; /* dont count group or neverminds */
204
while (*probe && psg.data[name].data[rpos]) {
205
if (psg.data[name].data[rpos] != *(probe))
366
static void tprobes_sumup_perc_and_calc_quality(PT_pdc *pdc) {
367
for (PT_tprobes *tprobe = pdc->tprobes; tprobe; tprobe = tprobe->next) {
370
for (i=0; i<PERC_SIZE; ++i) {
371
sum += tprobe->perc[i];
372
tprobe->perc[i] = sum;
375
// pt_assert(tprobe->perc[0] == tprobe->mishits); // OutgroupMatcher and count_mishits_for_matched do not agree!
376
// @@@ found one case where it differs by 1 (6176 in perc, 6177 in mishits). Can this be caused somehow by N or IUPAC in seq?
378
int limit = 2*tprobe->mishits;
379
for (i=0; i<(PERC_SIZE-1); ++i) {
380
if (tprobe->perc[i]>limit) break;
383
tprobe->quality = ((double)tprobe->groupsize * i) + 1000.0/(1000.0 + tprobe->perc[i]);
387
static double ptnd_check_max_bond(const PT_local *locs, char base) {
388
//! check the bond val for a probe
390
int complement = get_complement(base);
391
return locs->bond[(complement-(int)PT_A)*4 + base-(int)PT_A].val;
394
static char hitgroup_idx2char(int idx) {
395
const int firstSmall = ALPHA_SIZE/2;
397
if (idx >= firstSmall) {
398
c = 'a'+(idx-firstSmall);
403
pt_assert(isalpha(c));
407
inline int get_max_probelen(const PT_pdc *pdc) {
408
return pdc->max_probelen == -1 ? pdc->min_probelen : pdc->max_probelen;
411
inline int shown_apos(const PT_tprobes *tprobe) { return info2bio(tprobe->apos); }
412
inline int shown_ecoli(const PT_tprobes *tprobe) { return PT_abs_2_ecoli_rel(tprobe->apos+1); }
413
inline int shown_qual(const PT_tprobes *tprobe) { return int(tprobe->quality+0.5); }
416
int width[PERC_SIZE]; // of centigrade list columns
424
static inline void track_max(int& tracked, int val) { tracked = std::max(tracked, val); }
426
void collect(const int *perc) { for (int i = 0; i<PERC_SIZE; ++i) width[i] = std::max(width[i], perc[i]); }
427
void collect(const PT_tprobes *tprobe) {
428
collect(tprobe->perc);
429
track_max(apos_width, shown_apos(tprobe));
430
track_max(ecol_width, shown_ecoli(tprobe));
431
track_max(qual_width, shown_qual(tprobe));
432
track_max(grps_width, shown_qual(tprobe));
433
track_max(maxprobelen, tprobe->seq_len);
437
for (int i = 0; i<PERC_SIZE; ++i) width[i] = 0;
447
static inline int width4num(int num) {
455
return digits ? digits : 1;
459
PD_formatter() { clear(); }
460
PD_formatter(const PT_pdc *pdc) {
463
// collect max values for output columns:
464
for (PT_tprobes *tprobe = pdc->tprobes; tprobe; tprobe = tprobe->next) collect(tprobe);
466
// convert max.values to needed print-width:
467
for (int i = 0; i<PERC_SIZE; ++i) width[i] = width4num(width[i]);
469
apos_width = std::max(width4num(apos_width), 2);
470
ecol_width = std::max(width4num(ecol_width), 4);
471
qual_width = std::max(width4num(qual_width), 4);
472
grps_width = std::max(width4num(grps_width), 4);
475
int sprint_centigrade_list(char *buffer, const int *perc) const {
476
// format centigrade-decrement-list
478
for (int i = 0; i<PERC_SIZE; ++i) {
479
buffer[printed++] = ' ';
481
? sprintf(buffer+printed, "%*i", width[i], perc[i])
482
: sprintf(buffer+printed, "%*s", width[i], "-");
487
int get_max_designed_len() const { return maxprobelen; }
489
int get_apos_width() const { return apos_width; }
490
int get_ecol_width() const { return ecol_width; }
491
int get_qual_width() const { return qual_width; }
492
int get_grps_width() const { return grps_width; }
215
/* go down the tree to chains and leafs; count the species that are in the non member group */
216
static int ptnd_count_mishits2(POS_TREE *pt)
495
typedef std::map<const PT_pdc*const, PD_formatter> PD_Formatter4design;
496
static PD_Formatter4design format4design;
224
if (PT_read_type(pt) == PT_NT_LEAF) {
225
name = PT_read_name(psg.ptmain,pt);
226
int apos = PT_read_apos(psg.ptmain,pt);
227
psg.abs_pos.announce(apos);
228
if (!psg.data[name].is_group) return 1;
230
}else if (PT_read_type(pt) == PT_NT_CHAIN) {
233
PT_read_chain(psg.ptmain,pt, ptnd_chain_count_mishits());
236
for (base = PT_QU; base< PT_B_MAX; base++) {
237
mishits += ptnd_count_mishits2(PT_read_son(psg.ptmain,pt,(PT_BASES)base));
498
inline const PD_formatter& get_formatter(const PT_pdc *pdc) {
499
PD_Formatter4design::iterator found = format4design.find(pdc);
500
if (found == format4design.end()) {
501
format4design[pdc] = PD_formatter(pdc);
502
found = format4design.find(pdc);
504
pt_assert(found != format4design.end());
506
return found->second;
242
extern "C" char *get_design_info(PT_tprobes *tprobe)
244
char *buffer = (char *)GB_give_buffer(2000);
245
char *probe = (char *)GB_give_buffer2(tprobe->seq_len + 10);
246
PT_pdc *pdc = (PT_pdc *)tprobe->mh.parent->parent;
508
inline void erase_formatter(const PT_pdc *pdc) { format4design.erase(pdc); }
510
char *get_design_info(const PT_tprobes *tprobe) {
513
const int BUFFERSIZE = 2000;
514
char *buffer = (char *)GB_give_buffer(BUFFERSIZE);
515
PT_pdc *pdc = (PT_pdc *)tprobe->mh.parent->parent;
517
const PD_formatter& formatter = get_formatter(pdc);
254
strcpy(probe,tprobe->sequence);
255
PT_base_2_string(probe,0); /* convert probe to real ASCII */
256
sprintf(p, "%-*s", pdc->probelen+1, probe);
259
int apos = tprobe->apos;
262
{ // search nearest hit
263
for (c=0;c<ALPHA_SIZE;c++) {
521
int len = tprobe->seq_len;
522
char *probe = (char *)GB_give_buffer2(len+1);
523
strcpy(probe, tprobe->sequence);
525
probe_2_readable(probe, len); // convert probe to real ASCII
526
p += sprintf(p, "%-*s", formatter.get_max_designed_len()+1, probe);
530
int apos = shown_apos(tprobe);
534
// search nearest previous hit
535
for (c=0; c<ALPHA_SIZE; c++) {
264
536
if (!pdc->pos_groups[c]) { // new group
265
pdc->pos_groups[c] = tprobe->apos;
537
pdc->pos_groups[c] = apos;
268
int dist = tprobe->apos - pdc->pos_groups[c];
269
if (dist<0) dist = -dist;
270
if ( dist < pdc->probelen) {
271
apos = tprobe->apos - pdc->pos_groups[c];
541
int dist = abs(apos - pdc->pos_groups[c]);
542
if (dist < pdc->min_probelen) {
543
apos = apos - pdc->pos_groups[c];
275
apos = -apos; cs = '-';
555
c = hitgroup_idx2char(c);
558
// ran out of pos_groups
564
p += sprintf(p, "%2i ", tprobe->seq_len);
565
p += sprintf(p, "%c%c%*i ", c, cs, formatter.get_apos_width(), apos);
566
p += sprintf(p, "%*i ", formatter.get_ecol_width(), shown_ecoli(tprobe)); // ecoli-bases inclusive apos ("bases before apos+1")
283
sprintf(p,"%2i %c%c%6i %4li ", tprobe->seq_len, c+'A', cs, apos, PT_abs_2_rel(tprobe->apos));
287
sprintf(p,"%4i ",tprobe->groupsize);
291
sprintf(p,"%-4.1f ",((double)pt_get_gc_content(tprobe->sequence))/tprobe->seq_len*100.0);
295
sprintf(p,"%-7.1f ", pt_get_temperature(tprobe->sequence));
569
p += sprintf(p, "%*i ", formatter.get_qual_width(), shown_qual(tprobe));
570
p += sprintf(p, "%*i ", formatter.get_grps_width(), tprobe->groupsize);
571
p += sprintf(p, "%5.1f", ((double)pt_get_gc_content(tprobe->sequence))/tprobe->seq_len*100.0); // G+C
572
p += sprintf(p, "%5.1f ", pt_get_temperature(tprobe->sequence)); // temperature
299
probe = reverse_probe(tprobe->sequence,0);
300
complement_probe(probe,0);
301
PT_base_2_string(probe,0); /* convert probe to real ASCII */
302
sprintf(p,"%-*s |", pdc->probelen, probe);
576
char *probe = create_reversed_probe(tprobe->sequence, tprobe->seq_len);
577
complement_probe(probe, tprobe->seq_len);
578
probe_2_readable(probe, tprobe->seq_len); // convert probe to real ASCII
579
p += sprintf(p, "%*s |", formatter.get_max_designed_len(), probe);
306
583
// non-group hits by temp. decrease
307
for (sum=i=0;i<PERC_SIZE;i++) {
308
sum+= tprobe->perc[i];
309
sprintf(p,"%3i,",sum);
317
extern "C" char *get_design_hinfo(PT_tprobes *tprobe) {
318
char *buffer = (char *)GB_give_buffer(2000);
323
return (char*)"Sorry, there are no probes for your selection !!!";
325
pdc = (PT_pdc *)tprobe->mh.parent->parent;
327
"Probe design Parameters:\n"
328
"Length of probe %4i\n"
329
"Temperature [%4.1f -%4.1f]\n"
330
"GC-Content [%4.1f -%4.1f]\n"
331
"E.Coli Position [%4i -%4i]\n"
332
"Max Non Group Hits %4i\n"
333
"Min Group Hits %4.0f%%\n",
335
pdc->mintemp,pdc->maxtemp,
336
pdc->min_gc*100.0,pdc->max_gc*100.0,
337
pdc->minpos, pdc->maxpos,
338
pdc->mishit, pdc->mintarget*100.0);
342
sprintf(s,"%-*s", pdc->probelen+1, "Target");
345
sprintf(s,"%2s %8s %4s ","le","apos","ecol");
348
sprintf(s,"%4s ","grps"); // groupsize
351
sprintf(s,"%-4s %-7s %-*s | "," G+C","4GC+2AT", pdc->probelen, "Probe sequence");
354
sprintf(s, "Decrease T by n*.3C -> probe matches n non group species");
360
/* search down the tree to find matching species for the given probe */
361
static int ptnd_count_mishits(char *probe, POS_TREE *pt,int height)
584
p += formatter.sprint_centigrade_list(p, tprobe->perc);
585
if (!tprobe->next) erase_formatter(pdc); // erase formatter when done with last probe
587
pt_assert((p-buffer)<BUFFERSIZE);
591
char *get_design_hinfo(const PT_pdc *pdc) {
592
const int BUFFERSIZE = 2000;
593
char *buffer = (char *)GB_give_buffer(BUFFERSIZE);
596
const PD_formatter& formatter = get_formatter(pdc);
599
char *ecolipos = NULL;
600
if (pdc->min_ecolipos == -1) {
601
if (pdc->max_ecolipos == -1) {
602
ecolipos = strdup("any");
605
ecolipos = GBS_global_string_copy("<= %i", pdc->max_ecolipos);
609
if (pdc->max_ecolipos == -1) {
610
ecolipos = GBS_global_string_copy(">= %i", pdc->min_ecolipos);
613
ecolipos = GBS_global_string_copy("%4i -%5i", pdc->min_ecolipos, pdc->max_ecolipos);
617
char *mishit_annotation = NULL;
618
if (pdc->min_rj_mishits<INT_MAX) {
619
mishit_annotation = GBS_global_string_copy(" (lowest rejected nongroup hits: %i)", pdc->min_rj_mishits);
622
char *coverage_annotation = NULL;
623
if (pdc->max_rj_coverage>0.0) {
624
coverage_annotation = GBS_global_string_copy(" (max. rejected coverage: %.0f%%)", pdc->max_rj_coverage*100.0);
628
if (get_max_probelen(pdc) == pdc->min_probelen) {
629
probelen_info = GBS_global_string_copy("%i", pdc->min_probelen);
632
probelen_info = GBS_global_string_copy("%i-%i", pdc->min_probelen, get_max_probelen(pdc));
636
"Probe design parameters:\n"
637
"Length of probe %s\n"
638
"Temperature [%4.1f -%5.1f]\n"
639
"GC-content [%4.1f -%5.1f]\n"
640
"E.Coli position [%s]\n"
641
"Max. nongroup hits %i%s\n"
642
"Min. group hits %.0f%%%s\n",
644
pdc->mintemp, pdc->maxtemp,
645
pdc->min_gc*100.0, pdc->max_gc*100.0,
647
pdc->max_mishits, mishit_annotation ? mishit_annotation : "",
648
pdc->min_coverage*100.0, coverage_annotation ? coverage_annotation : "");
651
free(coverage_annotation);
652
free(mishit_annotation);
658
int maxprobelen = formatter.get_max_designed_len();
660
s += sprintf(s, "%-*s", maxprobelen+1, "Target");
661
s += sprintf(s, "le ");
662
s += sprintf(s, "%*s ", formatter.get_apos_width()+2, "apos");
663
s += sprintf(s, "%*s ", formatter.get_ecol_width(), "ecol");
664
s += sprintf(s, "%*s ", formatter.get_qual_width(), "qual");
665
s += sprintf(s, "%*s ", formatter.get_grps_width(), "grps");
666
s += sprintf(s, " G+C temp ");
667
s += sprintf(s, "%*s | ", maxprobelen, maxprobelen<14 ? "Probe" : "Probe sequence");
668
s += sprintf(s, "Decrease T by n*.3C -> probe matches n non group species");
671
s += sprintf(s, "No probes found!");
672
erase_formatter(pdc);
675
pt_assert((s-buffer)<BUFFERSIZE);
680
struct ptnd_chain_count_mishits {
685
ptnd_chain_count_mishits() : mishits(0), probe(NULL), height(0) {}
686
ptnd_chain_count_mishits(const char *probe_, int height_) : mishits(0), probe(probe_), height(height_) {}
688
int operator()(const AbsLoc& probeLoc) {
689
// count all mishits for a probe
691
psg.abs_pos.announce(probeLoc.get_abs_pos());
693
if (probeLoc.get_pid().outside_group()) {
695
pt_assert(probe[0]); // if this case occurs, avoid entering this branch
697
DataLoc dataLoc(probeLoc);
698
ReadableDataLoc readableLoc(dataLoc);
699
for (int i = 0; probe[i] && readableLoc[height+i]; ++i) {
700
if (probe[i] != readableLoc[height+i]) return 0;
709
static int count_mishits_for_all(POS_TREE2 *pt) {
710
//! go down the tree to chains and leafs; count the species that are in the non member group
716
psg.abs_pos.announce(loc.get_abs_pos());
717
return loc.get_pid().outside_group();
720
if (pt->is_chain()) {
721
ptnd_chain_count_mishits counter;
722
PT_forwhole_chain(pt, counter); // @@@ expand
723
return counter.mishits;
727
for (int base = PT_QU; base< PT_BASES; base++) {
728
mishits += count_mishits_for_all(PT_read_son(pt, (PT_base)base));
733
static int count_mishits_for_matched(char *probe, POS_TREE2 *pt, int height) {
734
//! search down the tree to find matching species for the given probe
369
735
if (!pt) return 0;
371
if (PT_read_type(pt) == PT_NT_NODE && *probe) {
372
for (i=PT_A; i<PT_B_MAX; i++) {
373
if (i!= *probe) continue;
374
if (( pthelp = PT_read_son(psg.ptmain,pt, (PT_BASES)i) ))
375
mishits += ptnd_count_mishits(probe+1,pthelp, height+1);
736
if (pt->is_node() && *probe) {
738
for (int i=PT_A; i<PT_BASES; i++) {
739
if (i != *probe) continue;
740
POS_TREE2 *pthelp = PT_read_son(pt, (PT_base)i);
741
if (pthelp) mishits += count_mishits_for_matched(probe+1, pthelp, height+1);
380
if (PT_read_type(pt) == PT_NT_LEAF) {
381
pos = PT_read_rpos(psg.ptmain,pt) + height;
382
name = PT_read_name(psg.ptmain,pt);
383
if (pos + (int)(strlen(probe)) >= psg.data[name].size) /* after end */
747
const ReadableDataLoc loc(pt);
749
int pos = loc.get_rel_pos()+height;
751
if (pos + (int)(strlen(probe)) >= loc.get_pid().get_size()) // after end // @@@ wrong check ? better return from loop below when ref is PT_QU
387
if (psg.data[name].data[pos++] != *(probe++))
754
for (int i = 0; probe[i] && loc[height+i]; ++i) {
755
if (probe[i] != loc[height+i]) {
394
PT_read_chain(psg.ptmain,pt, ptnd_chain_count_mishits());
761
ptnd_chain_count_mishits counter(probe, height);
762
PT_forwhole_chain(pt, counter); // @@@ expand
763
return counter.mishits;
398
return ptnd_count_mishits2(pt);
766
return count_mishits_for_all(pt);
401
static void ptnd_first_check(PT_pdc *pdc) /* checks the direkt mishits */
403
PT_tprobes *tprobe, *tprobe_next;
404
for ( tprobe = pdc->tprobes;
406
tprobe = tprobe_next ) {
407
tprobe_next = tprobe->next;
769
static void remove_tprobes_with_too_many_mishits(PT_pdc *pdc) {
770
//! Check for direct mishits
771
// tracks minimum rejected mishits in 'pdc->min_rj_mishits'
772
int min_rej_mishit_amount = INT_MAX;
774
for (PT_tprobes *tprobe = pdc->tprobes; tprobe; ) {
775
PT_tprobes *tprobe_next = tprobe->next;
408
777
psg.abs_pos.clear();
409
tprobe->mishit = ptnd_count_mishits(tprobe->sequence,psg.pt,0);
410
tprobe->apos = psg.abs_pos.get_most_used();
411
if (tprobe->mishit > pdc->mishit) {
778
tprobe->mishits = count_mishits_for_matched(tprobe->sequence, psg.TREE_ROOT2(), 0);
779
tprobe->apos = psg.abs_pos.get_most_used();
780
if (tprobe->mishits > pdc->max_mishits) {
781
min_rej_mishit_amount = std::min(min_rej_mishit_amount, tprobe->mishits);
412
782
destroy_PT_tprobes(tprobe);
784
tprobe = tprobe_next;
415
786
psg.abs_pos.clear();
788
pdc->min_rj_mishits = std::min(pdc->min_rj_mishits, min_rej_mishit_amount);
417
/***********************************************************************
418
Check the probes Position
419
***********************************************************************/
421
static void ptnd_check_position(PT_pdc *pdc) /* checks the direkt mishits */
423
PT_tprobes *tprobe, *tprobe_next;
424
if ( pdc->minpos == pdc->maxpos) return;
426
for ( tprobe = pdc->tprobes;
428
tprobe = tprobe_next ) {
429
tprobe_next = tprobe->next;
430
long relpos = PT_abs_2_rel(tprobe->apos);
431
if (relpos < pdc->minpos || relpos > pdc->maxpos) {
791
static void remove_tprobes_outside_ecoli_range(PT_pdc *pdc) {
792
//! Check the probes position.
793
// if (pdc->min_ecolipos == pdc->max_ecolipos) return; // @@@ wtf was this for?
795
for (PT_tprobes *tprobe = pdc->tprobes; tprobe; ) {
796
PT_tprobes *tprobe_next = tprobe->next;
797
long relpos = PT_abs_2_ecoli_rel(tprobe->apos+1);
798
if (relpos < pdc->min_ecolipos || (relpos > pdc->max_ecolipos && pdc->max_ecolipos != -1)) {
432
799
destroy_PT_tprobes(tprobe);
801
tprobe = tprobe_next;
436
/***********************************************************************
437
check the average bond size
438
***********************************************************************/
439
static void ptnd_check_bonds(PT_pdc *pdc, int match) /* checks probe hairpin bonds */
441
PT_tprobes *tprobe, *tprobe_next;
444
for ( tprobe = pdc->tprobes;
446
tprobe = tprobe_next ) {
447
tprobe_next = tprobe->next;
805
static size_t tprobes_calculate_bonds(PT_local *locs) {
806
/*! check the average bond size.
807
* returns number of tprobes
809
* @TODO checks probe hairpin bonds
812
PT_pdc *pdc = locs->pdc;
814
for (PT_tprobes *tprobe = pdc->tprobes; tprobe; ) {
815
PT_tprobes *tprobe_next = tprobe->next;
448
816
tprobe->seq_len = strlen(tprobe->sequence);
450
for (i=0;i<tprobe->seq_len;i++) {
451
sbond += ptnd_check_max_bond(pdc,tprobe->sequence[i]);
818
for (int i=0; i<tprobe->seq_len; i++) {
819
sbond += ptnd_check_max_bond(locs, tprobe->sequence[i]);
453
821
tprobe->sum_bonds = sbond;
457
/***********************************************************************
458
split the probes in probeparts
459
***********************************************************************/
460
static void ptnd_cp_tprobe_2_probepart(PT_pdc *pdc)
463
PT_probeparts *parts;
467
while (pdc->parts) destroy_PT_probeparts(pdc->parts);
468
for ( tprobe = pdc->tprobes;
470
tprobe = tprobe->next ) {
471
probelen = strlen(tprobe->sequence);
472
probelen -= DOMAIN_MIN_LENGTH;
473
for (pos = 0; pos < probelen; pos ++ ) {
474
parts = create_PT_probeparts();
475
parts->sequence = strdup(tprobe->sequence+pos);
476
parts->source = tprobe;
478
aisc_link(&pdc->pparts, parts);
482
static void ptnd_duplikate_probepart_rek(PT_pdc *pdc, char *insequence, int deep, double dt,double sum_bonds, PT_probeparts *parts)
484
PT_probeparts *newparts;
489
double ndt,nsum_bonds;
492
sequence = strdup(insequence);
493
if (deep >= PT_PART_DEEP) { /* now do it */
494
newparts = create_PT_probeparts();
495
newparts->sequence = sequence;
496
newparts->source = parts->source;
498
newparts->start = parts->start;
499
newparts->sum_bonds = sum_bonds;
500
aisc_link(&pdc->pdparts, newparts);
503
base = sequence[deep];
504
max_bind = ptnd_check_max_bond(pdc, base);
505
for (i = PT_A; i <=PT_T; i++) {
507
if ( (split = ptnd_check_split(pdc, insequence, deep, i)) < 0.0) continue;
508
/* this mismatch splits the probe in several domains */
510
nsum_bonds = sum_bonds+max_bind-split;
511
ptnd_duplikate_probepart_rek(pdc,sequence,deep+1,ndt,nsum_bonds,parts);
515
static void ptnd_duplikate_probepart(PT_pdc *pdc)
517
PT_probeparts *parts;
518
for (parts = pdc->parts; parts; parts = parts->next)
519
ptnd_duplikate_probepart_rek(pdc,parts->sequence,0,parts->dt,0.0, parts);
521
while (( parts = pdc->parts ))
522
destroy_PT_probeparts(parts); /* delete the source */
524
while (( parts = pdc->dparts ))
526
aisc_unlink((struct_dllheader_ext*)parts);
527
aisc_link(&pdc->pparts, parts);
530
/***********************************************************************
531
sort the parts and check for duplicated parts
532
***********************************************************************/
534
static int ptnd_compare_parts(const void *PT_probeparts_ptr1, const void *PT_probeparts_ptr2, void*) {
535
const PT_probeparts *tprobe1 = (const PT_probeparts*)PT_probeparts_ptr1;
536
const PT_probeparts *tprobe2 = (const PT_probeparts*)PT_probeparts_ptr2;
538
return strcmp(tprobe1->sequence,tprobe2->sequence);
541
static void ptnd_sort_parts(PT_pdc *pdc)
543
PT_probeparts **my_list;
545
PT_probeparts *tprobe;
548
if (!pdc->parts) return;
549
list_len = get_TPROBEPARTS_CNT(pdc->parts);
550
if (list_len <= 1) return;
551
my_list = (PT_probeparts **)calloc(sizeof(void *),list_len);
552
for ( i=0, tprobe = pdc->parts;
554
i++,tprobe=tprobe->next ) {
557
GB_sort((void **)my_list, 0, list_len, ptnd_compare_parts, 0);
559
for (i=0;i<list_len;i++) {
560
aisc_unlink((struct_dllheader_ext*)my_list[i]);
561
aisc_link(&pdc->pparts, my_list[i]);
563
free((char *)my_list);
565
static void ptnd_remove_duplikated_probepart(PT_pdc *pdc)
567
PT_probeparts *parts, *parts_next;
568
for ( parts = pdc->parts;
570
parts = parts_next) {
571
parts_next = parts->next;
573
if ( (parts->source== parts_next->source) && !strcmp(parts->sequence, parts_next->sequence)) { /* equal sequence */
574
if (parts->dt < parts_next->dt) { /* delete higher dt */
575
destroy_PT_probeparts(parts_next);
578
destroy_PT_probeparts(parts);
584
/***********************************************************************
585
test the probe parts, search the longest non mismatch string
586
***********************************************************************/
587
static void ptnd_check_part_inc_dt(PT_pdc *pdc, PT_probeparts *parts,
588
int name, int apos, int rpos,
589
double dt,double sum_bonds)
591
PT_tprobes *tprobe = parts->source;
599
if (( start = parts->start )) { /* look backwards */
600
probe = parts->source->sequence;
602
start--; /* test the base left of start */
605
if (pos<0) break; /* out of sight */
606
h = ptnd_check_split(ptnd.pdc, probe, start,psg.data[name].data[pos]);
607
if (h>0.0 && !split) return; /* there is a longer part matching this */
614
ndt = (dt * pdc->dt + (tprobe->sum_bonds - sum_bonds)*pdc->dte ) / tprobe->seq_len;
619
if (pos >= PERC_SIZE) return; /* out of observation */
620
tprobe->perc[pos] ++;
621
if (ptnd.new_match) { /* save the result in probematch */
622
PT_probematch *match;
623
if (psg.data[name].match) {
624
if (psg.data[name].match->dt < ndt) return;
625
/* there is a better hit for that sequence */
626
match = psg.data[name].match;
628
match = create_PT_probematch();
629
aisc_link(&ptnd.locs->ppm, match);
630
psg.data[name].match = match;
633
match->b_pos = apos - parts->start; /* thats not correct !!! */
634
match->rpos = rpos-parts->start;
635
match->N_mismatches = -1; /* there are no mismatches in this mode */
636
match->mismatches = -1;
637
match->wmismatches = dt; /* only weighted mismatches (maybe) */
639
match->sequence = psg.main_probe;
640
match->reversed = psg.reversed;
643
static int ptnd_check_inc_mode(PT_pdc *pdc ,PT_probeparts *parts,double dt,double sum_bonds)
645
PT_tprobes *tprobe = parts->source;
646
double ndt = (dt * pdc->dt + (tprobe->sum_bonds - sum_bonds)*pdc->dte ) / tprobe->seq_len;
651
if (pos >= PERC_SIZE) return 1; /* out of observation */
655
struct ptnd_chain_check_part {
656
ptnd_chain_check_part(int s) : split(s) {}
658
int operator() (int name, int apos, int rpos)
660
char *probe = psg.probe;
661
int height = psg.height;
662
double sbond = ptnd.sum_bonds;
668
if (!ptnd.new_match && psg.data[name].is_group) return 0; /* dont count group or neverminds */
670
pos = rpos+psg.height;
671
while (probe[height] && (base = psg.data[name].data[pos])) {
672
if (!split && (h = ptnd_check_split(ptnd.pdc, probe, height,
678
sbond += ptnd_check_max_bond(ptnd.pdc,probe[height]) - h;
683
ptnd_check_part_inc_dt(ptnd.pdc,ptnd.parts,name,apos,rpos,dt,sbond);
688
/* go down the tree to chains and leafs; check all */
689
static void ptnd_check_part_all(POS_TREE *pt, double dt, double sum_bonds)
692
int name, apos, rpos;
696
if (PT_read_type(pt) == PT_NT_LEAF) {
697
name = PT_read_name(psg.ptmain,pt);
698
if (!ptnd.new_match && psg.data[name].is_group) return;
699
rpos = PT_read_rpos(psg.ptmain,pt);
700
apos = PT_read_apos(psg.ptmain,pt);
701
ptnd_check_part_inc_dt( ptnd.pdc, ptnd.parts,
702
name,apos,rpos, dt, sum_bonds);
703
}else if (PT_read_type(pt) == PT_NT_CHAIN) {
706
ptnd.sum_bonds = sum_bonds;
707
PT_read_chain(psg.ptmain,pt, ptnd_chain_check_part(0));
709
for (base = PT_QU; base< PT_B_MAX; base++) {
710
ptnd_check_part_all(PT_read_son(psg.ptmain,pt,(PT_BASES)base),dt,sum_bonds);
714
/* search down the tree to find matching species for the given probe */
715
static void ptnd_check_part(char *probe, POS_TREE *pt,int height, double dt, double sum_bonds, int split)
721
double ndt,nsum_bonds,h;
726
if (dt/ptnd.parts->source->seq_len > PERC_SIZE) return; /* out of scope */
727
if (PT_read_type(pt) == PT_NT_NODE && probe[height]) {
728
if (split && ptnd_check_inc_mode(ptnd.pdc ,ptnd.parts, dt, sum_bonds)) return;
729
for (i=PT_A; i<PT_B_MAX; i++) {
730
if (( pthelp = PT_read_son(psg.ptmain,pt, (PT_BASES)i) ))
733
nsum_bonds = sum_bonds;
734
if (height < PT_PART_DEEP) {
735
if ( i != probe[height] ) continue;
739
h = ptnd_check_split(ptnd.pdc, probe, height,i);
740
if (h>0.0) ndt = dt+h; else ndt = dt-h;
741
}else if ((h = ptnd_check_split(ptnd.pdc, probe, height,i)) < 0.0 ) {
747
ptnd_check_max_bond(ptnd.pdc,probe[height]) - h;
750
if (nsplit && height <= DOMAIN_MIN_LENGTH) continue;
751
ptnd_check_part(probe,pthelp,height+1,ndt,nsum_bonds,nsplit);
757
if (PT_read_type(pt) == PT_NT_LEAF) {
758
name = PT_read_name(psg.ptmain,pt);
759
if (!ptnd.new_match && psg.data[name].is_group) return;
760
rpos = PT_read_rpos(psg.ptmain,pt);
761
apos = PT_read_apos(psg.ptmain,pt);
763
if (pos + (int)(strlen(probe+height)) >= psg.data[name].size) /* after end */
765
while (probe[height] && (ref = psg.data[name].data[pos])) {
767
h = ptnd_check_split(ptnd.pdc, probe, height,ref);
768
if (h<0.0) dt -= h; else dt += h;
769
}else if ( (h = ptnd_check_split(ptnd.pdc, probe, height,
775
sum_bonds += ptnd_check_max_bond(ptnd.pdc,probe[height]) - h;
779
ptnd_check_part_inc_dt( ptnd.pdc,ptnd.parts,
780
name, apos, rpos, dt, sum_bonds);
786
ptnd.sum_bonds = sum_bonds;
787
PT_read_chain(psg.ptmain,pt, ptnd_chain_check_part(split));
791
ptnd_check_part_all(pt,dt,sum_bonds);
793
static void ptnd_check_probepart(PT_pdc *pdc)
795
PT_probeparts *parts;
797
for ( parts = pdc->parts;
799
parts = parts->next) {
801
ptnd_check_part(parts->sequence, psg.pt, 0, parts->dt, parts->sum_bonds,0);
804
/***********************************************************************
805
search for possible probes
806
***********************************************************************/
807
inline int ptnd_check_tprobe(PT_pdc *pdc, const char *probe, int len)
809
int occ[PT_B_MAX] = { 0, 0, 0, 0, 0, 0 };
813
occ[safeCharIndex(probe[count++])]++;
815
int gc = occ[PT_G]+occ[PT_C];
816
int at = occ[PT_A]+occ[PT_T];
820
if (all < len) return 1; // there were some unwanted characters ('N' or '.')
821
if (all < pdc->probelen) return 1;
823
if (gc < pdc->min_gc*len) return 1;
824
if (gc > pdc->max_gc*len) return 1;
826
double temp = at*2 + gc*4;
827
if (temp< pdc->mintemp) return 1;
828
if (temp>pdc->maxtemp) return 1;
833
static long ptnd_build_probes_collect(const char *probe, long count, void*) {
835
if (count >= ptnd.locs->group_count*ptnd.pdc->mintarget) {
836
tprobe = create_PT_tprobes();
837
tprobe->sequence = strdup(probe);
838
tprobe->temp = pt_get_temperature(probe);
839
tprobe->groupsize = (int)count;
840
aisc_link(&ptnd.pdc->ptprobes, tprobe);
824
tprobe = tprobe_next;
845
inline void PT_incr_hash(GB_HASH *hash, char *sequence, int len) {
846
char c = sequence[len];
849
pt_assert(strlen(sequence) == (size_t)len);
851
GBS_incr_hash(hash, sequence);
856
static void ptnd_add_sequence_to_hash(PT_pdc *pdc, GB_HASH *hash, char *sequence, int seq_len, int probe_len, char *prefix, int prefix_len) {
858
if (*prefix) { // partition search, else very large hash tables (>60 mbytes)
859
for (pos = seq_len-probe_len; pos >= 0; pos--, sequence++) {
860
if ((*prefix!= *sequence || strncmp(sequence,prefix,prefix_len)) ) continue;
861
if (!ptnd_check_tprobe(pdc,sequence, probe_len)) {
862
PT_incr_hash(hash, sequence, probe_len);
867
for (pos = seq_len-probe_len; pos >= 0; pos--, sequence++) {
868
if (!ptnd_check_tprobe(pdc,sequence, probe_len)) {
869
PT_incr_hash(hash, sequence, probe_len);
875
static long ptnd_collect_hash(const char *key, long val, void *gb_hash) {
877
GBS_incr_hash((GB_HASH*)gb_hash, key);
881
#if defined(DEVEL_RALF)
882
static long avoid_hash_warning(const char *, long , void*) { return 0; }
885
static void ptnd_build_tprobes(PT_pdc *pdc, int group_count) {
886
int *group_idx = new int[group_count];
887
unsigned long datasize = 0; // of marked species/genes
888
unsigned long maxseqlength = 0; // of marked species/genes
890
// get group indices and count datasize of marked species
893
for (int name = 0; name < psg.data_count; name++) {
894
if(psg.data[name].is_group == 1) {
895
group_idx[used_idx++] = name; // store marked group indices
896
unsigned long size = psg.data[name].size;
898
if (datasize<size) datasize = ULONG_MAX; // avoid overflow!
899
if (size > maxseqlength) maxseqlength = size;
902
pt_assert(used_idx == group_count);
905
unsigned long outer_hash_size;
906
int partsize = 0; // no partitions
908
const unsigned long max_allowed_hash_size = 1000000; // approx.
909
const unsigned long max_allowed_tprobes = (unsigned long)(max_allowed_hash_size*0.66); // results in 66% fill rate for hash (which is much!)
911
// tests found about 5-8% tprobes (in relation to datasize) -> we use 10% here
912
unsigned long estimated_no_of_tprobes = (unsigned long)((datasize-pdc->probelen+1)*0.10+0.5);
914
while (estimated_no_of_tprobes > max_allowed_tprobes) {
916
estimated_no_of_tprobes /= 4; // 4 different bases
919
outer_hash_size = (unsigned long)(estimated_no_of_tprobes*1.5);
922
printf("marked=%i datasize=%lu partsize=%i estimated_no_of_tprobes=%lu outer_hash_size=%lu\n",
923
group_count, datasize, partsize, estimated_no_of_tprobes, outer_hash_size);
928
GBS_clear_hash_statistic_summary("outer");
931
const int hash_multiply = 4; // hash fill ratio is 1/hash_multiply
933
char partstring[256];
934
PT_init_base_string_counter(partstring,PT_A,partsize);
935
while (!PT_base_string_counter_eof(partstring)) {
937
fputs("partition='", stdout);
938
for (int i = 0; i<partsize; ++i) {
939
putchar("ACGT"[partstring[i]-PT_A]);
941
fputs("'\n", stdout);
944
GB_HASH *hash_outer = GBS_create_hash(outer_hash_size, GB_MIND_CASE); // count in how many groups/sequences the tprobe occurs (key = tprobe, value = counter)
947
GBS_clear_hash_statistic_summary("inner");
950
// for (name = 0; name < psg.data_count; name++) {
951
// if(psg.data[name].is_group != 1) continue;
952
for (int g = 0; g<group_count; ++g) {
953
int name = group_idx[g];
954
long possible_tprobes = psg.data[name].size-pdc->probelen+1;
955
GB_HASH *hash_one = GBS_create_hash(possible_tprobes*hash_multiply, GB_MIND_CASE); // count tprobe occurrences for one group/sequence
956
ptnd_add_sequence_to_hash(pdc, hash_one, psg.data[name].data, psg.data[name].size, pdc->probelen, partstring, partsize);
957
GBS_hash_do_loop(hash_one, ptnd_collect_hash, hash_outer); // merge hash_one into hash
959
GBS_calc_hash_statistic(hash_one, "inner", 0);
961
GBS_free_hash(hash_one);
964
for ( seq = pdc->sequences; seq; seq = seq->next) {
965
long possible_tprobes = seq->seq.size-pdc->probelen+1;
966
GB_HASH *hash_one = GBS_create_hash(possible_tprobes*hash_multiply, GB_MIND_CASE); // count tprobe occurrences for one group/sequence
967
ptnd_add_sequence_to_hash(pdc, hash_one, seq->seq.data, seq->seq.size, pdc->probelen, partstring, partsize);
968
GBS_hash_do_loop(hash_one, ptnd_collect_hash, hash_outer); // merge hash_one into hash
970
GBS_calc_hash_statistic(hash_one, "inner", 0);
972
GBS_free_hash(hash_one);
976
GBS_print_hash_statistic_summary("inner");
979
GBS_hash_do_loop(hash_outer, ptnd_build_probes_collect, NULL);
982
GBS_calc_hash_statistic(hash_outer, "outer", 1);
983
#if defined(DEVEL_RALF)
984
GBS_hash_do_loop(hash_outer, avoid_hash_warning, NULL); // hash is known to be overfilled - avoid assertion
988
GBS_free_hash(hash_outer);
989
PT_inc_base_string_count(partstring,PT_A,PT_B_MAX,partsize);
992
GBS_print_hash_statistic_summary("outer");
997
static void ptnd_print_probes(PT_pdc *pdc) {
1002
for ( tprobe = pdc->tprobes;
1004
tprobe = tprobe->next ) {
1005
strncpy(buffer,tprobe->sequence,250);
1007
PT_base_2_string(buffer,0);
1008
printf("seq: %s group %i mishits: %i ",buffer,tprobe->groupsize,tprobe->mishit);
1009
for (i=0;i<PERC_SIZE;i++) {
1010
printf("%3i,",tprobe->perc[i]);
1017
extern "C" int PT_start_design(PT_pdc *pdc, int /*dummy*/) {
1021
PT_local *locs = (PT_local*)pdc->mh.parent->parent;
1028
char *unknown_names = ptpd_read_names(locs, pdc->names.data, pdc->checksums.data, error); /* read the names */
1029
if (unknown_names) free(unknown_names); // unused here (handled by PT_unknown_names)
1033
pt_export_error(locs, error);
1038
for ( seq = pdc->sequences; seq; seq = seq->next) { // Convert all external sequence to internal format
1039
seq->seq.size = probe_compress_sequence(seq->seq.data, seq->seq.size);
1040
locs->group_count++;
1043
if (locs->group_count <=0) {
1044
pt_export_error(locs, GBS_global_string("No %s marked - no probes designed",
1045
gene_flag ? "genes" : "species"));
829
class CentigradePos { // track minimum centigrade position for each species
830
typedef std::map<int, int> Pos4Name;
832
Pos4Name cpos; // key = outgroup-species-id; value = min. centigrade position for (partial) probe
836
static bool is_valid_pos(int pos) { return pos >= 0 && pos<PERC_SIZE; }
838
void set_pos_for(int name, int pos) {
839
pt_assert(is_valid_pos(pos)); // otherwise it should not be put into CentigradePos
840
Pos4Name::iterator found = cpos.find(name);
841
if (found == cpos.end()) {
844
else if (pos<found->second) {
849
void summarize_centigrade_hits(int *centigrade_hits) const {
850
for (int i = 0; i<PERC_SIZE; ++i) centigrade_hits[i] = 0;
851
for (Pos4Name::const_iterator p = cpos.begin(); p != cpos.end(); ++p) {
853
pt_assert(is_valid_pos(pos)); // otherwise it should not be put into CentigradePos
854
centigrade_hits[pos]++;
858
bool empty() const { return cpos.empty(); }
861
class OutgroupMatcher : virtual Noncopyable {
862
const PT_pdc *const pdc;
865
double max_bonds[PT_BASES]; // @@@ move functionality into Splits
867
PT_tprobes *currTprobe;
868
CentigradePos result;
870
bool only_bind_behind_dot; // true for suffix-matching
872
void uncond_announce_match(int centPos, const AbsLoc& loc) {
873
pt_assert(loc.get_pid().outside_group());
874
#if defined(DUMP_OLIGO_MATCHING)
875
fprintf(stderr, "announce_match centPos=%2i loc", centPos);
879
result.set_pos_for(loc.get_name(), centPos);
882
bool location_follows_dot(const ReadableDataLoc& loc) const { return loc[-1] == PT_QU; }
883
bool location_follows_dot(const DataLoc& loc) const { return location_follows_dot(ReadableDataLoc(loc)); } // very expensive @@@ optimize using relpos
884
bool location_follows_dot(const AbsLoc& loc) const { return location_follows_dot(ReadableDataLoc(DataLoc(loc))); } // very very expensive
886
template<typename LOC>
887
bool acceptable_location(const LOC& loc) const {
888
return loc.get_pid().outside_group() &&
889
(only_bind_behind_dot ? location_follows_dot(loc) : true);
892
template<typename LOC>
893
void announce_match_if_acceptable(int centPos, const LOC& loc) {
894
if (acceptable_location(loc)) {
895
uncond_announce_match(centPos, loc);
898
void announce_match_if_acceptable(int centPos, POS_TREE2 *pt) {
899
switch (pt->get_type()) {
901
for (int i = PT_QU; i<PT_BASES; ++i) {
902
POS_TREE2 *ptson = PT_read_son(pt, (PT_base)i);
904
announce_match_if_acceptable(centPos, ptson);
910
announce_match_if_acceptable(centPos, DataLoc(pt));
914
ChainIteratorStage2 iter(pt);
916
announce_match_if_acceptable(centPos, iter.at());
924
template <typename PT_OR_LOC>
925
void announce_possible_match(const MatchingOligo& oligo, PT_OR_LOC pt_or_loc) {
926
pt_assert(oligo.domainSeen());
927
pt_assert(!oligo.dangling());
929
int centPos = oligo.calc_centigrade_pos(currTprobe, pdc);
930
if (centPos<PERC_SIZE) {
931
announce_match_if_acceptable(centPos, pt_or_loc);
935
bool might_reach_centigrade_pos(const MatchingOligo& oligo) const {
936
return !oligo.centigrade_pos_out_of_reach(currTprobe, pdc, splits, max_bonds);
939
void bind_rest(const MatchingOligo& oligo, const ReadableDataLoc& loc, const int height) {
940
// unconditionally bind rest of oligo versus sequence
942
pt_assert(oligo.domainSeen()); // entry-invariant for bind_rest()
943
pt_assert(oligo.dangling()); // otherwise ReadableDataLoc is constructed w/o need (very expensive!)
944
pt_assert(might_reach_centigrade_pos(oligo)); // otherwise ReadableDataLoc is constructed w/o need (very expensive!)
945
pt_assert(loc.get_pid().outside_group()); // otherwise we are not interested in the result
948
MatchingOligo more = oligo.bind_against(loc[height], splits, max_bonds);
949
pt_assert(more.domainSeen()); // implied by oligo.domainSeen()
950
if (more.dangling()) {
951
if (might_reach_centigrade_pos(more)) {
952
bind_rest(more, loc, height+1);
956
announce_possible_match(more, loc);
960
MatchingOligo all = oligo.dont_bind_rest();
961
announce_possible_match(all, loc);
964
void bind_rest_if_outside_group(const MatchingOligo& oligo, const DataLoc& loc, const int height) {
965
if (loc.get_pid().outside_group() && might_reach_centigrade_pos(oligo)) {
966
bind_rest(oligo, ReadableDataLoc(loc), height);
969
void bind_rest_if_outside_group(const MatchingOligo& oligo, const AbsLoc& loc, const int height) {
970
if (loc.get_pid().outside_group() && might_reach_centigrade_pos(oligo)) {
971
bind_rest(oligo, ReadableDataLoc(DataLoc(loc)), height);
975
void bind_rest(const MatchingOligo& oligo, POS_TREE2 *pt, const int height) {
976
// unconditionally bind rest of oligo versus complete index-subtree
977
pt_assert(oligo.domainSeen()); // entry-invariant for bind_rest()
979
if (oligo.dangling()) {
980
if (might_reach_centigrade_pos(oligo)) {
981
switch (pt->get_type()) {
983
POS_TREE2 *ptdotson = PT_read_son(pt, PT_QU);
985
MatchingOligo all = oligo.dont_bind_rest();
986
announce_possible_match(all, ptdotson);
989
for (int i = PT_A; i<PT_BASES; ++i) {
990
POS_TREE2 *ptson = PT_read_son(pt, (PT_base)i);
992
bind_rest(oligo.bind_against(i, splits, max_bonds), ptson, height+1);
998
bind_rest_if_outside_group(oligo, DataLoc(pt), height);
1002
ChainIteratorStage2 iter(pt);
1004
bind_rest_if_outside_group(oligo, iter.at(), height);
1012
announce_possible_match(oligo, pt);
1016
void bind_till_domain(const MatchingOligo& oligo, const ReadableDataLoc& loc, const int height) {
1017
pt_assert(!oligo.domainSeen() && oligo.domainPossible()); // entry-invariant for bind_till_domain()
1018
pt_assert(oligo.dangling()); // otherwise ReadableDataLoc is constructed w/o need (very expensive!)
1019
pt_assert(might_reach_centigrade_pos(oligo)); // otherwise ReadableDataLoc is constructed w/o need (very expensive!)
1020
pt_assert(loc.get_pid().outside_group()); // otherwise we are not interested in the result
1022
if (is_std_base(loc[height])) { // do not try to bind domain versus dot or N
1023
MatchingOligo more = oligo.bind_against(loc[height], splits, max_bonds);
1024
if (more.dangling()) {
1025
if (might_reach_centigrade_pos(more)) {
1026
if (more.domainSeen()) bind_rest (more, loc, height+1);
1027
else if (more.domainPossible()) bind_till_domain(more, loc, height+1);
1030
else if (more.domainSeen()) {
1031
announce_possible_match(more, loc);
1035
void bind_till_domain_if_outside_group(const MatchingOligo& oligo, const DataLoc& loc, const int height) {
1036
if (loc.get_pid().outside_group() && might_reach_centigrade_pos(oligo)) {
1037
bind_till_domain(oligo, ReadableDataLoc(loc), height);
1040
void bind_till_domain_if_outside_group(const MatchingOligo& oligo, const AbsLoc& loc, const int height) {
1041
if (loc.get_pid().outside_group() && might_reach_centigrade_pos(oligo)) {
1042
bind_till_domain(oligo, ReadableDataLoc(DataLoc(loc)), height);
1046
void bind_till_domain(const MatchingOligo& oligo, POS_TREE2 *pt, const int height) {
1047
pt_assert(!oligo.domainSeen() && oligo.domainPossible()); // entry-invariant for bind_till_domain()
1048
pt_assert(oligo.dangling());
1050
if (might_reach_centigrade_pos(oligo)) {
1051
switch (pt->get_type()) {
1053
for (int i = PT_A; i<PT_BASES; ++i) {
1054
POS_TREE2 *ptson = PT_read_son(pt, (PT_base)i);
1056
MatchingOligo sonOligo = oligo.bind_against(i, splits, max_bonds);
1058
if (sonOligo.domainSeen()) bind_rest (sonOligo, ptson, height+1);
1059
else if (sonOligo.domainPossible()) bind_till_domain(sonOligo, ptson, height+1);
1065
bind_till_domain_if_outside_group(oligo, DataLoc(pt), height);
1069
ChainIteratorStage2 iter(pt);
1071
bind_till_domain_if_outside_group(oligo, iter.at(), height);
1079
void reset() { result = CentigradePos(); }
1082
OutgroupMatcher(PT_local *locs_, PT_pdc *pdc_)
1086
only_bind_behind_dot(false)
1088
for (int i = PT_QU; i<PT_BASES; ++i) {
1089
max_bonds[i] = ptnd_check_max_bond(locs_, i);
1093
void calculate_outgroup_matches(PT_tprobes& tprobe) {
1094
LocallyModify<PT_tprobes*> assign_tprobe(currTprobe, &tprobe);
1095
pt_assert(result.empty());
1097
MatchingOligo fullProbe(Oligo(tprobe.sequence, tprobe.seq_len));
1098
POS_TREE2 *ptroot = psg.TREE_ROOT2();
1100
pt_assert(!fullProbe.domainSeen());
1101
pt_assert(fullProbe.domainPossible()); // probe length too short (probes shorter than DOMAIN_MIN_LENGTH always report 0 outgroup hits -> wrong result)
1103
arb_progress progress(tprobe.seq_len);
1105
only_bind_behind_dot = false;
1106
bind_till_domain(fullProbe, ptroot, 0);
1109
// match all suffixes of probe
1110
// - detect possible partial matches at start of alignment (and behind dots inside the sequence)
1111
only_bind_behind_dot = true;
1112
for (int off = 1; off<tprobe.seq_len; ++off) {
1113
MatchingOligo probeSuffix(fullProbe.get_oligo().suffix(off));
1114
if (!probeSuffix.domainPossible()) break; // abort - no smaller suffix may create a domain
1115
bind_till_domain(probeSuffix, ptroot, 0);
1119
result.summarize_centigrade_hits(tprobe.perc);
1126
class ProbeOccurrance {
1130
ProbeOccurrance(const char *seq_) : seq(seq_) {}
1132
const char *sequence() const { return seq; }
1134
void forward() { ++seq; }
1136
bool less(const ProbeOccurrance& other, size_t len) const {
1137
const char *mine = sequence();
1138
const char *his = other.sequence();
1141
for (size_t i = 0; i<len && !cmp; ++i) {
1142
cmp = mine[i]-his[i];
1148
class ProbeIterator {
1149
ProbeOccurrance cursor;
1152
size_t rest; // size of seqpart behind cursor
1153
size_t gc, at; // count G+C and A+T
1155
bool has_only_std_bases() const { return (gc+at) == probelen; }
1157
bool at_end() const { return !rest; }
1159
PT_base base_at(size_t offset) const {
1160
pt_assert(offset < probelen);
1161
return PT_base(cursor.sequence()[offset]);
1164
void forget(PT_base b) {
1166
case PT_A: case PT_T: pt_assert(at>0); --at; break;
1167
case PT_C: case PT_G: pt_assert(gc>0); --gc; break;
1171
void count(PT_base b) {
1173
case PT_A: case PT_T: ++at; break;
1174
case PT_C: case PT_G: ++gc; break;
1179
void init_counts() {
1181
for (size_t i = 0; i<probelen; ++i) {
1187
ProbeIterator(const char *seq, size_t seqlen, size_t probelen_)
1189
probelen(probelen_),
1190
rest(seqlen-probelen)
1192
pt_assert(seqlen >= probelen);
1197
if (at_end()) return false;
1201
count(base_at(probelen-1));
1205
int get_temperature() const { return 2*at + 4*gc; }
1207
bool gc_in_wanted_range(PT_pdc *pdc) const {
1208
return pdc->min_gc*probelen <= gc && gc <= pdc->max_gc*probelen;
1210
bool temperature_in_wanted_range(PT_pdc *pdc) const {
1211
int temp = get_temperature();
1212
return pdc->mintemp <= temp && temp <= pdc->maxtemp;
1215
bool feasible(PT_pdc *pdc) const {
1216
return has_only_std_bases() &&
1217
gc_in_wanted_range(pdc) &&
1218
temperature_in_wanted_range(pdc);
1221
const ProbeOccurrance& occurance() const { return cursor; }
1223
void dump(FILE *out) const {
1224
char *probe = readable_probe(cursor.sequence(), probelen, 'T');
1225
fprintf(out, "probe='%s' probelen=%zu at=%zu gc=%zu\n", probe, probelen, at, gc);
1233
PO_Less(size_t probelen_) : probelen(probelen_) {}
1234
bool operator()(const ProbeOccurrance& a, const ProbeOccurrance& b) const { return a.less(b, probelen); }
1238
bool operator()(const SmartCharPtr& p1, const SmartCharPtr& p2) { return &*p1 < &*p2; } // simply compare addresses
1241
class ProbeCandidates {
1242
typedef std::set<ProbeOccurrance, PO_Less> Candidates;
1243
typedef std::map<ProbeOccurrance, int, PO_Less> CandidateHits;
1244
typedef std::set<SmartCharPtr, SCP_Less> SeqData;
1247
CandidateHits candidateHits;
1248
SeqData data; // locks SmartPtrs to input data (ProbeOccurrances point inside their data)
1251
ProbeCandidates(size_t probelen_)
1252
: probelen(probelen_),
1253
candidateHits(probelen)
1256
void generate_for_sequence(PT_pdc *pdc, const SmartCharPtr& seq, size_t bp) {
1257
arb_progress base_progress(2*bp);
1258
if (bp >= probelen) {
1259
// collect all probe candidates for current sequence
1260
Candidates candidates(probelen);
1263
ProbeIterator probe(&*seq, bp, probelen);
1265
if (probe.feasible(pdc)) {
1266
candidates.insert(probe.occurance());
1269
} while (probe.next());
1272
// increment overall hitcount for each found candidate
1273
for (Candidates::iterator c = candidates.begin(); c != candidates.end(); ++c) {
1274
candidateHits[*c]++;
1278
base_progress.done();
1281
void create_tprobes(PT_pdc *pdc, int ingroup_size) {
1282
// tracks maximum rejected target-group coverage
1283
int min_ingroup_hits = (ingroup_size * pdc->min_coverage + .5);
1284
int max_rej_ingroup_hits = 0;
1286
for (CandidateHits::iterator c = candidateHits.begin(); c != candidateHits.end(); ++c) {
1287
int ingroup_hits = c->second;
1288
if (ingroup_hits >= min_ingroup_hits) {
1289
const ProbeOccurrance& candi = c->first;
1291
PT_tprobes *tprobe = create_PT_tprobes();
1293
tprobe->sequence = GB_strndup(candi.sequence(), probelen);
1294
tprobe->temp = pt_get_temperature(tprobe->sequence);
1295
tprobe->groupsize = ingroup_hits;
1297
aisc_link(&pdc->ptprobes, tprobe);
1300
max_rej_ingroup_hits = std::max(max_rej_ingroup_hits, ingroup_hits);
1304
double max_rejected_coverage = max_rej_ingroup_hits/double(ingroup_size);
1305
pdc->max_rj_coverage = std::max(pdc->max_rj_coverage, max_rejected_coverage);
1309
class DesignTargets : virtual Noncopyable {
1312
int targets; // overall target sequence count
1313
int known_targets; // known target sequences
1314
int added_targets; // added (=unknown) target sequences
1316
long datasize; // overall bp of target sequences
1318
int min_probe_length; // min. designed probe length
1324
void announce_seq_bp(long bp) {
1326
if (bp<long(min_probe_length)) {
1327
error = GBS_global_string("Sequence contains only %lu bp. Impossible design request for", bp);
1331
if (datasize<bp) datasize = ULONG_MAX;
1335
void scan_added_targets() {
1337
for (PT_sequence *seq = pdc->sequences; seq && !error; seq = seq->next) {
1338
announce_seq_bp(seq->seq.size);
1341
if (error) error = GBS_global_string("%s one of the added sequences", error.deliver());
1343
void scan_known_targets(int expected_known_targets) {
1344
known2id = new int[expected_known_targets];
1347
for (int id = 0; id < psg.data_count && !error; ++id) {
1348
const probe_input_data& pid = psg.data[id];
1349
if (pid.inside_group()) {
1350
announce_seq_bp(pid.get_size());
1351
known2id[known_targets++] = id;
1352
pt_assert(known_targets <= expected_known_targets);
1353
if (error) error = GBS_global_string("%s species '%s'", error.deliver(), pid.get_shortname());
1359
DesignTargets(PT_pdc *pdc_, int expected_known_targets)
1364
min_probe_length(pdc->min_probelen),
1367
scan_added_targets(); // calc added_targets
1369
scan_known_targets(expected_known_targets);
1370
targets = known_targets+added_targets;
1376
ARB_ERROR& get_error() { return error; } // needs to be checked after construction!
1378
long get_datasize() const { return datasize; }
1379
int get_count() const { return targets; }
1380
int get_known_count() const { return known_targets; }
1381
int get_added_count() const { return added_targets; }
1383
void generate(ProbeCandidates& candidates) {
1384
arb_progress progress(get_count());
1385
for (int k = 0; k<known_targets; ++k) {
1386
const probe_input_data& pid = psg.data[known2id[k]];
1387
candidates.generate_for_sequence(pdc, pid.get_dataPtr(), pid.get_size());
1390
for (PT_sequence *seq = pdc->sequences; seq; seq = seq->next) {
1391
candidates.generate_for_sequence(pdc, GB_strndup(seq->seq.data, seq->seq.size), seq->seq.size);
1397
#if defined(DUMP_DESIGNED_PROBES)
1398
static void dump_tprobe(PT_tprobes *tprobe, int idx) {
1399
char *readable = readable_probe(tprobe->sequence, tprobe->seq_len, 'T');
1400
fprintf(stderr, "tprobe='%s' idx=%i len=%i\n", readable, idx, tprobe->seq_len);
1403
static void DUMP_TPROBES(const char *where, PT_pdc *pdc) {
1405
fprintf(stderr, "dumping tprobes %s:\n", where);
1406
for (PT_tprobes *tprobe = pdc->tprobes; tprobe; tprobe = tprobe->next) {
1407
dump_tprobe(tprobe, idx++);
1411
#define DUMP_TPROBES(a,b)
1414
int PT_start_design(PT_pdc *pdc, int /* dummy */) {
1415
PT_local *locs = (PT_local*)pdc->mh.parent->parent;
1417
erase_formatter(pdc);
1049
1418
while (pdc->tprobes) destroy_PT_tprobes(pdc->tprobes);
1050
ptnd_build_tprobes(pdc, locs->group_count);
1051
while (pdc->sequences) destroy_PT_sequence(pdc->sequences);
1052
ptnd_sort_probes_by(pdc,1);
1053
ptnd_first_check(pdc);
1054
ptnd_check_position(pdc);
1055
ptnd_check_bonds(pdc,ptnd.new_match);
1056
ptnd_cp_tprobe_2_probepart(pdc);
1057
ptnd_duplikate_probepart(pdc);
1058
ptnd_sort_parts(pdc);
1059
ptnd_remove_duplikated_probepart(pdc);
1060
ptnd_check_probepart(pdc);
1061
while (pdc->parts) destroy_PT_probeparts(pdc->parts);
1062
ptnd_calc_quality(pdc);
1063
ptnd_sort_probes_by(pdc,0);
1064
ptnd_probe_delete_all_but(pdc,pdc->clipresult);
1065
/* ptnd_print_probes(pdc); */
1420
for (PT_sequence *seq = pdc->sequences; seq; seq = seq->next) { // Convert all external sequence to internal format
1421
seq->seq.size = probe_compress_sequence(seq->seq.data, seq->seq.size-1); // no longer convert final zero-byte (PT_QU gets auto-appended)
1425
int unknown_species_count = 0;
1427
char *unknown_names = ptpd_read_names(locs, pdc->names.data, pdc->checksums.data, error); // read the names
1428
if (unknown_names) {
1429
ConstStrArray names;
1430
GBT_splitNdestroy_string(names, unknown_names, "#", true);
1431
pt_assert(!unknown_names);
1432
unknown_species_count = names.size();
1437
DesignTargets targets(pdc, locs->group_count); // here locs->group_count is amount of known marked species/genes
1438
error = targets.get_error();
1441
locs->group_count = targets.get_count();
1442
if (locs->group_count <= 0) {
1443
error = GBS_global_string("No %s marked - no probes designed", gene_flag ? "genes" : "species");
1445
else if (targets.get_added_count() != unknown_species_count) {
1446
if (gene_flag) { // cannot add genes
1447
pt_assert(targets.get_added_count() == 0); // cannot, but did add?!
1448
error = GBS_global_string("Cannot design probes for %i unknown marked genes", unknown_species_count);
1451
int added = targets.get_added_count();
1452
error = GBS_global_string("Got %i unknown marked species, but %i custom sequence%s added (has to match)",
1453
unknown_species_count,
1455
added == 1 ? " was" : "s were");
1458
else if (pdc->min_probelen < MIN_DESIGN_PROBE_LENGTH) {
1459
error = GBS_global_string("Specified min. probe length %i is below the min. allowed probe length of %i", pdc->min_probelen, MIN_DESIGN_PROBE_LENGTH);
1461
else if (get_max_probelen(pdc) < pdc->min_probelen) {
1462
error = GBS_global_string("Max. probe length %i is below the specified min. probe length of %i", get_max_probelen(pdc), pdc->min_probelen);
1465
// search for possible probes
1467
int min_probelen = pdc->min_probelen;
1468
int max_probelen = get_max_probelen(pdc);
1470
arb_progress progress("Searching probe candidates", max_probelen-min_probelen+1);
1471
for (int len = min_probelen; len <= max_probelen; ++len) {
1472
ProbeCandidates candidates(len);
1474
targets.generate(candidates);
1475
pt_assert(!targets.get_error());
1477
candidates.create_tprobes(pdc, targets.get_count());
1482
while (pdc->sequences) destroy_PT_sequence(pdc->sequences);
1485
sort_tprobes_by(pdc, PSM_SEQUENCE);
1486
remove_tprobes_with_too_many_mishits(pdc);
1487
remove_tprobes_outside_ecoli_range(pdc);
1488
size_t tprobes = tprobes_calculate_bonds(locs);
1489
DUMP_TPROBES("after tprobes_calculate_bonds", pdc);
1492
arb_progress progress(GBS_global_string("Calculating probe quality for %zu probe candidates", tprobes), tprobes);
1493
OutgroupMatcher om(locs, pdc);
1494
for (PT_tprobes *tprobe = pdc->tprobes; tprobe; tprobe = tprobe->next) {
1495
om.calculate_outgroup_matches(*tprobe);
1500
fputs("No probe candidates found\n", stdout);
1503
tprobes_sumup_perc_and_calc_quality(pdc);
1504
sort_tprobes_by(pdc, PSM_QUALITY);
1505
clip_tprobes(pdc, pdc->clipresult);
1511
pt_export_error_if(locs, error);
1070
void ptnd_new_match(PT_local * locs, char *probestring)
1072
PT_pdc *pdc = locs->pdc;
1079
if (!pdc) return; /* no config */
1081
tprobe = create_PT_tprobes();
1082
tprobe->sequence = strdup(probestring);
1084
aisc_link(&pdc->ptprobes, tprobe);
1085
ptnd_check_bonds(pdc,ptnd.new_match);
1086
ptnd_cp_tprobe_2_probepart(pdc);
1087
ptnd_duplikate_probepart(pdc);
1088
ptnd_sort_parts(pdc);
1089
ptnd_remove_duplikated_probepart(pdc);
1090
ptnd_check_probepart(pdc);
1092
while (pdc->parts) destroy_PT_probeparts(pdc->parts);
1093
while (( tprobe = pdc->tprobes )) destroy_PT_tprobes(tprobe);
1515
// --------------------------------------------------------------------------------
1519
#include <test_unit.h>
1522
static const char *concat_iteration(PrefixIterator& prefix) {
1523
static GBS_strstruct out(50);
1527
while (!prefix.done()) {
1528
if (out.filled()) out.put(',');
1530
char *readable = probe_2_readable(prefix.copy(), prefix.length());
1537
return out.get_data();
1540
void TEST_PrefixIterator() {
1541
// straight-forward permutation
1542
PrefixIterator p0(PT_A, PT_T, 0); TEST_EXPECT_EQUAL(p0.steps(), 1);
1543
PrefixIterator p1(PT_A, PT_T, 1); TEST_EXPECT_EQUAL(p1.steps(), 4);
1544
PrefixIterator p2(PT_A, PT_T, 2); TEST_EXPECT_EQUAL(p2.steps(), 16);
1545
PrefixIterator p3(PT_A, PT_T, 3); TEST_EXPECT_EQUAL(p3.steps(), 64);
1547
TEST_EXPECT_EQUAL(p1.done(), false);
1548
TEST_EXPECT_EQUAL(p0.done(), false);
1550
TEST_EXPECT_EQUAL(concat_iteration(p0), "");
1551
TEST_EXPECT_EQUAL(concat_iteration(p1), "A,C,G,U");
1552
TEST_EXPECT_EQUAL(concat_iteration(p2), "AA,AC,AG,AU,CA,CC,CG,CU,GA,GC,GG,GU,UA,UC,UG,UU");
1554
// permutation truncated at PT_QU
1555
PrefixIterator q0(PT_QU, PT_T, 0); TEST_EXPECT_EQUAL(q0.steps(), 1);
1556
PrefixIterator q1(PT_QU, PT_T, 1); TEST_EXPECT_EQUAL(q1.steps(), 6);
1557
PrefixIterator q2(PT_QU, PT_T, 2); TEST_EXPECT_EQUAL(q2.steps(), 31);
1558
PrefixIterator q3(PT_QU, PT_T, 3); TEST_EXPECT_EQUAL(q3.steps(), 156);
1559
PrefixIterator q4(PT_QU, PT_T, 4); TEST_EXPECT_EQUAL(q4.steps(), 781);
1561
TEST_EXPECT_EQUAL(concat_iteration(q0), "");
1562
TEST_EXPECT_EQUAL(concat_iteration(q1), ".,N,A,C,G,U");
1563
TEST_EXPECT_EQUAL(concat_iteration(q2),
1565
"N.,NN,NA,NC,NG,NU,"
1566
"A.,AN,AA,AC,AG,AU,"
1567
"C.,CN,CA,CC,CG,CU,"
1568
"G.,GN,GA,GC,GG,GU,"
1569
"U.,UN,UA,UC,UG,UU");
1570
TEST_EXPECT_EQUAL(concat_iteration(q3),
1573
"NN.,NNN,NNA,NNC,NNG,NNU,"
1574
"NA.,NAN,NAA,NAC,NAG,NAU,"
1575
"NC.,NCN,NCA,NCC,NCG,NCU,"
1576
"NG.,NGN,NGA,NGC,NGG,NGU,"
1577
"NU.,NUN,NUA,NUC,NUG,NUU,"
1579
"AN.,ANN,ANA,ANC,ANG,ANU,"
1580
"AA.,AAN,AAA,AAC,AAG,AAU,"
1581
"AC.,ACN,ACA,ACC,ACG,ACU,"
1582
"AG.,AGN,AGA,AGC,AGG,AGU,"
1583
"AU.,AUN,AUA,AUC,AUG,AUU,"
1585
"CN.,CNN,CNA,CNC,CNG,CNU,"
1586
"CA.,CAN,CAA,CAC,CAG,CAU,"
1587
"CC.,CCN,CCA,CCC,CCG,CCU,"
1588
"CG.,CGN,CGA,CGC,CGG,CGU,"
1589
"CU.,CUN,CUA,CUC,CUG,CUU,"
1591
"GN.,GNN,GNA,GNC,GNG,GNU,"
1592
"GA.,GAN,GAA,GAC,GAG,GAU,"
1593
"GC.,GCN,GCA,GCC,GCG,GCU,"
1594
"GG.,GGN,GGA,GGC,GGG,GGU,"
1595
"GU.,GUN,GUA,GUC,GUG,GUU,"
1597
"UN.,UNN,UNA,UNC,UNG,UNU,"
1598
"UA.,UAN,UAA,UAC,UAG,UAU,"
1599
"UC.,UCN,UCA,UCC,UCG,UCU,"
1600
"UG.,UGN,UGA,UGC,UGG,UGU,"
1601
"UU.,UUN,UUA,UUC,UUG,UUU");
1604
#endif // UNIT_TESTS
1606
// --------------------------------------------------------------------------------