1
// =============================================================== //
3
// File : ps_detect_weak_differences.cxx //
6
// Coded by Wolfram Foerster in October 2002 //
7
// Institute of Microbiology (Technical University Munich) //
8
// http://www.arb-home.de/ //
10
// =============================================================== //
1
12
#include "ps_database.hxx"
2
13
#include "ps_bitmap.hxx"
3
14
#include "ps_tools.hxx"
10
17
#include <sys/times.h>
15
PS_BitMap_Fast *__MAP;
20
static SpeciesID __MAX_ID;
21
static SpeciesID __MIN_ID;
22
static PS_BitMap_Fast *__MAP;
16
23
// globals for PS_detect_weak_differences
18
IDVector *__INVERSE_PATH;
19
unsigned long int __COUNT_SET_OPS = 0;
20
unsigned long int __COUNT_SET_OPS2 = 0;
24
static IDVector *__PATH;
25
static IDVector *__INVERSE_PATH;
26
static unsigned long int __COUNT_SET_OPS = 0;
27
static unsigned long int __COUNT_SET_OPS2 = 0;
28
static char *__NODES_LEFT;
22
29
// globals for PS_print_and_evaluate_map
24
IDID2IDSetMap *__PAIR2PATH;
25
SpeciesID __ONEMATCH_MIN_ID;
26
SpeciesID __ONEMATCH_MAX_ID;
28
void PS_print_path() {
29
printf( "__PATH %3zu :",__PATH->size() );
31
for (IDVectorCIter i = __PATH->begin(); i != __PATH->end(); ++i,++c) {
32
if (c % 20 == 0) printf( "\n" );
38
void PS_print_inverse_path() {
39
printf( "__INVERSE_PATH %3zu :",__INVERSE_PATH->size() );
41
for (IDVectorCIter i = __INVERSE_PATH->begin(); i != __INVERSE_PATH->end(); ++i,++c) {
42
if (c % 20 == 0) printf( "\n" );
49
void PS_detect_weak_differences_stepdown( const PS_NodePtr _ps_node,
50
const SpeciesID _parent_ID,
30
static IDSet *__PATHSET;
31
static IDID2IDSetMap *__PAIR2PATH;
32
static SpeciesID __ONEMATCH_MIN_ID;
33
static SpeciesID __ONEMATCH_MAX_ID;
36
void PS_debug_print_path() {
37
printf("__PATH %3zu :", __PATH->size());
39
for (IDVectorCIter i = __PATH->begin(); i != __PATH->end(); ++i, ++c) {
40
if (c % 20 == 0) printf("\n");
46
void PS_debug_print_inverse_path() {
47
printf("__INVERSE_PATH %3zu :", __INVERSE_PATH->size());
49
for (IDVectorCIter i = __INVERSE_PATH->begin(); i != __INVERSE_PATH->end(); ++i, ++c) {
50
if (c % 20 == 0) printf("\n");
57
static void PS_detect_weak_differences_stepdown(const PS_NodePtr& _ps_node,
58
const SpeciesID _parent_ID,
52
60
// Recursively walk through tree and make a bool-matrix of SpeciesID's
53
61
// where true means that the 2 species can be distinguished by a probe.
95
100
// inverse path loop (explicit)
96
101
for (IDVectorCIter it_inverse_path = __INVERSE_PATH->begin();
97
102
it_inverse_path != __INVERSE_PATH->end();
99
104
inverse_path_ID = *it_inverse_path;
101
__MAP->setTrue( path_ID, inverse_path_ID );
102
__MAP->setTrue( inverse_path_ID, path_ID );
106
__MAP->setTrue(path_ID, inverse_path_ID);
107
__MAP->setTrue(inverse_path_ID, path_ID);
105
110
// inverse path loop (implicit)
106
111
for (inverse_path_ID = id+1; inverse_path_ID < __MAX_ID; ++inverse_path_ID) { // skip to id ABOVE current node id
107
__MAP->setTrue( path_ID, inverse_path_ID );
108
__MAP->setTrue( inverse_path_ID, path_ID );
112
__MAP->setTrue(path_ID, inverse_path_ID);
113
__MAP->setTrue(inverse_path_ID, path_ID);
113
// PS_print_inverse_path();
114
118
unsigned long int set_ops = __PATH->size()*(__MAX_ID-id-1+__INVERSE_PATH->size());
115
119
if (ULONG_MAX - __COUNT_SET_OPS < set_ops) {
116
120
set_ops = set_ops - (ULONG_MAX - __COUNT_SET_OPS);
130
134
// inverse path loop (explicit)
131
135
for (IDVectorCIter it_inverse_path = __INVERSE_PATH->begin();
132
136
it_inverse_path != __INVERSE_PATH->end();
133
++it_inverse_path ) {
134
138
inverse_path_ID = *it_inverse_path;
135
139
smaller_ID = (path_ID < inverse_path_ID) ? path_ID : inverse_path_ID;
136
140
bigger_ID = (path_ID > inverse_path_ID) ? path_ID : inverse_path_ID;
138
if (__MAP->get( smaller_ID, bigger_ID )) {
139
__MAP->setTrue( bigger_ID, smaller_ID );
141
__MAP->setTrue( smaller_ID, bigger_ID );
142
if (__MAP->get(smaller_ID, bigger_ID)) {
143
__MAP->setTrue(bigger_ID, smaller_ID);
146
__MAP->setTrue(smaller_ID, bigger_ID);
160
166
// step down the children
162
168
int c = _ps_node->countChildren()-1;
163
for ( PS_NodeMapConstIterator i = _ps_node->getChildrenBegin();
169
for (PS_NodeMapConstIterator i = _ps_node->getChildrenBegin();
164
170
i != _ps_node->getChildrenEnd();
166
172
if (_depth < 60) {
168
__NODES_LEFT[ _depth ] = '0'+c;
170
__NODES_LEFT[ _depth ] = '+';
174
__NODES_LEFT[_depth] = '0'+c;
177
__NODES_LEFT[_depth] = '+';
173
PS_detect_weak_differences_stepdown( i->second, id, _depth+1 );
180
PS_detect_weak_differences_stepdown(i->second, id, _depth+1);
175
if (_depth < 60) __NODES_LEFT[ _depth ] = ' ';
182
if (_depth < 60) __NODES_LEFT[_depth] = ' ';
178
185
// remove IDs from paths
194
201
struct tms before;
196
203
struct tms before_first_level_node;
197
for (PS_NodeMapConstIterator i = _root_node->getChildrenBegin(); i != _root_node->getChildrenEnd(); ++i,++c ) {
204
for (PS_NodeMapConstIterator i = _root_node->getChildrenBegin(); i != _root_node->getChildrenEnd(); ++i, ++c) {
198
205
if (_root_node->countChildren()-c-1 < 10) {
199
206
__NODES_LEFT[0] = '0'+_root_node->countChildren()-c-1;
201
209
__NODES_LEFT[0] = '+';
203
211
if ((c < 50) || (c % 100 == 0)) {
204
times( &before_first_level_node );
205
printf( "PS_detect_weak_differences_stepdown( %i ) : %i. of %zu ", i->first, c+1, _root_node->countChildren() ); fflush( stdout );
212
times(&before_first_level_node);
213
printf("PS_detect_weak_differences_stepdown( %i ) : %i. of %zu ", i->first, c+1, _root_node->countChildren()); fflush(stdout);
207
PS_detect_weak_differences_stepdown( i->second, -1, 1 );
215
PS_detect_weak_differences_stepdown(i->second, -1, 1);
208
216
if ((c < 50) || (c % 100 == 0)) {
209
PS_print_time_diff( &before_first_level_node, "this node ", " " );
210
PS_print_time_diff( &before, "total ", "\n" );
217
PS_print_time_diff(&before_first_level_node, "this node ", " ");
218
PS_print_time_diff(&before, "total ", "\n");
213
printf( "%lu * %lu + %lu set operations performed\n", __COUNT_SET_OPS2, ULONG_MAX, __COUNT_SET_OPS ); fflush( stdout );
221
printf("%lu * %lu + %lu set operations performed\n", __COUNT_SET_OPS2, ULONG_MAX, __COUNT_SET_OPS); fflush(stdout);
216
224
delete __INVERSE_PATH;
219
typedef map<ID2IDPair,PS_NodePtr> IDID2NodeMap;
227
typedef map<ID2IDPair, PS_NodePtr> IDID2NodeMap;
220
228
typedef IDID2NodeMap::iterator IDID2NodeMapIter;
221
229
typedef IDID2NodeMap::const_iterator IDID2NodeMapCIter;
223
void PS_find_probes_for_pairs( const PS_NodePtr _ps_node, ID2IDSet &_pairs ) {
231
void PS_find_probes_for_pairs(const PS_NodePtr& _ps_node, ID2IDSet &_pairs) {
224
232
SpeciesID id = _ps_node->getNum();
225
233
bool has_probes = _ps_node->hasProbes();
228
236
// append ID to path
230
__PATHSET->insert( id );
238
__PATHSET->insert(id);
233
// dont look at path until ID is greater than lowest ID in the set of ID-pairs
241
// don't look at path until ID is greater than lowest ID in the set of ID-pairs
235
243
if ((id >= __ONEMATCH_MIN_ID) && has_probes) {
236
244
for (ID2IDSetCIter pair=_pairs.begin(); pair != _pairs.end(); ++pair) {
237
245
// look for pair-IDs in the path
238
bool found_first = __PATHSET->find( pair->first ) != __PATHSET->end();
239
bool found_second = __PATHSET->find( pair->second ) != __PATHSET->end();
246
bool found_first = __PATHSET->find(pair->first) != __PATHSET->end();
247
bool found_second = __PATHSET->find(pair->second) != __PATHSET->end();
240
248
if (found_first ^ found_second) { // ^ is XOR
241
printf( "found path for (%i,%i) at %p ", pair->first, pair->second,&(*_ps_node) );
249
printf("found path for (%i,%i) at %p ", pair->first, pair->second, &(*_ps_node));
242
250
_ps_node->printOnlyMe();
243
(*__PAIR2PATH)[ *pair ] = *__PATHSET; // store path
244
_pairs.erase( pair ); // remove found pair
245
// scan pairs for new min,max IDs
246
if ((pair->first == __ONEMATCH_MIN_ID) || (pair->second == __ONEMATCH_MAX_ID)) {
251
(*__PAIR2PATH)[*pair] = *__PATHSET; // store path
253
bool scanMinMax = (pair->first == __ONEMATCH_MIN_ID) || (pair->second == __ONEMATCH_MAX_ID);
254
_pairs.erase(pair); // remove found pair
256
// scan pairs for new min,max IDs
247
257
__ONEMATCH_MIN_ID = __MAX_ID;
248
258
__ONEMATCH_MAX_ID = -1;
249
259
for (ID2IDSetCIter p=_pairs.begin(); p != _pairs.end(); ++p) {
250
260
if (p->first < __ONEMATCH_MIN_ID) __ONEMATCH_MIN_ID = p->first;
251
261
if (p->second > __ONEMATCH_MAX_ID) __ONEMATCH_MAX_ID = p->second;
253
printf( " new MIN,MAX (%d,%d)", __ONEMATCH_MIN_ID, __ONEMATCH_MAX_ID );
263
printf(" new MIN,MAX (%d,%d)", __ONEMATCH_MIN_ID, __ONEMATCH_MAX_ID);
261
271
// step down the children unless all paths are found
262
272
// if either ID is lower than highest ID in the set of ID-pairs
290
300
__ONEMATCH_MIN_ID = __MAX_ID;
291
301
__ONEMATCH_MAX_ID = __MIN_ID;
292
302
for (SpeciesID id1 = __MIN_ID; id1 <= __MAX_ID; ++id1) {
293
// printf( "[%6i] ",id1 );
294
303
for (SpeciesID id2 = __MIN_ID; id2 <= id1; ++id2) {
295
304
smaller_id = (id1 < id2) ? id1 : id2;
296
305
bigger_id = (id1 < id2) ? id2 : id1;
297
bit1 = __MAP->get( smaller_id, bigger_id );
298
bit2 = __MAP->get( bigger_id, smaller_id );
306
bit1 = __MAP->get(smaller_id, bigger_id);
307
bit2 = __MAP->get(bigger_id, smaller_id);
299
308
if (bit1 && bit2) {
303
oneMatch.insert( ID2IDPair(smaller_id,bigger_id) );
311
oneMatch.insert(ID2IDPair(smaller_id, bigger_id));
304
312
if (smaller_id < __ONEMATCH_MIN_ID) __ONEMATCH_MIN_ID = smaller_id;
305
313
if (bigger_id > __ONEMATCH_MAX_ID) __ONEMATCH_MAX_ID = bigger_id;
308
if (id1 != id2) noMatch.insert( ID2IDPair(smaller_id,bigger_id) ); // there are no probes to distinguish a species from itself .. obviously
316
if (id1 != id2) noMatch.insert(ID2IDPair(smaller_id, bigger_id)); // there are no probes to distinguish a species from itself .. obviously
313
printf( "(enter to continue)\n" );
320
printf("(enter to continue)\n");
316
printf( "\n\n----------------- no matches ---------------\n\n" );
322
printf("\n\n----------------- no matches ---------------\n\n");
317
323
if (!_result_filename) {
318
324
for (ID2IDSetCIter i = noMatch.begin(); i != noMatch.end(); ++i) {
319
printf( "%6i %6i\n", i->first, i->second );
325
printf("%6i %6i\n", i->first, i->second);
322
printf( "%zu no matches\n(enter to continue)\n", noMatch.size() );
328
printf("%zu no matches\n(enter to continue)\n", noMatch.size());
325
printf( "\n\n----------------- one match ---------------\n\n" );
330
printf("\n\n----------------- one match ---------------\n\n");
326
331
if (!_result_filename) {
327
332
for (ID2IDSetCIter i = oneMatch.begin(); i != oneMatch.end(); ++i) {
328
printf( "%6i %6i\n", i->first, i->second );
333
printf("%6i %6i\n", i->first, i->second);
331
printf( "%zu one matches\n(enter to continue)\n", oneMatch.size() );
336
printf("%zu one matches\n(enter to continue)\n", oneMatch.size());
334
338
// find paths for pairs
348
352
for (IDID2IDSetMapCIter i = __PAIR2PATH->begin();
349
353
i != __PAIR2PATH->end();
351
printf( "\nPair (%i,%i) Setsize (%zu)", i->first.first, i->first.second, i->second.size() );
355
printf("\nPair (%i,%i) Setsize (%zu)", i->first.first, i->first.second, i->second.size());
352
356
PS_NodePtr current_node = _root_node;
354
358
for (IDSetCIter path_id=i->second.begin();
355
path_id !=i->second.end();
357
current_node = current_node->getChild( *path_id ).second;
358
if (c2 % 10 == 0) printf( "\n" );
359
printf( "%6i%s ", *path_id, (current_node->hasProbes()) ? ((current_node->hasInverseProbes()) ? "*" : "+") : "-" );
359
path_id != i->second.end();
361
current_node = current_node->getChild(*path_id).second;
362
if (c2 % 10 == 0) printf("\n");
363
printf("%6i%s ", *path_id, (current_node->hasProbes()) ? ((current_node->hasInverseProbes()) ? "*" : "+") : "-");
361
printf( "\nFinal Node : %p ", &(*current_node) );
365
printf("\nFinal Node : %p ", &(*current_node));
362
366
current_node->printOnlyMe();
365
printf( "\n%zu paths\n", __PAIR2PATH->size() );
369
printf("\n%zu paths\n", __PAIR2PATH->size());
369
373
if (!oneMatch.empty()) {
370
printf( "did not find a path for these :\n" );
374
printf("did not find a path for these :\n");
371
375
for (ID2IDSetCIter i = oneMatch.begin(); i != oneMatch.end(); ++i) {
372
printf( "%6i %6i\n", i->first, i->second );
376
printf("%6i %6i\n", i->first, i->second);
376
380
// make preset map
378
PS_BitMap_Counted *preset = new PS_BitMap_Counted( false, __MAX_ID+1 );
382
PS_BitMap_Counted *preset = new PS_BitMap_Counted(false, __MAX_ID+1);
379
383
// set bits for no matches
380
384
for (ID2IDSetCIter pair=noMatch.begin(); pair != noMatch.end(); ++pair) {
381
preset->setTrue( pair->second, pair->first );
385
preset->setTrue(pair->second, pair->first);
383
387
// iterate over paths
384
388
for (IDID2IDSetMapCIter i = __PAIR2PATH->begin();
412
417
if (_result_filename) {
413
PS_FileBuffer *result_file = new PS_FileBuffer( _result_filename, PS_FileBuffer::WRITEONLY );
418
PS_FileBuffer *result_file = new PS_FileBuffer(_result_filename, PS_FileBuffer::WRITEONLY);
415
printf( "saving no matches to %s...\n", _result_filename );
416
result_file->put_long( noMatch.size() );
420
printf("saving no matches to %s...\n", _result_filename);
421
result_file->put_long(noMatch.size());
417
422
for (ID2IDSetCIter i = noMatch.begin(); i != noMatch.end(); ++i) {
418
result_file->put_int( i->first );
419
result_file->put_int( i->second );
423
result_file->put_int(i->first);
424
result_file->put_int(i->second);
422
printf( "saving one matches to %s...\n", _result_filename );
423
result_file->put_long( __PAIR2PATH->size() );
427
printf("saving one matches to %s...\n", _result_filename);
428
result_file->put_long(__PAIR2PATH->size());
424
429
for (IDID2IDSetMapCIter i = __PAIR2PATH->begin(); i != __PAIR2PATH->end(); ++i) {
425
result_file->put_int( i->first.first );
426
result_file->put_int( i->first.second );
427
result_file->put_long( i->second.size() );
428
for (IDSetCIter path_id=i->second.begin(); path_id !=i->second.end(); ++path_id) {
429
result_file->put_int( *path_id );
430
result_file->put_int(i->first.first);
431
result_file->put_int(i->first.second);
432
result_file->put_long(i->second.size());
433
for (IDSetCIter path_id=i->second.begin(); path_id != i->second.end(); ++path_id) {
434
result_file->put_int(*path_id);
433
printf( "saving preset bitmap to %s...\n", _result_filename );
434
preset->save( result_file );
438
printf("saving preset bitmap to %s...\n", _result_filename);
439
preset->save(result_file);
435
440
delete result_file;
442
447
// ====================================================
443
448
// ====================================================
445
int main( int argc, char *argv[] ) {
450
int main(int argc, char *argv[]) {
447
452
// open probe-set-database
449
printf( "Missing argument\n Usage %s <database name> [[-]bitmap filename] [+result filename]\n ", argv[0] );
450
printf( "if bitmap filename begins with '-' it is loaded, else its created\n " );
451
printf( "result filename MUST be preceded by '+'\n" );
454
printf("Missing argument\n Usage %s <database name> [[-]bitmap filename] [+result filename]\n ", argv[0]);
455
printf("if bitmap filename begins with '-' it is loaded, else its created\n ");
456
printf("result filename MUST be preceded by '+'\n");
455
460
const char *input_DB_name = argv[1];
460
465
if (argv[2][0] == '+') {
461
466
result_filename = argv[2]+1;
463
469
bitmap_filename = argv[2];
467
473
if (argv[3][0] == '+') {
468
474
result_filename = argv[3]+1;
470
477
bitmap_filename = argv[3];
474
481
struct tms before;
476
printf( "Opening probe-set-database '%s'..\n", input_DB_name );
477
PS_Database *db = new PS_Database( input_DB_name, PS_Database::READONLY );
483
printf("Opening probe-set-database '%s'..\n", input_DB_name);
484
PS_Database *db = new PS_Database(input_DB_name, PS_Database::READONLY);
479
486
__MAX_ID = db->getMaxID();
480
487
__MIN_ID = db->getMinID();
481
PS_print_time_diff( &before, "(enter to continue) " );
488
PS_print_time_diff(&before, "(enter to continue) ");
484
__MAP = new PS_BitMap_Fast( false, __MAX_ID+1 );
490
__MAP = new PS_BitMap_Fast(false, __MAX_ID+1);
485
491
if (!bitmap_filename || (bitmap_filename[0] != '-')) {
486
printf( "detecting..\n" ); fflush( stdout );
487
__NODES_LEFT = (char*)malloc( 61 );
488
memset( __NODES_LEFT, ' ', 60 );
489
__NODES_LEFT[ 60 ] = '\x00';
490
PS_detect_weak_differences( db->getRootNode() );
491
free( __NODES_LEFT );
492
printf("detecting..\n"); fflush(stdout);
493
__NODES_LEFT = (char*)malloc(61);
494
memset(__NODES_LEFT, ' ', 60);
495
__NODES_LEFT[60] = '\x00';
496
PS_detect_weak_differences(db->getRootNode());
492
498
if (bitmap_filename) {
493
printf( "saving bitmap to file %s\n", bitmap_filename );
494
PS_FileBuffer *mapfile = new PS_FileBuffer( bitmap_filename, PS_FileBuffer::WRITEONLY );
495
__MAP->save( mapfile );
499
printf("saving bitmap to file %s\n", bitmap_filename);
500
PS_FileBuffer *mapfile = new PS_FileBuffer(bitmap_filename, PS_FileBuffer::WRITEONLY);
501
__MAP->save(mapfile);
498
} else if (bitmap_filename) {
499
printf( "loading bitmap from file %s\n",bitmap_filename+1 );
500
PS_FileBuffer *mapfile = new PS_FileBuffer( bitmap_filename+1, PS_FileBuffer::READONLY );
501
__MAP->load( mapfile );
505
else if (bitmap_filename) {
506
printf("loading bitmap from file %s\n", bitmap_filename+1);
507
PS_FileBuffer *mapfile = new PS_FileBuffer(bitmap_filename+1, PS_FileBuffer::READONLY);
508
__MAP->load(mapfile);
504
printf( "(enter to continue)\n" );
511
printf("(enter to continue)\n");
508
PS_print_and_evaluate_map( db->getRootNode(), result_filename );
509
PS_print_time_diff( &before, "(enter to continue) " );
514
PS_print_and_evaluate_map(db->getRootNode(), result_filename);
515
PS_print_time_diff(&before, "(enter to continue) ");
513
printf( "removing database from memory\n" );
518
printf("removing database from memory\n");
515
printf( "(enter to continue)\n" );
520
printf("(enter to continue)\n");