1
#ifndef BMRANDOM__H__INCLUDED__
2
#define BMRANDOM__H__INCLUDED__
4
Copyright(c) 2009 Anatoliy Kuznetsov(anatoliy_kuznetsov at yahoo.com)
6
Permission is hereby granted, free of charge, to any person
7
obtaining a copy of this software and associated documentation
8
files (the "Software"), to deal in the Software without restriction,
9
including without limitation the rights to use, copy, modify, merge,
10
publish, distribute, sublicense, and/or sell copies of the Software,
11
and to permit persons to whom the Software is furnished to do so,
12
subject to the following conditions:
14
The above copyright notice and this permission notice shall be included
15
in all copies or substantial portions of the Software.
17
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
18
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
19
OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
20
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
21
DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
22
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
23
OTHER DEALINGS IN THE SOFTWARE.
25
For more information please visit: http://bmagic.sourceforge.net
41
Class implements algorithm for random subset generation.
43
Implemented method tries to be fair, but doesn't guarantee
47
Class holds temporary buffers and variables, so it is recommended to
48
re-use instances over multiple calls.
59
/// Get random subset of input vector
61
/// @param bv_out - destination vector
62
/// @param bv_in - input vector
63
/// @param count - number of bits to pick
65
void sample(BV& bv_out, const BV& bv_in, unsigned count);
70
typename BV::blocks_manager_type blocks_manager_type;
73
void get_subset(BV& bv_out,
78
unsigned find_max_block();
80
void get_random_subset(bm::word_t* blk_out,
81
const bm::word_t* blk_src,
84
unsigned process_word(bm::word_t* blk_out,
85
const bm::word_t* blk_src,
90
void get_random_array(bm::word_t* blk_out,
91
bm::gap_word_t* bit_list,
92
unsigned bit_list_size,
97
random_subset(const random_subset&);
98
random_subset& operator=(const random_subset&);
100
unsigned* block_counts_;
101
unsigned short* block_bits_take_;
103
bm::gap_word_t bit_list_[bm::gap_max_bits];
104
unsigned* block_candidates_;
105
unsigned candidates_count_;
106
bm::word_t* sub_block_;
110
///////////////////////////////////////////////////////////////////
115
random_subset<BV>::random_subset()
116
: block_counts_(new unsigned[bm::set_total_blocks]),
117
block_bits_take_(new bm::gap_word_t[bm::set_total_blocks]),
118
block_candidates_(new unsigned[bm::set_total_blocks]),
119
candidates_count_(0),
120
sub_block_(new bm::word_t[bm::set_block_size])
125
random_subset<BV>::~random_subset()
127
delete [] block_counts_;
128
delete [] block_bits_take_;
129
delete [] block_candidates_;
130
delete [] sub_block_;
134
void random_subset<BV>::sample(BV& bv_out,
144
unsigned bcnt = bv_in.count();
152
// build the complement vector and subtract it
154
unsigned delta_count = bcnt - count;
156
get_subset(tmp_bv, bv_in, bcnt, delta_count);
162
get_subset(bv_out, bv_in, bcnt, count);
166
void random_subset<BV>::get_subset(BV& bv_out,
172
bv_out.resize(bv_in.size());
174
const blocks_manager_type& bman_in = bv_in.get_blocks_manager();
175
blocks_manager_type& bman_out = bv_out.get_blocks_manager();
177
bm::word_t* tmp_block = bman_out.check_allocate_tempblock();
178
candidates_count_ = 0;
181
// compute bit-counts in all blocks
183
unsigned block_count = blocks_ = bv_in.count_blocks(block_counts_);
184
for (unsigned i = 0; i <= block_count; ++i)
186
if (block_counts_[i])
188
float block_percent =
189
((float)(block_counts_[i]+1)) / (float)bcnt;
190
float bits_to_take = ((float)count) * block_percent;
191
bits_to_take += 0.99f;
193
unsigned t = (unsigned)bits_to_take;
194
block_bits_take_[i] = (bm::gap_word_t)t;
196
if (block_bits_take_[i] == 0)
198
block_bits_take_[i] = 1;
201
if (block_bits_take_[i] > block_counts_[i])
202
block_bits_take_[i] = block_counts_[i];
204
BM_ASSERT(bman_in.get_block(i));
208
block_bits_take_[i] = 0;
212
for (unsigned take_count = 0; count; count -= take_count)
214
unsigned i = find_max_block();
215
take_count = block_bits_take_[i];
216
if (take_count > count)
220
const bm::word_t* blk_src = bman_in.get_block(i);
223
// allocate target block
224
bm::word_t* blk_out = bman_out.get_block(i);
227
blk_out = bman_out.deoptimize_block(i);
231
blk_out = bman_out.get_allocator().alloc_bit_block();
232
bman_out.set_block(i, blk_out);
234
if (take_count == block_counts_[i])
236
// copy the whole src block
237
if (BM_IS_GAP(bman_in, blk_src, i))
239
gap_convert_to_bitset(blk_out, BMGAP_PTR(blk_src));
243
bm::bit_block_copy(blk_out, blk_src);
245
block_bits_take_[i] = 0; // exclude block from any farther picking
248
bit_block_set(blk_out, 0);
250
if (block_counts_[i] < 4096) // use array shuffle
253
// convert source block to bit-block
254
if (BM_IS_GAP(bman_in, blk_src, i))
256
arr_len = gap_convert_to_arr(bit_list_,
262
arr_len = bit_convert_to_arr(bit_list_,
269
get_random_array(blk_out, bit_list_, arr_len, take_count);
273
// convert source block to bit-block
274
if (BM_IS_GAP(bman_in, blk_src, i))
276
gap_convert_to_bitset(tmp_block, BMGAP_PTR(blk_src));
280
// pick random bits source block to target
281
get_random_subset(blk_out, blk_src, take_count);
284
block_bits_take_[i] = 0; // exclude block from any farther picking
289
void random_subset<BV>::get_random_subset(bm::word_t* blk_out,
290
const bm::word_t* blk_src,
293
for (unsigned rounds = 0; take_count && rounds < 10; ++rounds)
295
// pick random scan start and scan direction
297
unsigned i = rand() % bm::set_block_size;
300
for (; i < bm::set_block_size && take_count; ++i)
302
if (blk_src[i] && (blk_out[i] != blk_src[i]))
304
take_count -= new_count =
305
process_word(blk_out, blk_src, i, take_count);
310
// if masked scan did not produce enough results..
314
// Find all vacant bits: do logical (src SUB out)
315
for (unsigned i = 0; i < bm::set_block_size; ++i)
317
sub_block_[i] = blk_src[i] & ~blk_out[i];
319
// now transform vacant bits to array, then pick random elements
321
unsigned arr_len = bit_convert_to_arr(bit_list_,
327
get_random_array(blk_out, bit_list_, arr_len, take_count);
332
unsigned random_subset<BV>::process_word(bm::word_t* blk_out,
333
const bm::word_t* blk_src,
337
unsigned new_bits, mask;
345
unsigned src_rand = blk_src[i] & mask;
346
new_bits = src_rand & ~blk_out[i];
350
unsigned new_count = bm::word_bitcount(new_bits);
352
// check if we accidentally picked more bits than needed
353
if (new_count > count)
357
unsigned char blist[64];
358
unsigned arr_size = bm::bit_list_4(new_bits, blist);
359
BM_ASSERT(arr_size == new_count);
360
std::random_shuffle(blist, blist + arr_size);
362
for (unsigned j = 0; j < count; ++j)
364
value |= (1 << blist[j]);
369
BM_ASSERT(bm::word_bitcount(new_bits) == count);
370
BM_ASSERT((new_bits & ~blk_src[i]) == 0);
373
blk_out[i] |= new_bits;
382
void random_subset<BV>::get_random_array(bm::word_t* blk_out,
383
bm::gap_word_t* bit_list,
384
unsigned bit_list_size,
387
std::random_shuffle(bit_list, bit_list + bit_list_size);
388
for (unsigned i = 0; i < count; ++i)
390
bm::set_bit(blk_out, bit_list[i]);
395
unsigned random_subset<BV>::find_max_block()
397
if (candidates_count_)
399
return block_candidates_[--candidates_count_];
402
unsigned candidate = 0;
404
for (unsigned i = 0; i <= blocks_; ++i)
406
if (block_bits_take_[i] == 0) continue;
407
if (block_bits_take_[i] == candidate)
409
block_candidates_[candidates_count_++] = i;
413
unsigned diff = abs((int)block_bits_take_[i] - (int)candidate);
414
double d = (double)diff / (double)candidate;
416
if (d < 0.20f) // delta is statistically insignificant
418
block_candidates_[candidates_count_++] = i;
421
if (block_bits_take_[i] > candidate)
423
candidate = block_bits_take_[i];
425
candidates_count_ = 0;
426
block_candidates_[candidates_count_++] = i;
431
if (candidates_count_ > 1)
433
std::random_shuffle(block_candidates_, block_candidates_ + candidates_count_);
434
return find_max_block();