00001 #ifndef BMRANDOM__H__INCLUDED__
00002 #define BMRANDOM__H__INCLUDED__
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029 #include "bm.h"
00030 #include "bmfunc.h"
00031 #include "bmdef.h"
00032
00033 #include <stdlib.h>
00034 #include <algorithm>
00035
00036
00037 namespace bm
00038 {
00039
00040
00041
00042
00043
00044
00045
00046
00047
00048
00049
00050
00051
00052 template<class BV>
00053 class random_subset
00054 {
00055 public:
00056 random_subset();
00057 ~random_subset();
00058
00059
00060
00061
00062
00063
00064
00065 void sample(BV& bv_out, const BV& bv_in, unsigned count);
00066
00067
00068 private:
00069 typedef
00070 typename BV::blocks_manager_type blocks_manager_type;
00071
00072 private:
00073 void get_subset(BV& bv_out,
00074 const BV& bv_in,
00075 unsigned bv_in_count,
00076 unsigned count);
00077
00078 unsigned find_max_block();
00079
00080 void get_random_subset(bm::word_t* blk_out,
00081 const bm::word_t* blk_src,
00082 unsigned count);
00083 static
00084 unsigned process_word(bm::word_t* blk_out,
00085 const bm::word_t* blk_src,
00086 unsigned i,
00087 unsigned count);
00088
00089 static
00090 void get_random_array(bm::word_t* blk_out,
00091 bm::gap_word_t* bit_list,
00092 unsigned bit_list_size,
00093 unsigned count);
00094
00095
00096 private:
00097 random_subset(const random_subset&);
00098 random_subset& operator=(const random_subset&);
00099 private:
00100 unsigned* block_counts_;
00101 unsigned short* block_bits_take_;
00102 unsigned blocks_;
00103 bm::gap_word_t bit_list_[bm::gap_max_bits];
00104 unsigned* block_candidates_;
00105 unsigned candidates_count_;
00106 bm::word_t* sub_block_;
00107 };
00108
00109
00110
00111
00112
00113
00114 template<class BV>
00115 random_subset<BV>::random_subset()
00116 : block_counts_(new unsigned[bm::set_total_blocks]),
00117 block_bits_take_(new bm::gap_word_t[bm::set_total_blocks]),
00118 block_candidates_(new unsigned[bm::set_total_blocks]),
00119 candidates_count_(0),
00120 sub_block_(new bm::word_t[bm::set_block_size])
00121 {
00122 }
00123
00124 template<class BV>
00125 random_subset<BV>::~random_subset()
00126 {
00127 delete [] block_counts_;
00128 delete [] block_bits_take_;
00129 delete [] block_candidates_;
00130 delete [] sub_block_;
00131 }
00132
00133 template<class BV>
00134 void random_subset<BV>::sample(BV& bv_out,
00135 const BV& bv_in,
00136 unsigned count)
00137 {
00138 if (count == 0)
00139 {
00140 bv_out.clear(true);
00141 return;
00142 }
00143
00144 unsigned bcnt = bv_in.count();
00145 if (count >= bcnt)
00146 {
00147 bv_out = bv_in;
00148 return;
00149 }
00150 if (count > bcnt/2)
00151 {
00152
00153 BV tmp_bv;
00154 unsigned delta_count = bcnt - count;
00155
00156 get_subset(tmp_bv, bv_in, bcnt, delta_count);
00157 bv_out = bv_in;
00158 bv_out -= tmp_bv;
00159 return;
00160 }
00161
00162 get_subset(bv_out, bv_in, bcnt, count);
00163 }
00164
00165 template<class BV>
00166 void random_subset<BV>::get_subset(BV& bv_out,
00167 const BV& bv_in,
00168 unsigned bcnt,
00169 unsigned count)
00170 {
00171 bv_out.clear(true);
00172 bv_out.resize(bv_in.size());
00173
00174 const blocks_manager_type& bman_in = bv_in.get_blocks_manager();
00175 blocks_manager_type& bman_out = bv_out.get_blocks_manager();
00176
00177 bm::word_t* tmp_block = bman_out.check_allocate_tempblock();
00178 candidates_count_ = 0;
00179
00180
00181
00182
00183 unsigned block_count = blocks_ = bv_in.count_blocks(block_counts_);
00184 for (unsigned i = 0; i <= block_count; ++i)
00185 {
00186 if (block_counts_[i])
00187 {
00188 float block_percent =
00189 ((float)(block_counts_[i]+1)) / (float)bcnt;
00190 float bits_to_take = ((float)count) * block_percent;
00191 bits_to_take += 0.99f;
00192
00193 unsigned t = (unsigned)bits_to_take;
00194 block_bits_take_[i] = (bm::gap_word_t)t;
00195
00196 if (block_bits_take_[i] == 0)
00197 {
00198 block_bits_take_[i] = 1;
00199 }
00200 else
00201 if (block_bits_take_[i] > block_counts_[i])
00202 block_bits_take_[i] = block_counts_[i];
00203
00204 BM_ASSERT(bman_in.get_block(i));
00205 }
00206 else
00207 {
00208 block_bits_take_[i] = 0;
00209 }
00210 }
00211
00212 for (unsigned take_count = 0; count; count -= take_count)
00213 {
00214 unsigned i = find_max_block();
00215 take_count = block_bits_take_[i];
00216 if (take_count > count)
00217 take_count = count;
00218 if (take_count == 0)
00219 continue;
00220 const bm::word_t* blk_src = bman_in.get_block(i);
00221 BM_ASSERT(blk_src);
00222
00223
00224 bm::word_t* blk_out = bman_out.get_block(i);
00225 if (blk_out != 0)
00226 {
00227 blk_out = bman_out.deoptimize_block(i);
00228 }
00229 else
00230 {
00231 blk_out = bman_out.get_allocator().alloc_bit_block();
00232 bman_out.set_block(i, blk_out);
00233 }
00234 if (take_count == block_counts_[i])
00235 {
00236
00237 if (BM_IS_GAP(bman_in, blk_src, i))
00238 {
00239 gap_convert_to_bitset(blk_out, BMGAP_PTR(blk_src));
00240 }
00241 else
00242 {
00243 bm::bit_block_copy(blk_out, blk_src);
00244 }
00245 block_bits_take_[i] = 0;
00246 continue;
00247 }
00248 bit_block_set(blk_out, 0);
00249
00250 if (block_counts_[i] < 4096)
00251 {
00252 unsigned arr_len;
00253
00254 if (BM_IS_GAP(bman_in, blk_src, i))
00255 {
00256 arr_len = gap_convert_to_arr(bit_list_,
00257 BMGAP_PTR(blk_src),
00258 bm::gap_max_bits);
00259 }
00260 else
00261 {
00262 arr_len = bit_convert_to_arr(bit_list_,
00263 blk_src,
00264 bm::gap_max_bits,
00265 bm::gap_max_bits,
00266 0);
00267 }
00268 BM_ASSERT(arr_len);
00269 get_random_array(blk_out, bit_list_, arr_len, take_count);
00270 }
00271 else
00272 {
00273
00274 if (BM_IS_GAP(bman_in, blk_src, i))
00275 {
00276 gap_convert_to_bitset(tmp_block, BMGAP_PTR(blk_src));
00277 blk_src = tmp_block;
00278 }
00279
00280
00281 get_random_subset(blk_out, blk_src, take_count);
00282 }
00283
00284 block_bits_take_[i] = 0;
00285 }
00286 }
00287
00288 template<class BV>
00289 void random_subset<BV>::get_random_subset(bm::word_t* blk_out,
00290 const bm::word_t* blk_src,
00291 unsigned take_count)
00292 {
00293 for (unsigned rounds = 0; take_count && rounds < 10; ++rounds)
00294 {
00295
00296
00297 unsigned i = rand() % bm::set_block_size;
00298 unsigned new_count;
00299
00300 for (; i < bm::set_block_size && take_count; ++i)
00301 {
00302 if (blk_src[i] && (blk_out[i] != blk_src[i]))
00303 {
00304 take_count -= new_count =
00305 process_word(blk_out, blk_src, i, take_count);
00306 }
00307 }
00308
00309 }
00310
00311
00312 if (take_count)
00313 {
00314
00315 for (unsigned i = 0; i < bm::set_block_size; ++i)
00316 {
00317 sub_block_[i] = blk_src[i] & ~blk_out[i];
00318 }
00319
00320
00321 unsigned arr_len = bit_convert_to_arr(bit_list_,
00322 sub_block_,
00323 bm::gap_max_bits,
00324 bm::gap_max_bits,
00325 0);
00326 BM_ASSERT(arr_len);
00327 get_random_array(blk_out, bit_list_, arr_len, take_count);
00328 }
00329 }
00330
00331 template<class BV>
00332 unsigned random_subset<BV>::process_word(bm::word_t* blk_out,
00333 const bm::word_t* blk_src,
00334 unsigned i,
00335 unsigned count)
00336 {
00337 unsigned new_bits, mask;
00338
00339 do
00340 {
00341 mask = rand();
00342 mask ^= mask << 16;
00343 } while (mask == 0);
00344
00345 unsigned src_rand = blk_src[i] & mask;
00346 new_bits = src_rand & ~blk_out[i];
00347
00348 if (new_bits)
00349 {
00350 unsigned new_count = bm::word_bitcount(new_bits);
00351
00352
00353 if (new_count > count)
00354 {
00355 BM_ASSERT(count);
00356
00357 unsigned char blist[64];
00358 unsigned arr_size = bm::bit_list_4(new_bits, blist);
00359 BM_ASSERT(arr_size == new_count);
00360 std::random_shuffle(blist, blist + arr_size);
00361 unsigned value = 0;
00362 for (unsigned j = 0; j < count; ++j)
00363 {
00364 value |= (1 << blist[j]);
00365 }
00366 new_bits = value;
00367 new_count = count;
00368
00369 BM_ASSERT(bm::word_bitcount(new_bits) == count);
00370 BM_ASSERT((new_bits & ~blk_src[i]) == 0);
00371 }
00372
00373 blk_out[i] |= new_bits;
00374 return new_count;
00375 }
00376
00377 return 0;
00378 }
00379
00380
00381 template<class BV>
00382 void random_subset<BV>::get_random_array(bm::word_t* blk_out,
00383 bm::gap_word_t* bit_list,
00384 unsigned bit_list_size,
00385 unsigned count)
00386 {
00387 std::random_shuffle(bit_list, bit_list + bit_list_size);
00388 for (unsigned i = 0; i < count; ++i)
00389 {
00390 bm::set_bit(blk_out, bit_list[i]);
00391 }
00392 }
00393
00394 template<class BV>
00395 unsigned random_subset<BV>::find_max_block()
00396 {
00397 if (candidates_count_)
00398 {
00399 return block_candidates_[--candidates_count_];
00400 }
00401
00402 unsigned candidate = 0;
00403 unsigned max_i = 0;
00404 for (unsigned i = 0; i <= blocks_; ++i)
00405 {
00406 if (block_bits_take_[i] == 0) continue;
00407 if (block_bits_take_[i] == candidate)
00408 {
00409 block_candidates_[candidates_count_++] = i;
00410 }
00411 else
00412 {
00413 unsigned diff = abs((int)block_bits_take_[i] - (int)candidate);
00414 double d = (double)diff / (double)candidate;
00415
00416 if (d < 0.20f)
00417 {
00418 block_candidates_[candidates_count_++] = i;
00419 }
00420 else
00421 if (block_bits_take_[i] > candidate)
00422 {
00423 candidate = block_bits_take_[i];
00424 max_i = i;
00425 candidates_count_ = 0;
00426 block_candidates_[candidates_count_++] = i;
00427 }
00428 }
00429 }
00430
00431 if (candidates_count_ > 1)
00432 {
00433 std::random_shuffle(block_candidates_, block_candidates_ + candidates_count_);
00434 return find_max_block();
00435 }
00436 return max_i;
00437 }
00438
00439
00440 }
00441
00442 #include "bmundef.h"
00443
00444 #endif