2
Copyright(c) 2002-2005 Anatoliy Kuznetsov(anatoliy_kuznetsov at yahoo.com)
4
Permission is hereby granted, free of charge, to any person
5
obtaining a copy of this software and associated documentation
6
files (the "Software"), to deal in the Software without restriction,
7
including without limitation the rights to use, copy, modify, merge,
8
publish, distribute, sublicense, and/or sell copies of the Software,
9
and to permit persons to whom the Software is furnished to do so,
10
subject to the following conditions:
12
The above copyright notice and this permission notice shall be included
13
in all copies or substantial portions of the Software.
15
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
17
OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
18
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
19
DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
20
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
21
OTHER DEALINGS IN THE SOFTWARE.
23
For more information please visit: http://bmagic.sourceforge.net
27
#ifndef BMALGO__H__INCLUDED__
28
#define BMALGO__H__INCLUDED__
37
/*! \defgroup setalgo Set algorithms
42
/*! \defgroup distance Distance metrics
43
* Algorithms to compute binary distance metrics
49
\brief Distance metrics codes defined for vectors A and B
54
COUNT_AND, //!< (A & B).count()
55
COUNT_XOR, //!< (A ^ B).count()
56
COUNT_OR, //!< (A | B).count()
57
COUNT_SUB_AB, //!< (A - B).count()
58
COUNT_SUB_BA, //!< (B - A).count()
59
COUNT_A, //!< A.count()
60
COUNT_B //!< B.count()
64
\brief Distance metric descriptor, holds metric code and result.
65
\sa distance_operation
68
struct distance_metric_descriptor
70
distance_metric metric;
73
distance_metric_descriptor(distance_metric m)
77
distance_metric_descriptor()
78
: metric(bm::COUNT_XOR),
83
\brief Sets metric result to 0
93
\brief Internal function computes different distance metrics.
99
void combine_count_operation_with_block(const bm::word_t* blk,
101
const bm::word_t* arg_blk,
103
bm::word_t* temp_blk,
104
distance_metric_descriptor* dmit,
105
distance_metric_descriptor* dmit_end)
110
gap_word_t* g1 = BMGAP_PTR(blk);
111
gap_word_t* g2 = BMGAP_PTR(arg_blk);
113
if (gap) // first block GAP-type
115
if (arg_gap) // both blocks GAP-type
117
gap_word_t tmp_buf[bm::gap_max_buff_len * 3]; // temporary result
119
for (distance_metric_descriptor* it = dmit;it < dmit_end; ++it)
121
distance_metric_descriptor& dmd = *it;
126
res = gap_operation_and(g1, g2, tmp_buf);
129
res = gap_operation_or(g1, g2, tmp_buf);
131
case bm::COUNT_SUB_AB:
132
res = gap_operation_sub(g1, g2, tmp_buf);
134
case bm::COUNT_SUB_BA:
135
res = gap_operation_sub(g2, g1, tmp_buf);
138
res = gap_operation_xor(g1, g2, tmp_buf);
149
dmd.result += gap_bit_count(res);
156
else // first block - GAP, argument - BITSET
158
for (distance_metric_descriptor* it = dmit;it < dmit_end; ++it)
160
distance_metric_descriptor& dmd = *it;
166
dmd.result += gap_bitset_and_count(arg_blk, g1);
170
dmd.result += gap_bit_count(g1);
172
dmd.result += gap_bitset_or_count(arg_blk, g1);
174
case bm::COUNT_SUB_AB:
175
gap_convert_to_bitset((bm::word_t*) temp_blk, g1);
177
bit_operation_sub_count((bm::word_t*)temp_blk,
178
((bm::word_t*)temp_blk) + bm::set_block_size,
182
case bm::COUNT_SUB_BA:
183
dmd.metric = bm::COUNT_SUB_AB; // recursive call to SUB_AB
184
combine_count_operation_with_block(arg_blk,
190
dmd.metric = bm::COUNT_SUB_BA; // restore status quo
194
dmd.result += gap_bit_count(g1);
196
dmd.result += gap_bitset_xor_count(arg_blk, g1);
200
dmd.result += gap_bit_count(g1);
206
bit_block_calc_count(arg_blk,
207
arg_blk + bm::set_block_size);
218
else // first block is BITSET-type
220
if (arg_gap) // second argument block is GAP-type
222
for (distance_metric_descriptor* it = dmit;it < dmit_end; ++it)
224
distance_metric_descriptor& dmd = *it;
230
dmd.result += gap_bitset_and_count(blk, g2);
234
dmd.result += gap_bit_count(g2);
236
dmd.result += gap_bitset_or_count(blk, g2);
238
case bm::COUNT_SUB_AB:
240
dmd.result += gap_bitset_sub_count(blk, g2);
242
case bm::COUNT_SUB_BA:
243
dmd.metric = bm::COUNT_SUB_AB; // recursive call to SUB_AB
244
combine_count_operation_with_block(arg_blk,
250
dmd.metric = bm::COUNT_SUB_BA; // restore status quo
254
dmd.result += gap_bit_count(g2);
256
dmd.result += gap_bitset_xor_count(blk, g2);
262
bit_block_calc_count(blk,
263
blk + bm::set_block_size);
268
dmd.result += gap_bit_count(g2);
278
// --------------------------------------------
280
// Here we combine two plain bitblocks
282
const bm::word_t* blk_end;
283
const bm::word_t* arg_end;
285
blk_end = blk + (bm::set_block_size);
286
arg_end = arg_blk + (bm::set_block_size);
289
for (distance_metric_descriptor* it = dmit; it < dmit_end; ++it)
291
distance_metric_descriptor& dmd = *it;
297
bit_operation_and_count(blk, blk_end, arg_blk);
301
bit_operation_or_count(blk, blk_end, arg_blk);
303
case bm::COUNT_SUB_AB:
305
bit_operation_sub_count(blk, blk_end, arg_blk);
307
case bm::COUNT_SUB_BA:
309
bit_operation_sub_count(arg_blk, arg_end, blk);
313
bit_operation_xor_count(blk, blk_end, arg_blk);
317
dmd.result += bit_block_calc_count(blk, blk_end);
321
dmd.result += bit_block_calc_count(arg_blk, arg_end);
332
\brief Distance computing template function.
334
Function receives two bitvectors and an array of distance metrics
335
(metrics pipeline). Function computes all metrics saves result into
336
corresponding pipeline results (distance_metric_descriptor::result)
337
An important detail is that function reuses metric descriptors,
338
incrementing received values. It allows you to accumulate results
339
from different calls in the pipeline.
341
\param bv1 - argument bitvector 1 (A)
342
\param bv2 - argument bitvector 2 (B)
343
\param dmit - pointer to first element of metric descriptors array
344
Input-Output parameter, receives metric code as input,
345
computation is added to "result" field
346
\param dmit_end - pointer to (last+1) element of metric descriptors array
352
void distance_operation(const BV& bv1,
354
distance_metric_descriptor* dmit,
355
distance_metric_descriptor* dmit_end)
357
const typename BV::blocks_manager_type& bman1 = bv1.get_blocks_manager();
358
const typename BV::blocks_manager_type& bman2 = bv2.get_blocks_manager();
360
bm::word_t* temp_blk = 0;
363
for (distance_metric_descriptor* it = dmit; it < dmit_end; ++it)
365
if (it->metric == bm::COUNT_SUB_AB ||
366
it->metric == bm::COUNT_SUB_BA)
368
temp_blk = bv1.allocate_tempblock();
375
bm::word_t*** blk_root = bman1.get_rootblock();
376
unsigned block_idx = 0;
379
const bm::word_t* blk;
380
const bm::word_t* arg_blk;
386
for (i = 0; i < bman1.top_block_size(); ++i)
388
bm::word_t** blk_blk = blk_root[i];
390
if (blk_blk == 0) // not allocated
392
const bm::word_t* const* bvbb = bman2.get_topblock(i);
395
block_idx += bm::set_array_size;
402
for (j = 0; j < bm::set_array_size; ++j,++block_idx)
404
arg_blk = bman2.get_block(i, j);
405
arg_gap = BM_IS_GAP(bman2, arg_blk, block_idx);
409
combine_count_operation_with_block(blk, blk_gap,
417
for (j = 0; j < bm::set_array_size; ++j, ++block_idx)
420
arg_blk = bman2.get_block(i, j);
422
if (blk == 0 && arg_blk == 0)
425
arg_gap = BM_IS_GAP(bman2, arg_blk, block_idx);
426
blk_gap = BM_IS_GAP(bman1, blk, block_idx);
428
combine_count_operation_with_block(blk, blk_gap,
440
bv1.free_tempblock(temp_blk);
448
\brief Computes bitcount of AND operation of two bitsets
449
\param bv1 - Argument bit-vector.
450
\param bv2 - Argument bit-vector.
451
\return bitcount of the result
455
bm::id_t count_and(const BV& bv1, const BV& bv2)
457
distance_metric_descriptor dmd(bm::COUNT_AND);
459
distance_operation(bv1, bv2, &dmd, &dmd+1);
465
\brief Computes bitcount of XOR operation of two bitsets
466
\param bv1 - Argument bit-vector.
467
\param bv2 - Argument bit-vector.
468
\return bitcount of the result
472
bm::id_t count_xor(const BV& bv1, const BV& bv2)
474
distance_metric_descriptor dmd(bm::COUNT_XOR);
476
distance_operation(bv1, bv2, &dmd, &dmd+1);
482
\brief Computes bitcount of SUB operation of two bitsets
483
\param bv1 - Argument bit-vector.
484
\param bv2 - Argument bit-vector.
485
\return bitcount of the result
489
bm::id_t count_sub(const BV& bv1, const BV& bv2)
491
distance_metric_descriptor dmd(bm::COUNT_SUB_AB);
493
distance_operation(bv1, bv2, &dmd, &dmd+1);
499
\brief Computes bitcount of OR operation of two bitsets
500
\param bv1 - Argument bit-vector.
501
\param bv2 - Argument bit-vector.
502
\return bitcount of the result
506
bm::id_t count_or(const BV& bv1, const BV& bv2)
508
distance_metric_descriptor dmd(bm::COUNT_OR);
510
distance_operation(bv1, bv2, &dmd, &dmd+1);
516
\brief Internal algorithms scans the input for the block range limit
520
It block_range_scan(It first, It last, unsigned nblock, unsigned* max_id)
523
for (right = first; right != last; ++right)
525
unsigned v = unsigned(*right);
526
BM_ASSERT(v < bm::id_max);
529
unsigned nb = v >> bm::set_block_shift;
537
\brief OR Combine bitvector and the iterable sequence
539
Algorithm combines bvector with sequence of integers
540
(represents another set). When the agrument set is sorted
541
this method can give serious increase in performance.
543
\param bv - destination bitvector
544
\param first - first element of the iterated sequence
545
\param last - last element of the iterated sequence
549
template<class BV, class It>
550
void combine_or(BV& bv, It first, It last)
552
typename BV::blocks_manager_type& bman = bv.get_blocks_manager();
553
unsigned max_id = bv.size();
557
unsigned nblock = unsigned((*first) >> bm::set_block_shift);
558
It right = block_range_scan(first, last, nblock, &max_id);
560
if (max_id >= bv.size())
561
bv.resize(max_id + 1);
563
// now we have one in-block array of bits to set
569
bman.check_allocate_block(nblock,
571
bv.get_new_blocks_strat(),
576
if (block_type == 1) // gap
578
bm::gap_word_t* gap_blk = BMGAP_PTR(blk);
579
unsigned threshold = bm::gap_limit(gap_blk, bman.glen());
581
for (; first < right; ++first)
584
unsigned nbit = (*first) & bm::set_block_mask;
586
unsigned new_block_len =
587
gap_set_value(true, gap_blk, nbit, &is_set);
588
if (new_block_len > threshold)
590
bman.extend_gap_block(nblock, gap_blk);
592
goto label1; // block pointer changed - goto reset
598
for (; first < right; ++first)
600
unsigned nbit = unsigned(*first & bm::set_block_mask);
601
unsigned nword = unsigned(nbit >> bm::set_word_shift);
602
nbit &= bm::set_word_mask;
603
blk[nword] |= (bm::word_t)1 << nbit;
613
\brief XOR Combine bitvector and the iterable sequence
615
Algorithm combines bvector with sequence of integers
616
(represents another set). When the agrument set is sorted
617
this method can give serious increase in performance.
619
\param bv - destination bitvector
620
\param first - first element of the iterated sequence
621
\param last - last element of the iterated sequence
625
template<class BV, class It>
626
void combine_xor(BV& bv, It first, It last)
628
typename BV::blocks_manager_type& bman = bv.get_blocks_manager();
629
unsigned max_id = bv.size();
633
unsigned nblock = unsigned((*first) >> bm::set_block_shift);
634
It right = block_range_scan(first, last, nblock, &max_id);
636
if (max_id >= bv.size())
637
bv.resize(max_id + 1);
639
// now we have one in-block array of bits to set
645
bman.check_allocate_block(nblock,
647
bv.get_new_blocks_strat(),
649
false /* no null return */);
652
if (block_type == 1) // gap
654
bm::gap_word_t* gap_blk = BMGAP_PTR(blk);
655
unsigned threshold = bm::gap_limit(gap_blk, bman.glen());
657
for (; first < right; ++first)
660
unsigned nbit = (*first) & bm::set_block_mask;
662
is_set = gap_test(gap_blk, nbit);
663
BM_ASSERT(is_set <= 1);
666
unsigned new_block_len =
667
gap_set_value(is_set, gap_blk, nbit, &is_set);
668
if (new_block_len > threshold)
670
bman.extend_gap_block(nblock, gap_blk);
672
goto label1; // block pointer changed - goto reset
678
for (; first < right; ++first)
680
unsigned nbit = unsigned(*first & bm::set_block_mask);
681
unsigned nword = unsigned(nbit >> bm::set_word_shift);
682
nbit &= bm::set_word_mask;
683
blk[nword] ^= (bm::word_t)1 << nbit;
694
\brief SUB Combine bitvector and the iterable sequence
696
Algorithm combines bvector with sequence of integers
697
(represents another set). When the agrument set is sorted
698
this method can give serious increase in performance.
700
\param bv - destination bitvector
701
\param first - first element of the iterated sequence
702
\param last - last element of the iterated sequence
706
template<class BV, class It>
707
void combine_sub(BV& bv, It first, It last)
709
typename BV::blocks_manager_type& bman = bv.get_blocks_manager();
710
unsigned max_id = bv.size();
714
unsigned nblock = unsigned((*first) >> bm::set_block_shift);
715
It right = block_range_scan(first, last, nblock, &max_id);
717
if (max_id >= bv.size())
718
bv.resize(max_id + 1);
720
// now we have one in-block array of bits to set
726
bman.check_allocate_block(nblock,
728
bv.get_new_blocks_strat(),
734
if (block_type == 1) // gap
736
bm::gap_word_t* gap_blk = BMGAP_PTR(blk);
737
unsigned threshold = bm::gap_limit(gap_blk, bman.glen());
739
for (; first < right; ++first)
742
unsigned nbit = (*first) & bm::set_block_mask;
744
is_set = gap_test(gap_blk, nbit);
748
unsigned new_block_len =
749
gap_set_value(false, gap_blk, nbit, &is_set);
750
if (new_block_len > threshold)
752
bman.extend_gap_block(nblock, gap_blk);
754
goto label1; // block pointer changed - goto reset
760
for (; first < right; ++first)
762
unsigned nbit = unsigned(*first & bm::set_block_mask);
763
unsigned nword = unsigned(nbit >> bm::set_word_shift);
764
nbit &= bm::set_word_mask;
765
blk[nword] &= ~((bm::word_t)1 << nbit);
774
\brief AND Combine bitvector and the iterable sequence
776
Algorithm combines bvector with sequence of integers
777
(represents another set). When the agrument set is sorted
778
this method can give serious increase in performance.
780
\param bv - destination bitvector
781
\param first - first element of the iterated sequence
782
\param last - last element of the iterated sequence
786
template<class BV, class It>
787
void combine_and(BV& bv, It first, It last)
790
combine_or(bv_tmp, first, last);
795
\brief Compute number of bit intervals (GAPs) in the bitvector
797
Algorithm traverses bit vector and count number of uninterrupted
798
intervals of 1s and 0s.
801
00001111100000 - gives us 3 intervals
802
10001111100000 - 4 intervals
803
00000000000000 - 1 interval
804
11111111111111 - 1 interval
806
\return Number of intervals
810
bm::id_t count_intervals(const BV& bv)
812
const typename BV::blocks_manager_type& bman = bv.get_blocks_manager();
814
bm::word_t*** blk_root = bman.get_rootblock();
815
typename BV::blocks_manager_type::block_count_change_func func(bman);
816
for_each_block(blk_root, bman.top_block_size(), bm::set_array_size, func);
822
\brief Export bitset from an array of binary data representing
825
Input array can be array of ints, chars or other native C types.
826
Method works with iterators, which makes it compatible with
827
STL containers and C arrays
829
\param bv - destination bitvector
830
\param first - first element of the iterated sequence
831
\param last - last element of the iterated sequence
835
template<class BV, class It>
836
void export_array(BV& bv, It first, It last)
838
typename BV::blocks_manager_type& bman = bv.get_blocks_manager();
839
unsigned inp_word_size = sizeof(*first);
840
size_t array_size = last - first;
841
size_t bit_cnt = array_size * inp_word_size * 8;
844
unsigned b1, b2, b3, b4;
846
if (bit_cnt >= bv.size())
847
bv.resize((bm::id_t)bit_cnt + 1);
849
bv.set_range((bm::id_t)bit_cnt, bv.size() - 1, false);
851
switch (inp_word_size)
855
size_t word_cnt = array_size / 4;
856
for (unsigned i = 0; i < bm::set_total_blocks; ++i)
859
bman.check_allocate_block(i,
863
false /*no NULL ret*/);
864
if (block_type == 1) // gap
866
blk = bman.convert_gap2bitset(i, BMGAP_PTR(blk));
869
bm::word_t* wrd_ptr = blk;
870
if (word_cnt >= bm::set_block_size) {
871
bm::word_t* wrd_end = blk + bm::set_block_size;
873
b1 = *first++; b2 = *first++;
874
b3 = *first++; b4 = *first++;
875
tmp = b1 | (b2 << 8) | (b3 << 16) | (b4 << 24);
877
} while (wrd_ptr < wrd_end);
878
word_cnt -= bm::set_block_size;
882
size_t to_convert = last - first;
883
for (size_t j = 0; j < to_convert / 4; ++j)
885
b1 = *first++; b2 = *first++;
886
b3 = *first++; b4 = *first++;
887
tmp = b1 | (b2 << 8) | (b3 << 16) | (b4 << 24);
892
if (first == last) break;
894
if (first == last) break;
895
*wrd_ptr |= (*first++) << 8;
896
if (first == last) break;
897
*wrd_ptr |= (*first++) << 16;
898
if (first == last) break;
899
*wrd_ptr |= (*first++) << 24;
903
if (first == last) break;
909
size_t word_cnt = array_size / 2;
910
for (unsigned i = 0; i < bm::set_total_blocks; ++i)
913
bman.check_allocate_block(i,
917
false /*no NULL ret*/);
918
if (block_type == 1) // gap
920
blk = bman.convert_gap2bitset(i, BMGAP_PTR(blk));
923
bm::word_t* wrd_ptr = blk;
924
if (word_cnt >= bm::set_block_size) {
925
bm::word_t* wrd_end = blk + bm::set_block_size;
927
b1 = *first++; b2 = *first++;
928
tmp = b1 | (b2 << 16);
930
} while (wrd_ptr < wrd_end);
931
word_cnt -= bm::set_block_size;
935
size_t to_convert = last - first;
936
for (unsigned j = 0; j < to_convert / 2; ++j)
938
b1 = *first++; b2 = *first++;
939
tmp = b1 | (b2 << 16);
944
if (first == last) break;
946
if (first == last) break;
947
*wrd_ptr |= (*first++) << 16;
951
if (first == last) break;
957
size_t word_cnt = array_size;
958
for (unsigned i = 0; i < bm::set_total_blocks; ++i)
961
bman.check_allocate_block(i,
965
false /*no NULL ret*/);
966
if (block_type == 1) // gap
968
blk = bman.convert_gap2bitset(i, BMGAP_PTR(blk));
971
bm::word_t* wrd_ptr = blk;
972
if (word_cnt >= bm::set_block_size) {
973
bm::word_t* wrd_end = blk + bm::set_block_size;
975
*wrd_ptr++ = *first++;
976
} while (wrd_ptr < wrd_end);
977
word_cnt -= bm::set_block_size;
983
if (first == last) break;
988
if (first == last) break;
1001
#include "bmundef.h"
2
Copyright(c) 2002-2005 Anatoliy Kuznetsov(anatoliy_kuznetsov at yahoo.com)
4
Permission is hereby granted, free of charge, to any person
5
obtaining a copy of this software and associated documentation
6
files (the "Software"), to deal in the Software without restriction,
7
including without limitation the rights to use, copy, modify, merge,
8
publish, distribute, sublicense, and/or sell copies of the Software,
9
and to permit persons to whom the Software is furnished to do so,
10
subject to the following conditions:
12
The above copyright notice and this permission notice shall be included
13
in all copies or substantial portions of the Software.
15
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
17
OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
18
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
19
DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
20
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
21
OTHER DEALINGS IN THE SOFTWARE.
23
For more information please visit: http://bmagic.sourceforge.net
27
#ifndef BMALGO__H__INCLUDED__
28
#define BMALGO__H__INCLUDED__
34
#include "bmalgo_impl.h"