1
/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2
/* This Source Code Form is subject to the terms of the Mozilla Public
3
* License, v. 2.0. If a copy of the MPL was not distributed with this file,
4
* You can obtain one at http://mozilla.org/MPL/2.0/. */
7
* A counting Bloom filter implementation. This allows consumers to
8
* do fast probabilistic "is item X in set Y?" testing which will
9
* never answer "no" when the correct answer is "yes" (but might
10
* incorrectly answer "yes" when the correct answer is "no").
13
#ifndef mozilla_BloomFilter_h_
14
#define mozilla_BloomFilter_h_
16
#include "mozilla/Likely.h"
17
#include "mozilla/StandardInteger.h"
18
#include "mozilla/Util.h"
25
* This class implements a counting Bloom filter as described at
26
* <http://en.wikipedia.org/wiki/Bloom_filter#Counting_filters>, with
27
* 8-bit counters. This allows quick probabilistic answers to the
28
* question "is object X in set Y?" where the contents of Y might not
29
* be time-invariant. The probabilistic nature of the test means that
30
* sometimes the answer will be "yes" when it should be "no". If the
31
* answer is "no", then X is guaranteed not to be in Y.
33
* The filter is parametrized on KeySize, which is the size of the key
34
* generated by each of hash functions used by the filter, in bits,
35
* and the type of object T being added and removed. T must implement
36
* a |uint32_t hash() const| method which returns a uint32_t hash key
37
* that will be used to generate the two separate hash functions for
38
* the Bloom filter. This hash key MUST be well-distributed for good
39
* results! KeySize is not allowed to be larger than 16.
41
* The filter uses exactly 2**KeySize bytes of memory. From now on we
42
* will refer to the memory used by the filter as M.
44
* The expected rate of incorrect "yes" answers depends on M and on
45
* the number N of objects in set Y. As long as N is small compared
46
* to M, the rate of such answers is expected to be approximately
47
* 4*(N/M)**2 for this filter. In practice, if Y has a few hundred
48
* elements then using a KeySize of 12 gives a reasonably low
49
* incorrect answer rate. A KeySize of 12 has the additional benefit
50
* of using exactly one page for the filter in typical hardware
54
template<unsigned KeySize, class T>
57
* A counting Bloom filter with 8-bit counters. For now we assume
58
* that having two hash functions is enough, but we may revisit that
61
* The filter uses an array with 2**KeySize entries.
63
* Assuming a well-distributed hash function, a Bloom filter with
64
* array size M containing N elements and
65
* using k hash function has expected false positive rate exactly
67
* $ (1 - (1 - 1/M)^{kN})^k $
69
* because each array slot has a
73
* chance of being 0, and the expected false positive rate is the
74
* probability that all of the k hash functions will hit a nonzero
77
* For reasonable assumptions (M large, kN large, which should both
78
* hold if we're worried about false positives) about M and kN this
79
* becomes approximately
81
* $$ (1 - \exp(-kN/M))^k $$
83
* For our special case of k == 2, that's $(1 - \exp(-2N/M))^2$,
86
* $$ N/M = -0.5 * \ln(1 - \sqrt(r)) $$
88
* where r is the false positive rate. This can be used to compute
89
* the desired KeySize for a given load N and false positive rate r.
91
* If N/M is assumed small, then the false positive rate can
92
* further be approximated as 4*N^2/M^2. So increasing KeySize by
93
* 1, which doubles M, reduces the false positive rate by about a
94
* factor of 4, and a false positive rate of 1% corresponds to
97
* What this means in practice is that for a few hundred keys using a
98
* KeySize of 12 gives false positive rates on the order of 0.25-4%.
100
* Similarly, using a KeySize of 10 would lead to a 4% false
101
* positive rate for N == 100 and to quite bad false positive
102
* rates for larger N.
106
MOZ_STATIC_ASSERT(KeySize <= keyShift, "KeySize too big");
108
// Should we have a custom operator new using calloc instead and
109
// require that we're allocated via the operator?
114
* Clear the filter. This should be done before reusing it, because
115
* just removing all items doesn't clear counters that hit the upper
121
* Add an item to the filter.
123
void add(const T* t);
126
* Remove an item from the filter.
128
void remove(const T* t);
131
* Check whether the filter might contain an item. This can
132
* sometimes return true even if the item is not in the filter,
133
* but will never return false for items that are actually in the
136
bool mightContain(const T* t) const;
139
* Methods for add/remove/contain when we already have a hash computed
141
void add(uint32_t hash);
142
void remove(uint32_t hash);
143
bool mightContain(uint32_t hash) const;
146
static const size_t arraySize = (1 << KeySize);
147
static const uint32_t keyMask = (1 << KeySize) - 1;
148
static const uint32_t keyShift = 16;
150
static uint32_t hash1(uint32_t hash) { return hash & keyMask; }
151
static uint32_t hash2(uint32_t hash) { return (hash >> keyShift) & keyMask; }
153
uint8_t& firstSlot(uint32_t hash) { return counters[hash1(hash)]; }
154
uint8_t& secondSlot(uint32_t hash) { return counters[hash2(hash)]; }
155
const uint8_t& firstSlot(uint32_t hash) const { return counters[hash1(hash)]; }
156
const uint8_t& secondSlot(uint32_t hash) const { return counters[hash2(hash)]; }
158
static bool full(const uint8_t& slot) { return slot == UINT8_MAX; }
160
uint8_t counters[arraySize];
163
template<unsigned KeySize, class T>
165
BloomFilter<KeySize, T>::clear()
167
memset(counters, 0, arraySize);
170
template<unsigned KeySize, class T>
172
BloomFilter<KeySize, T>::add(uint32_t hash)
174
uint8_t& slot1 = firstSlot(hash);
175
if (MOZ_LIKELY(!full(slot1)))
178
uint8_t& slot2 = secondSlot(hash);
179
if (MOZ_LIKELY(!full(slot2)))
183
template<unsigned KeySize, class T>
184
MOZ_ALWAYS_INLINE void
185
BloomFilter<KeySize, T>::add(const T* t)
187
uint32_t hash = t->hash();
191
template<unsigned KeySize, class T>
193
BloomFilter<KeySize, T>::remove(uint32_t hash)
195
// If the slots are full, we don't know whether we bumped them to be
196
// there when we added or not, so just leave them full.
197
uint8_t& slot1 = firstSlot(hash);
198
if (MOZ_LIKELY(!full(slot1)))
201
uint8_t& slot2 = secondSlot(hash);
202
if (MOZ_LIKELY(!full(slot2)))
206
template<unsigned KeySize, class T>
207
MOZ_ALWAYS_INLINE void
208
BloomFilter<KeySize, T>::remove(const T* t)
210
uint32_t hash = t->hash();
214
template<unsigned KeySize, class T>
215
MOZ_ALWAYS_INLINE bool
216
BloomFilter<KeySize, T>::mightContain(uint32_t hash) const
218
// Check that all the slots for this hash contain something
219
return firstSlot(hash) && secondSlot(hash);
222
template<unsigned KeySize, class T>
223
MOZ_ALWAYS_INLINE bool
224
BloomFilter<KeySize, T>::mightContain(const T* t) const
226
uint32_t hash = t->hash();
227
return mightContain(hash);
230
} // namespace mozilla
232
#endif /* mozilla_BloomFilter_h_ */