BloomFilter.h 7.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256
  1. /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
  2. /* This Source Code Form is subject to the terms of the Mozilla Public
  3. * License, v. 2.0. If a copy of the MPL was not distributed with this
  4. * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
  5. /*
  6. * A counting Bloom filter implementation. This allows consumers to
  7. * do fast probabilistic "is item X in set Y?" testing which will
  8. * never answer "no" when the correct answer is "yes" (but might
  9. * incorrectly answer "yes" when the correct answer is "no").
  10. */
  11. #ifndef mozilla_BloomFilter_h
  12. #define mozilla_BloomFilter_h
  13. #include "mozilla/Assertions.h"
  14. #include "mozilla/Likely.h"
  15. #include <stdint.h>
  16. #include <string.h>
  17. namespace mozilla {
  18. /*
  19. * This class implements a counting Bloom filter as described at
  20. * <http://en.wikipedia.org/wiki/Bloom_filter#Counting_filters>, with
  21. * 8-bit counters. This allows quick probabilistic answers to the
  22. * question "is object X in set Y?" where the contents of Y might not
  23. * be time-invariant. The probabilistic nature of the test means that
  24. * sometimes the answer will be "yes" when it should be "no". If the
  25. * answer is "no", then X is guaranteed not to be in Y.
  26. *
  27. * The filter is parametrized on KeySize, which is the size of the key
  28. * generated by each of hash functions used by the filter, in bits,
  29. * and the type of object T being added and removed. T must implement
  30. * a |uint32_t hash() const| method which returns a uint32_t hash key
  31. * that will be used to generate the two separate hash functions for
  32. * the Bloom filter. This hash key MUST be well-distributed for good
  33. * results! KeySize is not allowed to be larger than 16.
  34. *
  35. * The filter uses exactly 2**KeySize bytes of memory. From now on we
  36. * will refer to the memory used by the filter as M.
  37. *
  38. * The expected rate of incorrect "yes" answers depends on M and on
  39. * the number N of objects in set Y. As long as N is small compared
  40. * to M, the rate of such answers is expected to be approximately
  41. * 4*(N/M)**2 for this filter. In practice, if Y has a few hundred
  42. * elements then using a KeySize of 12 gives a reasonably low
  43. * incorrect answer rate. A KeySize of 12 has the additional benefit
  44. * of using exactly one page for the filter in typical hardware
  45. * configurations.
  46. */
  47. template<unsigned KeySize, class T>
  48. class BloomFilter
  49. {
  50. /*
  51. * A counting Bloom filter with 8-bit counters. For now we assume
  52. * that having two hash functions is enough, but we may revisit that
  53. * decision later.
  54. *
  55. * The filter uses an array with 2**KeySize entries.
  56. *
  57. * Assuming a well-distributed hash function, a Bloom filter with
  58. * array size M containing N elements and
  59. * using k hash function has expected false positive rate exactly
  60. *
  61. * $ (1 - (1 - 1/M)^{kN})^k $
  62. *
  63. * because each array slot has a
  64. *
  65. * $ (1 - 1/M)^{kN} $
  66. *
  67. * chance of being 0, and the expected false positive rate is the
  68. * probability that all of the k hash functions will hit a nonzero
  69. * slot.
  70. *
  71. * For reasonable assumptions (M large, kN large, which should both
  72. * hold if we're worried about false positives) about M and kN this
  73. * becomes approximately
  74. *
  75. * $$ (1 - \exp(-kN/M))^k $$
  76. *
  77. * For our special case of k == 2, that's $(1 - \exp(-2N/M))^2$,
  78. * or in other words
  79. *
  80. * $$ N/M = -0.5 * \ln(1 - \sqrt(r)) $$
  81. *
  82. * where r is the false positive rate. This can be used to compute
  83. * the desired KeySize for a given load N and false positive rate r.
  84. *
  85. * If N/M is assumed small, then the false positive rate can
  86. * further be approximated as 4*N^2/M^2. So increasing KeySize by
  87. * 1, which doubles M, reduces the false positive rate by about a
  88. * factor of 4, and a false positive rate of 1% corresponds to
  89. * about M/N == 20.
  90. *
  91. * What this means in practice is that for a few hundred keys using a
  92. * KeySize of 12 gives false positive rates on the order of 0.25-4%.
  93. *
  94. * Similarly, using a KeySize of 10 would lead to a 4% false
  95. * positive rate for N == 100 and to quite bad false positive
  96. * rates for larger N.
  97. */
  98. public:
  99. BloomFilter()
  100. {
  101. static_assert(KeySize <= kKeyShift, "KeySize too big");
  102. // Should we have a custom operator new using calloc instead and
  103. // require that we're allocated via the operator?
  104. clear();
  105. }
  106. /*
  107. * Clear the filter. This should be done before reusing it, because
  108. * just removing all items doesn't clear counters that hit the upper
  109. * bound.
  110. */
  111. void clear();
  112. /*
  113. * Add an item to the filter.
  114. */
  115. void add(const T* aValue);
  116. /*
  117. * Remove an item from the filter.
  118. */
  119. void remove(const T* aValue);
  120. /*
  121. * Check whether the filter might contain an item. This can
  122. * sometimes return true even if the item is not in the filter,
  123. * but will never return false for items that are actually in the
  124. * filter.
  125. */
  126. bool mightContain(const T* aValue) const;
  127. /*
  128. * Methods for add/remove/contain when we already have a hash computed
  129. */
  130. void add(uint32_t aHash);
  131. void remove(uint32_t aHash);
  132. bool mightContain(uint32_t aHash) const;
  133. private:
  134. static const size_t kArraySize = (1 << KeySize);
  135. static const uint32_t kKeyMask = (1 << KeySize) - 1;
  136. static const uint32_t kKeyShift = 16;
  137. static uint32_t hash1(uint32_t aHash)
  138. {
  139. return aHash & kKeyMask;
  140. }
  141. static uint32_t hash2(uint32_t aHash)
  142. {
  143. return (aHash >> kKeyShift) & kKeyMask;
  144. }
  145. uint8_t& firstSlot(uint32_t aHash)
  146. {
  147. return mCounters[hash1(aHash)];
  148. }
  149. uint8_t& secondSlot(uint32_t aHash)
  150. {
  151. return mCounters[hash2(aHash)];
  152. }
  153. const uint8_t& firstSlot(uint32_t aHash) const
  154. {
  155. return mCounters[hash1(aHash)];
  156. }
  157. const uint8_t& secondSlot(uint32_t aHash) const
  158. {
  159. return mCounters[hash2(aHash)];
  160. }
  161. static bool full(const uint8_t& aSlot) { return aSlot == UINT8_MAX; }
  162. uint8_t mCounters[kArraySize];
  163. };
  164. template<unsigned KeySize, class T>
  165. inline void
  166. BloomFilter<KeySize, T>::clear()
  167. {
  168. memset(mCounters, 0, kArraySize);
  169. }
  170. template<unsigned KeySize, class T>
  171. inline void
  172. BloomFilter<KeySize, T>::add(uint32_t aHash)
  173. {
  174. uint8_t& slot1 = firstSlot(aHash);
  175. if (MOZ_LIKELY(!full(slot1))) {
  176. ++slot1;
  177. }
  178. uint8_t& slot2 = secondSlot(aHash);
  179. if (MOZ_LIKELY(!full(slot2))) {
  180. ++slot2;
  181. }
  182. }
  183. template<unsigned KeySize, class T>
  184. MOZ_ALWAYS_INLINE void
  185. BloomFilter<KeySize, T>::add(const T* aValue)
  186. {
  187. uint32_t hash = aValue->hash();
  188. return add(hash);
  189. }
  190. template<unsigned KeySize, class T>
  191. inline void
  192. BloomFilter<KeySize, T>::remove(uint32_t aHash)
  193. {
  194. // If the slots are full, we don't know whether we bumped them to be
  195. // there when we added or not, so just leave them full.
  196. uint8_t& slot1 = firstSlot(aHash);
  197. if (MOZ_LIKELY(!full(slot1))) {
  198. --slot1;
  199. }
  200. uint8_t& slot2 = secondSlot(aHash);
  201. if (MOZ_LIKELY(!full(slot2))) {
  202. --slot2;
  203. }
  204. }
  205. template<unsigned KeySize, class T>
  206. MOZ_ALWAYS_INLINE void
  207. BloomFilter<KeySize, T>::remove(const T* aValue)
  208. {
  209. uint32_t hash = aValue->hash();
  210. remove(hash);
  211. }
  212. template<unsigned KeySize, class T>
  213. MOZ_ALWAYS_INLINE bool
  214. BloomFilter<KeySize, T>::mightContain(uint32_t aHash) const
  215. {
  216. // Check that all the slots for this hash contain something
  217. return firstSlot(aHash) && secondSlot(aHash);
  218. }
  219. template<unsigned KeySize, class T>
  220. MOZ_ALWAYS_INLINE bool
  221. BloomFilter<KeySize, T>::mightContain(const T* aValue) const
  222. {
  223. uint32_t hash = aValue->hash();
  224. return mightContain(hash);
  225. }
  226. } // namespace mozilla
  227. #endif /* mozilla_BloomFilter_h */