123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256 |
- /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
- /* This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
- /*
- * A counting Bloom filter implementation. This allows consumers to
- * do fast probabilistic "is item X in set Y?" testing which will
- * never answer "no" when the correct answer is "yes" (but might
- * incorrectly answer "yes" when the correct answer is "no").
- */
- #ifndef mozilla_BloomFilter_h
- #define mozilla_BloomFilter_h
- #include "mozilla/Assertions.h"
- #include "mozilla/Likely.h"
- #include <stdint.h>
- #include <string.h>
- namespace mozilla {
- /*
- * This class implements a counting Bloom filter as described at
- * <http://en.wikipedia.org/wiki/Bloom_filter#Counting_filters>, with
- * 8-bit counters. This allows quick probabilistic answers to the
- * question "is object X in set Y?" where the contents of Y might not
- * be time-invariant. The probabilistic nature of the test means that
- * sometimes the answer will be "yes" when it should be "no". If the
- * answer is "no", then X is guaranteed not to be in Y.
- *
- * The filter is parametrized on KeySize, which is the size of the key
- * generated by each of hash functions used by the filter, in bits,
- * and the type of object T being added and removed. T must implement
- * a |uint32_t hash() const| method which returns a uint32_t hash key
- * that will be used to generate the two separate hash functions for
- * the Bloom filter. This hash key MUST be well-distributed for good
- * results! KeySize is not allowed to be larger than 16.
- *
- * The filter uses exactly 2**KeySize bytes of memory. From now on we
- * will refer to the memory used by the filter as M.
- *
- * The expected rate of incorrect "yes" answers depends on M and on
- * the number N of objects in set Y. As long as N is small compared
- * to M, the rate of such answers is expected to be approximately
- * 4*(N/M)**2 for this filter. In practice, if Y has a few hundred
- * elements then using a KeySize of 12 gives a reasonably low
- * incorrect answer rate. A KeySize of 12 has the additional benefit
- * of using exactly one page for the filter in typical hardware
- * configurations.
- */
- template<unsigned KeySize, class T>
- class BloomFilter
- {
- /*
- * A counting Bloom filter with 8-bit counters. For now we assume
- * that having two hash functions is enough, but we may revisit that
- * decision later.
- *
- * The filter uses an array with 2**KeySize entries.
- *
- * Assuming a well-distributed hash function, a Bloom filter with
- * array size M containing N elements and
- * using k hash function has expected false positive rate exactly
- *
- * $ (1 - (1 - 1/M)^{kN})^k $
- *
- * because each array slot has a
- *
- * $ (1 - 1/M)^{kN} $
- *
- * chance of being 0, and the expected false positive rate is the
- * probability that all of the k hash functions will hit a nonzero
- * slot.
- *
- * For reasonable assumptions (M large, kN large, which should both
- * hold if we're worried about false positives) about M and kN this
- * becomes approximately
- *
- * $$ (1 - \exp(-kN/M))^k $$
- *
- * For our special case of k == 2, that's $(1 - \exp(-2N/M))^2$,
- * or in other words
- *
- * $$ N/M = -0.5 * \ln(1 - \sqrt(r)) $$
- *
- * where r is the false positive rate. This can be used to compute
- * the desired KeySize for a given load N and false positive rate r.
- *
- * If N/M is assumed small, then the false positive rate can
- * further be approximated as 4*N^2/M^2. So increasing KeySize by
- * 1, which doubles M, reduces the false positive rate by about a
- * factor of 4, and a false positive rate of 1% corresponds to
- * about M/N == 20.
- *
- * What this means in practice is that for a few hundred keys using a
- * KeySize of 12 gives false positive rates on the order of 0.25-4%.
- *
- * Similarly, using a KeySize of 10 would lead to a 4% false
- * positive rate for N == 100 and to quite bad false positive
- * rates for larger N.
- */
- public:
- BloomFilter()
- {
- static_assert(KeySize <= kKeyShift, "KeySize too big");
- // Should we have a custom operator new using calloc instead and
- // require that we're allocated via the operator?
- clear();
- }
- /*
- * Clear the filter. This should be done before reusing it, because
- * just removing all items doesn't clear counters that hit the upper
- * bound.
- */
- void clear();
- /*
- * Add an item to the filter.
- */
- void add(const T* aValue);
- /*
- * Remove an item from the filter.
- */
- void remove(const T* aValue);
- /*
- * Check whether the filter might contain an item. This can
- * sometimes return true even if the item is not in the filter,
- * but will never return false for items that are actually in the
- * filter.
- */
- bool mightContain(const T* aValue) const;
- /*
- * Methods for add/remove/contain when we already have a hash computed
- */
- void add(uint32_t aHash);
- void remove(uint32_t aHash);
- bool mightContain(uint32_t aHash) const;
- private:
- static const size_t kArraySize = (1 << KeySize);
- static const uint32_t kKeyMask = (1 << KeySize) - 1;
- static const uint32_t kKeyShift = 16;
- static uint32_t hash1(uint32_t aHash)
- {
- return aHash & kKeyMask;
- }
- static uint32_t hash2(uint32_t aHash)
- {
- return (aHash >> kKeyShift) & kKeyMask;
- }
- uint8_t& firstSlot(uint32_t aHash)
- {
- return mCounters[hash1(aHash)];
- }
- uint8_t& secondSlot(uint32_t aHash)
- {
- return mCounters[hash2(aHash)];
- }
- const uint8_t& firstSlot(uint32_t aHash) const
- {
- return mCounters[hash1(aHash)];
- }
- const uint8_t& secondSlot(uint32_t aHash) const
- {
- return mCounters[hash2(aHash)];
- }
- static bool full(const uint8_t& aSlot) { return aSlot == UINT8_MAX; }
- uint8_t mCounters[kArraySize];
- };
- template<unsigned KeySize, class T>
- inline void
- BloomFilter<KeySize, T>::clear()
- {
- memset(mCounters, 0, kArraySize);
- }
- template<unsigned KeySize, class T>
- inline void
- BloomFilter<KeySize, T>::add(uint32_t aHash)
- {
- uint8_t& slot1 = firstSlot(aHash);
- if (MOZ_LIKELY(!full(slot1))) {
- ++slot1;
- }
- uint8_t& slot2 = secondSlot(aHash);
- if (MOZ_LIKELY(!full(slot2))) {
- ++slot2;
- }
- }
- template<unsigned KeySize, class T>
- MOZ_ALWAYS_INLINE void
- BloomFilter<KeySize, T>::add(const T* aValue)
- {
- uint32_t hash = aValue->hash();
- return add(hash);
- }
- template<unsigned KeySize, class T>
- inline void
- BloomFilter<KeySize, T>::remove(uint32_t aHash)
- {
- // If the slots are full, we don't know whether we bumped them to be
- // there when we added or not, so just leave them full.
- uint8_t& slot1 = firstSlot(aHash);
- if (MOZ_LIKELY(!full(slot1))) {
- --slot1;
- }
- uint8_t& slot2 = secondSlot(aHash);
- if (MOZ_LIKELY(!full(slot2))) {
- --slot2;
- }
- }
- template<unsigned KeySize, class T>
- MOZ_ALWAYS_INLINE void
- BloomFilter<KeySize, T>::remove(const T* aValue)
- {
- uint32_t hash = aValue->hash();
- remove(hash);
- }
- template<unsigned KeySize, class T>
- MOZ_ALWAYS_INLINE bool
- BloomFilter<KeySize, T>::mightContain(uint32_t aHash) const
- {
- // Check that all the slots for this hash contain something
- return firstSlot(aHash) && secondSlot(aHash);
- }
- template<unsigned KeySize, class T>
- MOZ_ALWAYS_INLINE bool
- BloomFilter<KeySize, T>::mightContain(const T* aValue) const
- {
- uint32_t hash = aValue->hash();
- return mightContain(hash);
- }
- } // namespace mozilla
- #endif /* mozilla_BloomFilter_h */
|