From 860a4db7b03378416e21714587e30a0f1144185d Mon Sep 17 00:00:00 2001 From: delocalizer Date: Thu, 27 Jul 2023 21:54:02 +1000 Subject: [PATCH] Optimize BloomFilter::add for applications with a low false-positive rate (sparse bit array) and mostly unique items: stop testing the bit array at the first unset bit, since a single unset bit already shows the item was not present, then set all k bits. This gives about a 15% speedup in practice. --- src/bloomfilter.cxx | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/src/bloomfilter.cxx b/src/bloomfilter.cxx index a0ca9a6..7fd91bb 100644 --- a/src/bloomfilter.cxx +++ b/src/bloomfilter.cxx @@ -26,27 +26,22 @@ bool BloomFilter::contains(const std::string& item) { uint64_t hashes[k_]; hash(item, hashes); for (size_t i = 0; i < k_; i++) { - uint64_t pos { mpow2_ ? hashes[i] & mask_ : hashes[i] % m_ }; - if (!bitset->test(pos)) { - return false; - } + if (!bitset->test(hashes[i])) { return false; } } return true; } // Add the item; return false if it was already present otherwise true. bool BloomFilter::add(const std::string& item) { - bool added { false }; uint64_t hashes[k_]; hash(item, hashes); for (size_t i = 0; i < k_; i++) { - uint64_t pos { mpow2_ ? hashes[i] & mask_ : hashes[i] % m_ }; - if (!bitset->test(pos)) { - bitset->set(pos); - added = true; - } + if (!bitset->test(hashes[i])) { goto add_the_item; } } - return added; + return false; +add_the_item: + for (size_t i = 0; i < k_; i++) { bitset->set(hashes[i]); } + return true; } // Return the estimated number of items stored. @@ -106,7 +101,7 @@ void BloomFilter::initialize(){ mpow2_ ? "bit mask" : "modulus"); } -// Generate k hash values for the item. +// Generate hash values and map onto k bitarray offsets. // // k linear combinations of just 2 independent hashes ("double hashing") has // the same asymptotic behaviour as k independent hashes. 
@@ -118,7 +113,7 @@ void BloomFilter::hash(const std::string& item, uint64_t* buf) { auto a { XXH3_64bits_withSeed(cstr, len, seed1_) }; auto b { XXH3_64bits_withSeed(cstr, len, seed2_) }; for (size_t i = 0; i < k_; i++) { - buf[i] = a; + buf[i] = mpow2_ ? a & mask_ : a % m_; a += b; b += i; }