From 7814eb3c20fdd59ead51867fb767797fb2a8b9bd Mon Sep 17 00:00:00 2001
From: Enrico Seiler
Date: Thu, 11 Jul 2024 14:36:34 +0200
Subject: [PATCH] [FEATURE] emplace_exists

---
Reviewer notes (git am ignores text between the three-dash line and the diffstat):
  * Restored the template syntax that had been stripped from this patch:
    `template <bool check_exists>` on emplace_impl / emplace_benchmark_impl and
    the explicit `<false>` / `<true>` arguments at their call sites. Without
    them `check_exists` is undeclared and both wrappers make the same call.
  * Whitespace was collapsed in the copy this was rebuilt from; indentation and
    blank lines are reconstructed from the hunk line counts. TODO: confirm
    against the base revision (or apply with `git apply --ignore-whitespace`).
  * Still incomplete: the author's e-mail address in the From: header and the
    parameter list of the trailing `template` context line in the first header
    hunk could not be recovered from the patch itself.

 include/hibf/interleaved_bloom_filter.hpp     | 20 ++++++++------
 src/interleaved_bloom_filter.cpp              | 27 +++++++++++++++++--
 .../interleaved_bloom_filter_benchmark.cpp    | 22 +++++++++++++--
 .../hibf/interleaved_bloom_filter_test.cpp    | 20 ++++++++++++++
 4 files changed, 77 insertions(+), 12 deletions(-)

diff --git a/include/hibf/interleaved_bloom_filter.hpp b/include/hibf/interleaved_bloom_filter.hpp
index f5be675e..e8af83ca 100644
--- a/include/hibf/interleaved_bloom_filter.hpp
+++ b/include/hibf/interleaved_bloom_filter.hpp
@@ -188,6 +188,10 @@ class interleaved_bloom_filter : private seqan::hibf::bit_vector
         return h;
     }
 
+    //!\brief Helper function to reduce code-duplication between emplace and emplace_exists.
+    template <bool check_exists>
+    inline auto emplace_impl(size_t const value, bin_index const bin) noexcept;
+
 public:
     class membership_agent_type; // documented upon definition below
     template
@@ -231,8 +235,6 @@ class interleaved_bloom_filter : private seqan::hibf::bit_vector
      * \param[in] value The raw numeric value to process.
      * \param[in] bin The bin index to insert into.
      *
-     * \attention This function is only available for **uncompressed** Interleaved Bloom Filters.
-     *
      * \details
      *
      * ### Example
@@ -241,11 +243,17 @@ class interleaved_bloom_filter : private seqan::hibf::bit_vector
      */
     void emplace(size_t const value, bin_index const bin) noexcept;
 
+    /*!\brief Inserts a value into a specific bin and returns whether the value already existed.
+     * \param[in] value The raw numeric value to process.
+     * \param[in] bin The bin index to insert into.
+     * \returns `true` if the value already existed, `false` otherwise.
+     * \sa seqan::hibf::interleaved_bloom_filter::emplace
+     */
+    [[nodiscard]] bool emplace_exists(size_t const value, bin_index const bin) noexcept;
+
     /*!\brief Clears a specific bin.
      * \param[in] bin The bin index to clear.
      *
-     * \attention This function is only available for **uncompressed** Interleaved Bloom Filters.
-     *
      * \details
      *
      * ### Example
@@ -259,8 +267,6 @@ class interleaved_bloom_filter : private seqan::hibf::bit_vector
      * seqan::hibf::bin_index.
      * \param[in] bin_range The range of bins to clear.
      *
-     * \attention This function is only available for **uncompressed** Interleaved Bloom Filters.
-     *
      * \details
      *
      * ### Example
@@ -287,8 +293,6 @@ class interleaved_bloom_filter : private seqan::hibf::bit_vector
      * \param[in] new_bins_ The new number of bins.
      * \throws std::invalid_argument If passed number of bins is smaller than current number of bins.
      *
-     * \attention This function is only available for **uncompressed** Interleaved Bloom Filters.
-     * \attention The new number of bins must be greater or equal to the current number of bins.
      * \attention This function invalidates all seqan::hibf::interleaved_bloom_filter::membership_agent_type constructed for
      * this Interleaved Bloom Filter.
      *
diff --git a/src/interleaved_bloom_filter.cpp b/src/interleaved_bloom_filter.cpp
index f60969fa..f55f686b 100644
--- a/src/interleaved_bloom_filter.cpp
+++ b/src/interleaved_bloom_filter.cpp
@@ -85,16 +85,39 @@ interleaved_bloom_filter::interleaved_bloom_filter(config & configuration) :
     }
 }
 
-void interleaved_bloom_filter::emplace(size_t const value, bin_index const bin) noexcept
+template <bool check_exists>
+inline auto interleaved_bloom_filter::emplace_impl(size_t const value, bin_index const bin) noexcept
 {
     assert(bin.value < bins);
+
+    [[maybe_unused]] bool exists{true};
+
     for (size_t i = 0; i < hash_funs; ++i)
     {
         size_t idx = hash_and_fit(value, hash_seeds[i]);
         idx += bin.value;
         assert(idx < size());
-        (*this)[idx] = 1;
+
+        // Constructing the reference twice for emplace_exists would impact performance.
+        // No difference for emplace.
+        seqan::hibf::bit_vector::reference bit_reference{(*this)[idx]};
+        if constexpr (check_exists)
+            exists &= bit_reference;
+        bit_reference = 1;
     };
+
+    if constexpr (check_exists)
+        return exists;
+};
+
+void interleaved_bloom_filter::emplace(size_t const value, bin_index const bin) noexcept
+{
+    return emplace_impl<false>(value, bin);
+}
+
+bool interleaved_bloom_filter::emplace_exists(size_t const value, bin_index const bin) noexcept
+{
+    return emplace_impl<true>(value, bin);
 }
 
 void interleaved_bloom_filter::clear(bin_index const bin) noexcept
diff --git a/test/performance/ibf/interleaved_bloom_filter_benchmark.cpp b/test/performance/ibf/interleaved_bloom_filter_benchmark.cpp
index 1edaea89..5516583a 100644
--- a/test/performance/ibf/interleaved_bloom_filter_benchmark.cpp
+++ b/test/performance/ibf/interleaved_bloom_filter_benchmark.cpp
@@ -93,7 +93,8 @@ inline benchmark::Counter elements_per_second(size_t const count)
     return benchmark::Counter(count, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::OneK::kIs1000);
 }
 
-void emplace_benchmark(::benchmark::State & state)
+template <bool check_exists>
+inline void emplace_benchmark_impl(::benchmark::State & state)
 {
     auto const & [values, original_ibf] = set_up(state);
 
@@ -107,17 +108,33 @@ void emplace_benchmark(::benchmark::State & state)
     for (auto _ : state)
     {
         size_t bin_index = 0u;
+        [[maybe_unused]] size_t result{};
         for (auto && chunk : seqan::stl::views::chunk(values, chunk_size))
         {
             for (auto value : chunk)
-                ibf.emplace(value, seqan::hibf::bin_index{bin_index});
+                if constexpr (check_exists)
+                    result += ibf.emplace_exists(value, seqan::hibf::bin_index{bin_index});
+                else
+                    ibf.emplace(value, seqan::hibf::bin_index{bin_index});
             ++bin_index;
         }
+        if constexpr (check_exists)
+            benchmark::DoNotOptimize(result);
     }
 
     state.counters["elements"] = elements_per_second(number_of_elements);
 }
 
+void emplace_benchmark(::benchmark::State & state)
+{
+    emplace_benchmark_impl<false>(state);
+}
+
+void emplace_exists_benchmark(::benchmark::State & state)
+{
+    emplace_benchmark_impl<true>(state);
+}
+
 void clear_benchmark(::benchmark::State & state)
 {
     auto const & [values, original_ibf] = set_up(state);
@@ -193,6 +210,7 @@ void bulk_count_benchmark(::benchmark::State & state)
 }
 
 BENCHMARK(emplace_benchmark)->RangeMultiplier(2)->Range(64, 1024);
+BENCHMARK(emplace_exists_benchmark)->RangeMultiplier(2)->Range(64, 1024);
 BENCHMARK(clear_benchmark)->RangeMultiplier(2)->Range(64, 1024);
 BENCHMARK(clear_range_benchmark)->RangeMultiplier(2)->Range(64, 1024);
 BENCHMARK(bulk_contains_benchmark)->RangeMultiplier(2)->Range(64, 1024);
diff --git a/test/unit/hibf/interleaved_bloom_filter_test.cpp b/test/unit/hibf/interleaved_bloom_filter_test.cpp
index 4b2987fb..2144d7f3 100644
--- a/test/unit/hibf/interleaved_bloom_filter_test.cpp
+++ b/test/unit/hibf/interleaved_bloom_filter_test.cpp
@@ -169,6 +169,26 @@ TEST(ibf_test, emplace)
     }
 }
 
+TEST(ibf_test, emplace_exists)
+{
+    // 1. Construct and emplace
+    seqan::hibf::interleaved_bloom_filter ibf{seqan::hibf::bin_count{128u},
+                                              seqan::hibf::bin_size{512},
+                                              seqan::hibf::hash_function_count{2u}};
+
+    for (size_t bin_idx : std::views::iota(0, 64))
+        for (size_t hash : std::views::iota(0, 64))
+            ibf.emplace(hash, seqan::hibf::bin_index{bin_idx});
+
+    // 2. Test for correctness
+    for (size_t bin_idx : std::views::iota(0, 64))
+        for (size_t hash : std::views::iota(0, 64))
+            ASSERT_TRUE(ibf.emplace_exists(hash, seqan::hibf::bin_index{bin_idx}));
+
+    for (size_t bin_idx : std::views::iota(64, 128))
+        ASSERT_FALSE(ibf.emplace_exists(0u, seqan::hibf::bin_index{bin_idx}));
+}
+
 TEST(ibf_test, clear)
 {
     // 1. Construct and emplace