From e7eb1e35f736e672e0c94dd297d134cf9e4d37c4 Mon Sep 17 00:00:00 2001 From: Enrico Seiler Date: Tue, 3 Dec 2024 19:04:20 +0100 Subject: [PATCH 1/2] [FEATURE] Empty bins in layout --- include/hibf/config.hpp | 30 +++++++++- include/hibf/layout/hierarchical_binning.hpp | 14 ++--- include/hibf/misc/subtract_empty_bins.hpp | 33 +++++++++++ src/config.cpp | 4 ++ src/layout/hierarchical_binning.cpp | 13 ++++- test/unit/hibf/config_test.cpp | 58 ++++++++++++++++++- .../hibf/layout/hierarchical_binning_test.cpp | 42 ++++++++++++++ 7 files changed, 181 insertions(+), 13 deletions(-) create mode 100644 include/hibf/misc/subtract_empty_bins.hpp diff --git a/include/hibf/config.hpp b/include/hibf/config.hpp index cbf6d224..20b0a322 100644 --- a/include/hibf/config.hpp +++ b/include/hibf/config.hpp @@ -40,6 +40,7 @@ namespace seqan::hibf * | General | seqan::hibf::config::threads | 1 | [RECOMMENDED_TO_ADAPT] | * | Layout | seqan::hibf::config::sketch_bits | 12 | | * | Layout | seqan::hibf::config::tmax | 0 | 0 indicates unset | + * | Layout | seqan::hibf::config::empty_bin_fraction | 0.0 | Dynamic Layout | * | Layout | seqan::hibf::config::max_rearrangement_ratio | 0.5 | | * | Layout | seqan::hibf::config::alpha | 1.2 | | * | Layout | seqan::hibf::config::disable_estimate_union | false | | @@ -230,6 +231,23 @@ struct config */ size_t tmax{}; + /*!\brief The percentage of empty bins in the layout. + * + * \note Do not set this option unless you are developing an application that requires empty technical bins. + * + * Certain applications, e.g., dynamic indices, require empty technical bins in the layout. This option allows you + * to specify the fraction of tmax that should be empty bins. + * The empty bins will be present in each IBF of the generated layout. + * + * For example, if `tmax` is `64` and `empty_bin_fraction` is `0.10`, then 6 bins will be empty, i.e., not + * designated to contain any data. The resulting layout will be very similar to a layout with `tmax` set to `58` + * and no empty bins. + * + * Value must be in range [0.0,1.0). + * Recommendation: default value (0.0). This option is not recommended for general use. + */ + double empty_bin_fraction{}; + /*!\brief A scaling factor to influence the amount of merged bins produced by the layout algorithm. * * The layout algorithm optimizes the space consumption of the resulting HIBF, but currently has no means of @@ -302,6 +320,7 @@ struct config * * seqan::hibf::config::threads must be greater than `0`. * * seqan::hibf::config::sketch_bits must be in `[5,32]`. * * seqan::hibf::config::tmax must be at most `18446744073709551552`. + * * seqan::hibf::config::empty_bin_fraction must be in `[0.0,1.0)`. * * seqan::hibf::config::alpha must be positive. * * seqan::hibf::config::max_rearrangement_ratio must be in `[0.0,1.0]`. * @@ -324,6 +343,7 @@ struct config threads == other.threads && sketch_bits == other.sketch_bits && tmax == other.tmax && + empty_bin_fraction == other.empty_bin_fraction && alpha == other.alpha && max_rearrangement_ratio == other.max_rearrangement_ratio && disable_estimate_union == other.disable_estimate_union && @@ -334,11 +354,13 @@ struct config private: friend class cereal::access; + static constexpr uint32_t version{2}; + template void serialize(archive_t & archive) { - uint32_t version{1}; - archive(CEREAL_NVP(version)); + uint32_t parsed_version{version}; + archive(cereal::make_nvp("version", parsed_version)); archive(CEREAL_NVP(number_of_user_bins)); archive(CEREAL_NVP(number_of_hash_functions)); @@ -348,6 +370,10 @@ struct config archive(CEREAL_NVP(sketch_bits)); archive(CEREAL_NVP(tmax)); + + if (parsed_version > 1u) + archive(CEREAL_NVP(empty_bin_fraction)); + archive(CEREAL_NVP(alpha)); archive(CEREAL_NVP(max_rearrangement_ratio)); archive(CEREAL_NVP(disable_estimate_union)); diff --git a/include/hibf/layout/hierarchical_binning.hpp b/include/hibf/layout/hierarchical_binning.hpp index 551173d4..0a509eb3 100644 --- a/include/hibf/layout/hierarchical_binning.hpp +++ b/include/hibf/layout/hierarchical_binning.hpp @@ -10,9 +10,10 @@ #include // for pair #include // for vector -#include // for config -#include // for data_store -#include // for HIBF_WORKAROUND_GCC_BOGUS_MEMCPY +#include // for config +#include // for data_store +#include // for subtract_empty_bins +#include // for HIBF_WORKAROUND_GCC_BOGUS_MEMCPY namespace seqan::hibf::layout { @@ -68,10 +69,9 @@ class hierarchical_binning config{config_}, data{std::addressof(data_)}, num_user_bins{data->positions.size()}, - num_technical_bins{data->previous.empty() ? config.tmax : needed_technical_bins(num_user_bins)} - { - assert(data != nullptr); - } + num_technical_bins{data->previous.empty() ? subtract_empty_bins(config.tmax, config.empty_bin_fraction) + : needed_technical_bins(num_user_bins)} + {} //!\brief Executes the hierarchical binning algorithm and layouts user bins into technical bins. size_t execute(); diff --git a/include/hibf/misc/subtract_empty_bins.hpp b/include/hibf/misc/subtract_empty_bins.hpp new file mode 100644 index 00000000..df828dc0 --- /dev/null +++ b/include/hibf/misc/subtract_empty_bins.hpp @@ -0,0 +1,33 @@ +// SPDX-FileCopyrightText: 2006-2024, Knut Reinert & Freie Universität Berlin +// SPDX-FileCopyrightText: 2016-2024, Knut Reinert & MPI für molekulare Genetik +// SPDX-License-Identifier: BSD-3-Clause + +#pragma once + +#include +#include +#include + +#include + +namespace seqan::hibf +{ + +/*!\brief Returns the number of technical bins available for use. + * \param[in] tmax The total number of bins. + * \param[in] fraction The fraction of the total number of bins that should be empty. + * \ingroup hibf + * \sa https://godbolt.org/z/cMjbM39vj + */ +[[nodiscard]] constexpr size_t subtract_empty_bins(size_t const tmax, double const fraction) noexcept +{ + // There must be at least 2 technical bins available without empty bins. + // Otherwise, there would only ever be one technical bin available. + if (fraction == 0.0 || tmax <= 2u) + return tmax; + + size_t const number_of_empty_bins = std::clamp(tmax * fraction, 1, tmax - 2); + return tmax - number_of_empty_bins; +} + +} // namespace seqan::hibf diff --git a/src/config.cpp b/src/config.cpp index 591769ac..8a649b20 100644 --- a/src/config.cpp +++ b/src/config.cpp @@ -17,6 +17,7 @@ #include // for config #include // for meta_header, meta_hibf_config_end, meta_hibf_config_start #include // for next_multiple_of_64 +#include // for subtract_empty_bins namespace seqan::hibf { @@ -113,6 +114,9 @@ void config::validate_and_set_defaults() << "anyway, so we increased your number of technical bins to " << tmax << ".\n"; } + if (empty_bin_fraction < 0.0 || empty_bin_fraction >= 1.0) + throw std::invalid_argument{"[HIBF CONFIG ERROR] config::empty_bin_fraction must be in [0.0,1.0)."}; + if (alpha < 0.0) throw std::invalid_argument{"[HIBF CONFIG ERROR] config::alpha must be positive."}; diff --git a/src/layout/hierarchical_binning.cpp b/src/layout/hierarchical_binning.cpp index 6472219c..bed6bfd3 100644 --- a/src/layout/hierarchical_binning.cpp +++ b/src/layout/hierarchical_binning.cpp @@ -19,6 +19,7 @@ #include // for simple_binning #include // for divide_and_ceil #include // for next_multiple_of_64 +#include // for subtract_empty_bins #include // for concurrent_timer #include // for HIBF_WORKAROUND_GCC_BOGUS_MEMCPY #include // for hyperloglog @@ -79,7 +80,8 @@ size_t hierarchical_binning::execute() [[nodiscard]] size_t hierarchical_binning::needed_technical_bins(size_t const requested_num_ub) const { - return std::min(next_multiple_of_64(requested_num_ub), config.tmax); + size_t const needed = std::min(next_multiple_of_64(requested_num_ub), config.tmax); + return subtract_empty_bins(needed, config.empty_bin_fraction); } [[nodiscard]] size_t hierarchical_binning::max_merge_levels(size_t const num_ubs_in_merge) const @@ -406,8 +408,10 @@ void hierarchical_binning::update_libf_data(data_store & libf_data, size_t const size_t hierarchical_binning::add_lower_level(data_store & libf_data) const { + size_t const number_of_user_bins = libf_data.positions.size(); + // now do the binning for the low-level IBF: - if (libf_data.positions.size() > config.tmax) + if (number_of_user_bins > config.tmax) { // recursively call hierarchical binning if there are still too many UBs return hierarchical_binning{libf_data, config}.execute(); // return id of maximum technical bin @@ -415,7 +419,10 @@ size_t hierarchical_binning::add_lower_level(data_store & libf_data) const else { // use simple binning to distribute remaining UBs - return simple_binning{libf_data, 0}.execute(); // return id of maximum technical bin + // Simple binning is not bound by config.tmax + size_t const num_user_bins = next_multiple_of_64(number_of_user_bins); + size_t const number_of_technical_bins = subtract_empty_bins(num_user_bins, config.empty_bin_fraction); + return simple_binning{libf_data, number_of_technical_bins}.execute(); // return id of maximum technical bin } } diff --git a/test/unit/hibf/config_test.cpp b/test/unit/hibf/config_test.cpp index c0d0c052..6eb355be 100644 --- a/test/unit/hibf/config_test.cpp +++ b/test/unit/hibf/config_test.cpp @@ -37,7 +37,7 @@ TEST(config_test, write_to) std::string const expected_file{"@HIBF_CONFIG\n" "@{\n" "@ \"hibf_config\": {\n" - "@ \"version\": 1,\n" + "@ \"version\": 2,\n" "@ \"number_of_user_bins\": 123456789,\n" "@ \"number_of_hash_functions\": 4,\n" "@ \"maximum_fpr\": 0.0001,\n" @@ -45,6 +45,7 @@ TEST(config_test, write_to) "@ \"threads\": 31,\n" "@ \"sketch_bits\": 8,\n" "@ \"tmax\": 128,\n" + "@ \"empty_bin_fraction\": 0.0,\n" "@ \"alpha\": 1.0,\n" "@ \"max_rearrangement_ratio\": 0.333,\n" "@ \"disable_estimate_union\": true,\n" @@ -57,6 +58,45 @@ TEST(config_test, write_to) } TEST(config_test, read_from) +{ + std::stringstream ss{"@HIBF_CONFIG\n" + "@{\n" + "@ \"hibf_config\": {\n" + "@ \"version\": 2,\n" + "@ \"number_of_user_bins\": 123456789,\n" + "@ \"number_of_hash_functions\": 4,\n" + "@ \"maximum_fpr\": 0.0001,\n" + "@ \"relaxed_fpr\": 0.3,\n" + "@ \"threads\": 31,\n" + "@ \"sketch_bits\": 8,\n" + "@ \"tmax\": 128,\n" + "@ \"empty_bin_fraction\": 0.5,\n" + "@ \"alpha\": 1.0,\n" + "@ \"max_rearrangement_ratio\": 0.333,\n" + "@ \"disable_estimate_union\": true,\n" + "@ \"disable_rearrangement\": false\n" + "@ }\n" + "@}\n" + "@HIBF_CONFIG_END\n"}; + + seqan::hibf::config configuration; + configuration.read_from(ss); + + EXPECT_EQ(configuration.number_of_user_bins, 123456789); + EXPECT_EQ(configuration.number_of_hash_functions, 4); + EXPECT_EQ(configuration.maximum_fpr, 0.0001); + EXPECT_EQ(configuration.relaxed_fpr, 0.3); + EXPECT_EQ(configuration.threads, 31); + EXPECT_EQ(configuration.sketch_bits, 8); + EXPECT_EQ(configuration.tmax, 128); + EXPECT_EQ(configuration.empty_bin_fraction, 0.5); + EXPECT_EQ(configuration.alpha, 1.0); + EXPECT_EQ(configuration.max_rearrangement_ratio, 0.333); + EXPECT_EQ(configuration.disable_estimate_union, true); + EXPECT_EQ(configuration.disable_rearrangement, false); +} + +TEST(config_test, read_from_v1) { std::stringstream ss{"@HIBF_CONFIG\n" "@{\n" @@ -87,6 +127,7 @@ TEST(config_test, read_from) EXPECT_EQ(configuration.threads, 31); EXPECT_EQ(configuration.sketch_bits, 8); EXPECT_EQ(configuration.tmax, 128); + EXPECT_EQ(configuration.empty_bin_fraction, 0.0); EXPECT_EQ(configuration.alpha, 1.0); EXPECT_EQ(configuration.max_rearrangement_ratio, 0.333); EXPECT_EQ(configuration.disable_estimate_union, true); @@ -293,6 +334,21 @@ TEST(config_test, validate_and_set_defaults) "increased your number of technical bins to 64.\n"); } + // empty_bin_fraction must be in [0.0,1.0) + { + seqan::hibf::config configuration{.input_fn = dummy_input_fn, + .number_of_user_bins = 1u, + .empty_bin_fraction = -0.1}; + EXPECT_THROW_MSG(configuration.validate_and_set_defaults(), + std::invalid_argument, + "[HIBF CONFIG ERROR] config::empty_bin_fraction must be in [0.0,1.0)."); + + configuration.empty_bin_fraction = 1.0; + EXPECT_THROW_MSG(configuration.validate_and_set_defaults(), + std::invalid_argument, + "[HIBF CONFIG ERROR] config::empty_bin_fraction must be in [0.0,1.0)."); + } + // alpha must be positive { seqan::hibf::config configuration{.input_fn = dummy_input_fn, .number_of_user_bins = 1u, .alpha = -0.1}; diff --git a/test/unit/hibf/layout/hierarchical_binning_test.cpp b/test/unit/hibf/layout/hierarchical_binning_test.cpp index 8a94d304..4b0e49aa 100644 --- a/test/unit/hibf/layout/hierarchical_binning_test.cpp +++ b/test/unit/hibf/layout/hierarchical_binning_test.cpp @@ -68,6 +68,48 @@ TEST(hierarchical_binning_test, small_example) EXPECT_RANGE_EQ(hibf_layout.user_bins, expected_user_bins); } +TEST(hierarchical_binning_test, small_example_with_empty_bins) +{ + seqan::hibf::config config; + config.tmax = 5; + config.disable_estimate_union = true; // also disables rearrangement + config.empty_bin_fraction = 0.001; + + seqan::hibf::layout::layout hibf_layout{}; + std::vector kmer_counts{500, 1000, 500, 500, 500, 500, 500, 500}; + + seqan::hibf::layout::data_store data{.hibf_layout = &hibf_layout, .kmer_counts = &kmer_counts}; + + data.fpr_correction = + seqan::hibf::layout::compute_fpr_correction({.fpr = 0.05, .hash_count = 2, .t_max = config.tmax}); + data.relaxed_fpr_correction = + seqan::hibf::layout::compute_relaxed_fpr_correction({.fpr = 0.05, .relaxed_fpr = 0.3, .hash_count = 2}); + + seqan::hibf::layout::hierarchical_binning algo{data, config}; + EXPECT_EQ(algo.execute(), 3u); // #HIGH_LEVEL_IBF max_bin_id:3 + + // The results should almost be the same as in small_example + // The max bins might differ because of the empty bin fraction + // The layout structure should be the same, except split bins differing by +-1 + + std::vector expected_max_bins{{{2}, 0}, {{3}, 43}}; + + // clang-format off + std::vector expected_user_bins{{{}, 0, 1, 7}, + {{}, 1, 1, 6}, + {{2}, 0, 22 - 1, 3}, + {{2}, 22 - 1, 21, 4}, + {{2}, 43 - 1, 21, 5}, + {{3}, 0, 42 + 1, 1}, + {{3}, 42 + 1, 11 - 1, 0}, + {{3}, 53, 11 - 1, 2}}; + + // clang-format on + + EXPECT_RANGE_EQ(hibf_layout.max_bins, expected_max_bins); + EXPECT_RANGE_EQ(hibf_layout.user_bins, expected_user_bins); +} + TEST(hierarchical_binning_test, another_example) { seqan::hibf::config config; From 5b8980e6805897c3c2e113ba7b171f577e07da94 Mon Sep 17 00:00:00 2001 From: Enrico Seiler Date: Tue, 10 Dec 2024 13:31:51 +0100 Subject: [PATCH 2/2] [FEATURE] Track occupancy in IBF --- include/hibf/interleaved_bloom_filter.hpp | 38 ++++++++------- src/build/construct_ibf.cpp | 4 +- src/build/insert_into_ibf.cpp | 24 +++++++++- src/interleaved_bloom_filter.cpp | 48 ++++++++----------- .../interleaved_bloom_filter_benchmark.cpp | 17 +++---- .../hibf/interleaved_bloom_filter_test.cpp | 29 +++++++++-- 6 files changed, 97 insertions(+), 63 deletions(-) diff --git a/include/hibf/interleaved_bloom_filter.hpp b/include/hibf/interleaved_bloom_filter.hpp index 57ad8e97..d9f5d4f3 100644 --- a/include/hibf/interleaved_bloom_filter.hpp +++ b/include/hibf/interleaved_bloom_filter.hpp @@ -23,6 +23,7 @@ #include // for make_nvp #include // for CEREAL_SERIALIZE_FUNCTION_NAME #include // for base_class +#include // for vector #include // for cereal_archive #include // for aligned_allocator @@ -192,10 +193,6 @@ class interleaved_bloom_filter : private seqan::hibf::bit_vector return h; } - //!\brief Helper function to reduce code-duplication between emplace and emplace_exists. - template - inline auto emplace_impl(size_t const value, bin_index const bin) noexcept; - public: class membership_agent_type; // documented upon definition below template @@ -215,6 +212,7 @@ class interleaved_bloom_filter : private seqan::hibf::bit_vector * \param bins_ The number of bins. * \param size The bitvector size. * \param funs The number of hash functions. Default 2. At least 1, at most 5. + * \param track_occupancy_ Whether to track the occupancy of the bins. * * \details * @@ -222,9 +220,10 @@ class interleaved_bloom_filter : private seqan::hibf::bit_vector * * \include test/snippet/ibf/interleaved_bloom_filter_constructor.cpp */ - interleaved_bloom_filter(seqan::hibf::bin_count bins_, - seqan::hibf::bin_size size, - seqan::hibf::hash_function_count funs = seqan::hibf::hash_function_count{2u}); + interleaved_bloom_filter(seqan::hibf::bin_count const bins_, + seqan::hibf::bin_size const size, + seqan::hibf::hash_function_count const funs = seqan::hibf::hash_function_count{2u}, + bool const track_occupancy_ = false); /*!\brief Construct an Interleaved Bloom Filter. * \param configuration The seqan::hibf::config. @@ -249,20 +248,14 @@ class interleaved_bloom_filter : private seqan::hibf::bit_vector * * \details * + * If `track_occupancy` is set to `true`, the occupancy of the bin is tracked. + * * ### Example * * \include test/snippet/ibf/interleaved_bloom_filter_emplace.cpp */ void emplace(size_t const value, bin_index const bin) noexcept; - /*!\brief Inserts a value into a specific bin and returns whether the value already existed. - * \param[in] value The raw numeric value to process. - * \param[in] bin The bin index to insert into. - * \returns `true` if the value already existed, `false` otherwise. - * \sa seqan::hibf::interleaved_bloom_filter::emplace - */ - [[nodiscard]] bool emplace_exists(size_t const value, bin_index const bin) noexcept; - /*!\brief Clears a specific bin. * \param[in] bin The bin index to clear. * @@ -293,7 +286,7 @@ class interleaved_bloom_filter : private seqan::hibf::bit_vector "The reference type of the range to clear must be seqan::hibf::bin_index."); #ifndef NDEBUG for (auto && bin : bin_range) - assert(bin.value < bins); + assert(bin.value < technical_bins); #endif // NDEBUG for (size_t offset = 0, i = 0; i < bin_size_; offset += technical_bins, ++i) @@ -438,6 +431,17 @@ class interleaved_bloom_filter : private seqan::hibf::bit_vector using base_t::data; //!\} + /*!\brief Contains the number of unique values inserted into each bin. + * \details + * Only contains non-zero values if `track_occupancy` is true. + * + * A value is unique if inserting it into the IBF would set at least one previously unset bit. + */ + std::vector occupancy{}; + + //!\brief Whether to track the occupancy of the bins. + bool track_occupancy{false}; + /*!\cond DEV * \brief The version of the HIBF. */ @@ -461,6 +465,8 @@ class interleaved_bloom_filter : private seqan::hibf::bit_vector archive(bin_words); archive(hash_funs); archive(cereal::base_class(this)); + archive(occupancy); + archive(track_occupancy); } //!\endcond }; diff --git a/src/build/construct_ibf.cpp b/src/build/construct_ibf.cpp index d44e7cca..5328c3f4 100644 --- a/src/build/construct_ibf.cpp +++ b/src/build/construct_ibf.cpp @@ -51,7 +51,9 @@ seqan::hibf::interleaved_bloom_filter construct_ibf(robin_hood::unordered_flat_s local_index_allocation_timer.start(); seqan::hibf::interleaved_bloom_filter ibf{bin_count, bin_size, - seqan::hibf::hash_function_count{data.config.number_of_hash_functions}}; + seqan::hibf::hash_function_count{data.config.number_of_hash_functions}, + data.config.empty_bin_fraction > 0.0}; + local_index_allocation_timer.stop(); data.index_allocation_timer += local_index_allocation_timer; diff --git a/src/build/insert_into_ibf.cpp b/src/build/insert_into_ibf.cpp index 45e2d432..e0517143 100644 --- a/src/build/insert_into_ibf.cpp +++ b/src/build/insert_into_ibf.cpp @@ -34,14 +34,34 @@ void insert_into_ibf(robin_hood::unordered_flat_set const & kmers, serial_timer local_fill_ibf_timer{}; local_fill_ibf_timer.start(); - for (auto chunk : kmers | seqan::stl::views::chunk(chunk_size)) + auto chunk_view = seqan::stl::views::chunk(kmers, chunk_size); + for (auto && chunk : chunk_view) { assert(chunk_number < number_of_bins); seqan::hibf::bin_index const bin_idx{bin_index + chunk_number}; ++chunk_number; - for (size_t const value : chunk) + for (auto && value : chunk) ibf.emplace(value, bin_idx); } + + assert(chunk_view.size() <= number_of_bins); + // Edge case: If there are not enough k-mers to emplace at least one value into each bin, set the occupancy of + // the left over bins to 1. + // GCOVR_EXCL_START + if (ibf.track_occupancy && chunk_view.size() < number_of_bins) + { + size_t const diff = number_of_bins - chunk_view.size(); + auto it = ibf.occupancy.begin() + bin_index + chunk_view.size(); + assert(std::ranges::all_of(it, + it + diff, + [](size_t value) + { + return value == 0u; + })); + std::ranges::fill_n(it, diff, 1u); + } + // GCOVR_EXCL_STOP + local_fill_ibf_timer.stop(); fill_ibf_timer += local_fill_ibf_timer; } diff --git a/src/interleaved_bloom_filter.cpp b/src/interleaved_bloom_filter.cpp index 60e7570e..b00a8d45 100644 --- a/src/interleaved_bloom_filter.cpp +++ b/src/interleaved_bloom_filter.cpp @@ -28,14 +28,15 @@ namespace seqan::hibf # pragma GCC diagnostic ignored "-Wattributes" #endif // HIBF_COMPILER_IS_GCC -interleaved_bloom_filter::interleaved_bloom_filter(seqan::hibf::bin_count bins_, - seqan::hibf::bin_size size, - seqan::hibf::hash_function_count funs) +interleaved_bloom_filter::interleaved_bloom_filter(seqan::hibf::bin_count const bins_, + seqan::hibf::bin_size const size, + seqan::hibf::hash_function_count const funs, + bool const track_occupancy_) : + bins{bins_.value}, + bin_size_{size.value}, + hash_funs{funs.value}, + track_occupancy{track_occupancy_} { - bins = bins_.value; - bin_size_ = size.value; - hash_funs = funs.value; - if (bins == 0) throw std::logic_error{"The number of bins must be > 0."}; if (hash_funs == 0 || hash_funs > 5) @@ -47,6 +48,7 @@ interleaved_bloom_filter::interleaved_bloom_filter(seqan::hibf::bin_count bins_, bin_words = divide_and_ceil(bins, 64u); technical_bins = bin_words * 64u; resize(technical_bins * bin_size_); + occupancy.resize(technical_bins, 0u); } size_t find_biggest_bin(config const & configuration) @@ -101,7 +103,8 @@ size_t max_bin_size(config & configuration, size_t const max_bin_elements) interleaved_bloom_filter::interleaved_bloom_filter(config & configuration, size_t const max_bin_elements) : interleaved_bloom_filter{seqan::hibf::bin_count{configuration.number_of_user_bins}, seqan::hibf::bin_size{max_bin_size(configuration, max_bin_elements)}, - seqan::hibf::hash_function_count{configuration.number_of_hash_functions}} + seqan::hibf::hash_function_count{configuration.number_of_hash_functions}, + configuration.empty_bin_fraction > 0.0} { size_t const chunk_size = std::clamp(std::bit_ceil(bin_count() / configuration.threads), 8u, 64u); @@ -112,39 +115,27 @@ interleaved_bloom_filter::interleaved_bloom_filter(config & configuration, size_ } } -template -inline auto interleaved_bloom_filter::emplace_impl(size_t const value, bin_index const bin) noexcept +[[gnu::always_inline]] void interleaved_bloom_filter::emplace(size_t const value, bin_index const bin) noexcept { assert(bin.value < bins); - [[maybe_unused]] bool exists{true}; + bool exists{track_occupancy}; for (size_t i = 0; i < hash_funs; ++i) { - size_t idx = hash_and_fit(value, hash_seeds[i]); - idx += bin.value; + size_t const idx = hash_and_fit(value, hash_seeds[i]) + bin.value; assert(idx < size()); - // Constructing the reference twice for emplace_exists would impact performance. + // Constructing the reference twice for tracking occupancy would impact performance. // No difference for emplace. seqan::hibf::bit_vector::reference bit_reference{(*this)[idx]}; - if constexpr (check_exists) + if (track_occupancy) exists &= bit_reference; - bit_reference = 1; + bit_reference = true; }; - if constexpr (check_exists) - return exists; -}; - -[[gnu::always_inline]] void interleaved_bloom_filter::emplace(size_t const value, bin_index const bin) noexcept -{ - return emplace_impl(value, bin); -} - -[[gnu::always_inline]] bool interleaved_bloom_filter::emplace_exists(size_t const value, bin_index const bin) noexcept -{ - return emplace_impl(value, bin); + if (track_occupancy && !exists) + ++occupancy[bin.value]; } void interleaved_bloom_filter::clear(bin_index const bin) noexcept @@ -205,6 +196,7 @@ void interleaved_bloom_filter::increase_bin_number_to(seqan::hibf::bin_count con bins = new_bins; bin_words = new_bin_words; technical_bins = new_technical_bins; + occupancy.resize(technical_bins, 0u); } [[gnu::always_inline]] bit_vector const & diff --git a/test/performance/ibf/interleaved_bloom_filter_benchmark.cpp b/test/performance/ibf/interleaved_bloom_filter_benchmark.cpp index 6c6c819d..a19420e7 100644 --- a/test/performance/ibf/interleaved_bloom_filter_benchmark.cpp +++ b/test/performance/ibf/interleaved_bloom_filter_benchmark.cpp @@ -92,7 +92,7 @@ inline benchmark::Counter elements_per_second(size_t const count) return benchmark::Counter(count, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::OneK::kIs1000); } -template +template inline void emplace_benchmark_impl(::benchmark::State & state) { auto const & [values, original_ibf] = set_up(state); @@ -102,23 +102,18 @@ inline void emplace_benchmark_impl(::benchmark::State & state) seqan::hibf::interleaved_bloom_filter ibf{seqan::hibf::bin_count{original_ibf.bin_count()}, seqan::hibf::bin_size{original_ibf.bin_size()}, - seqan::hibf::hash_function_count{original_ibf.hash_function_count()}}; + seqan::hibf::hash_function_count{original_ibf.hash_function_count()}, + track_occupancy}; for (auto _ : state) { size_t bin_index = 0u; - [[maybe_unused]] size_t result{}; for (auto && chunk : seqan::stl::views::chunk(values, chunk_size)) { for (auto value : chunk) - if constexpr (check_exists) - result += ibf.emplace_exists(value, seqan::hibf::bin_index{bin_index}); - else - ibf.emplace(value, seqan::hibf::bin_index{bin_index}); + ibf.emplace(value, seqan::hibf::bin_index{bin_index}); ++bin_index; } - if constexpr (check_exists) - benchmark::DoNotOptimize(result); } state.counters["elements"] = elements_per_second(number_of_elements); @@ -129,7 +124,7 @@ void emplace_benchmark(::benchmark::State & state) emplace_benchmark_impl(state); } -void emplace_exists_benchmark(::benchmark::State & state) +void emplace_with_occupancy_benchmark(::benchmark::State & state) { emplace_benchmark_impl(state); } @@ -209,7 +204,7 @@ void bulk_count_benchmark(::benchmark::State & state) } BENCHMARK(emplace_benchmark)->RangeMultiplier(2)->Range(64, 1024); -BENCHMARK(emplace_exists_benchmark)->RangeMultiplier(2)->Range(64, 1024); +BENCHMARK(emplace_with_occupancy_benchmark)->RangeMultiplier(2)->Range(64, 1024); BENCHMARK(clear_benchmark)->RangeMultiplier(2)->Range(64, 1024); BENCHMARK(clear_range_benchmark)->RangeMultiplier(2)->Range(64, 1024); BENCHMARK(bulk_contains_benchmark)->RangeMultiplier(2)->Range(64, 1024); diff --git a/test/unit/hibf/interleaved_bloom_filter_test.cpp b/test/unit/hibf/interleaved_bloom_filter_test.cpp index 8aff0be2..da2ab6be 100644 --- a/test/unit/hibf/interleaved_bloom_filter_test.cpp +++ b/test/unit/hibf/interleaved_bloom_filter_test.cpp @@ -196,26 +196,45 @@ TEST(ibf_test, emplace) auto & res = agent.bulk_contains(hash); EXPECT_RANGE_EQ(res, expected); } + + EXPECT_EQ(ibf.occupancy.size(), 64u); + EXPECT_TRUE(std::ranges::all_of(ibf.occupancy, + [](size_t const occ) + { + return occ == 0u; + })); } -TEST(ibf_test, emplace_exists) +TEST(ibf_test, emplace_with_occupancy) { // 1. Construct and emplace seqan::hibf::interleaved_bloom_filter ibf{seqan::hibf::bin_count{128u}, seqan::hibf::bin_size{512}, - seqan::hibf::hash_function_count{2u}}; + seqan::hibf::hash_function_count{2u}, + true}; for (size_t bin_idx : std::views::iota(0, 64)) for (size_t hash : std::views::iota(0, 64)) ibf.emplace(hash, seqan::hibf::bin_index{bin_idx}); // 2. Test for correctness + + auto agent = ibf.membership_agent(); + std::vector expected(128); + std::fill(expected.begin(), expected.begin() + 64u, true); + for (size_t hash : std::views::iota(0, 64)) + { + auto & res = agent.bulk_contains(hash); + EXPECT_RANGE_EQ(res, expected); + } + + ASSERT_EQ(ibf.occupancy.size(), 128u); + for (size_t bin_idx : std::views::iota(0, 64)) - for (size_t hash : std::views::iota(0, 64)) - ASSERT_TRUE(ibf.emplace_exists(hash, seqan::hibf::bin_index{bin_idx})); + EXPECT_NE(ibf.occupancy[bin_idx], 0u); for (size_t bin_idx : std::views::iota(64, 128)) - ASSERT_FALSE(ibf.emplace_exists(0u, seqan::hibf::bin_index{bin_idx})); + EXPECT_EQ(ibf.occupancy[bin_idx], 0u); } TEST(ibf_test, clear)