From ddb823f4baa94ca49767278f0c488e54f7802df6 Mon Sep 17 00:00:00 2001 From: Enrico Seiler Date: Wed, 16 Oct 2024 17:50:20 +0200 Subject: [PATCH] [MISC] Add max_bin_elements to IBF constructor Sometimes the maximum number of unique elements in any bin is already known. For example, in Raptor, the maximum count for minimizer files is known without iterating over them. --- include/hibf/interleaved_bloom_filter.hpp | 18 ++++++++---- src/interleaved_bloom_filter.cpp | 26 +++++++++++------ .../hibf/interleaved_bloom_filter_test.cpp | 28 +++++++++++++++++++ 3 files changed, 58 insertions(+), 14 deletions(-) diff --git a/include/hibf/interleaved_bloom_filter.hpp b/include/hibf/interleaved_bloom_filter.hpp index d53c425d..e73a52cf 100644 --- a/include/hibf/interleaved_bloom_filter.hpp +++ b/include/hibf/interleaved_bloom_filter.hpp @@ -207,13 +207,11 @@ class interleaved_bloom_filter : private seqan::hibf::bit_vector interleaved_bloom_filter & operator=(interleaved_bloom_filter &&) noexcept = default; //!< Defaulted. ~interleaved_bloom_filter() = default; //!< Defaulted. - /*!\brief Construct an uncompressed Interleaved Bloom Filter. + /*!\brief Construct an Interleaved Bloom Filter. * \param bins_ The number of bins. * \param size The bitvector size. * \param funs The number of hash functions. Default 2. At least 1, at most 5. * - * \attention This constructor can only be used to construct **uncompressed** Interleaved Bloom Filters. - * * \details * * ### Example @@ -224,8 +222,18 @@ class interleaved_bloom_filter : private seqan::hibf::bit_vector seqan::hibf::bin_size size, seqan::hibf::hash_function_count funs = seqan::hibf::hash_function_count{2u}); - //!\brief Construct from seqan::hibf::config. - interleaved_bloom_filter(config & configuration); + /*!\brief Construct an Interleaved Bloom Filter. + * \param configuration The seqan::hibf::config. + * \param max_bin_elements Optional, the maximum number of unique elements in any bin. 
+ * \details + * + * If `max_bin_elements` is not passed, or `max_bin_elements` is 0, the maximum number of unique elements in any bin + * will be determined automatically. + * + * `max_bin_elements` must be the maximum number of unique elements for any bin as evaluated with + * `seqan::hibf::config::input_fn`. + */ + interleaved_bloom_filter(config & configuration, size_t const max_bin_elements = 0u); //!\} /*!\name Modifiers diff --git a/src/interleaved_bloom_filter.cpp b/src/interleaved_bloom_filter.cpp index f55f686b..195ad487 100644 --- a/src/interleaved_bloom_filter.cpp +++ b/src/interleaved_bloom_filter.cpp @@ -43,20 +43,28 @@ interleaved_bloom_filter::interleaved_bloom_filter(seqan::hibf::bin_count bins_, resize(technical_bins * bin_size_); } -size_t max_bin_size(config & configuration) +size_t max_bin_size(config & configuration, size_t const max_bin_elements) { configuration.validate_and_set_defaults(); size_t max_size{}; - robin_hood::unordered_flat_set kmers; -#pragma omp parallel for schedule(dynamic) num_threads(configuration.threads) private(kmers) - for (size_t i = 0u; i < configuration.number_of_user_bins; ++i) + + if (max_bin_elements == 0u) { - kmers.clear(); - configuration.input_fn(i, insert_iterator{kmers}); + robin_hood::unordered_flat_set kmers; +#pragma omp parallel for schedule(dynamic) num_threads(configuration.threads) private(kmers) + for (size_t i = 0u; i < configuration.number_of_user_bins; ++i) + { + kmers.clear(); + configuration.input_fn(i, insert_iterator{kmers}); #pragma omp critical - max_size = std::max(max_size, kmers.size()); + max_size = std::max(max_size, kmers.size()); + } + } + else + { + max_size = max_bin_elements; } return build::bin_size_in_bits({.fpr = configuration.maximum_fpr, // @@ -65,9 +73,9 @@ size_t max_bin_size(config & configuration) } // config validation is done by max_bin_size -interleaved_bloom_filter::interleaved_bloom_filter(config & configuration) : 
+interleaved_bloom_filter::interleaved_bloom_filter(config & configuration, size_t const max_bin_elements) : interleaved_bloom_filter{seqan::hibf::bin_count{configuration.number_of_user_bins}, - seqan::hibf::bin_size{max_bin_size(configuration)}, + seqan::hibf::bin_size{max_bin_size(configuration, max_bin_elements)}, seqan::hibf::hash_function_count{configuration.number_of_hash_functions}} { // NOLINTNEXTLINE(clang-analyzer-deadcode.DeadStores) diff --git a/test/unit/hibf/interleaved_bloom_filter_test.cpp b/test/unit/hibf/interleaved_bloom_filter_test.cpp index 145d14bd..9a3d35a8 100644 --- a/test/unit/hibf/interleaved_bloom_filter_test.cpp +++ b/test/unit/hibf/interleaved_bloom_filter_test.cpp @@ -96,6 +96,34 @@ TEST(ibf_test, construction_from_config) EXPECT_RANGE_EQ(agent.bulk_contains(0), expected_v0); } +TEST(ibf_test, construction_from_config_with_max_bin_elements) +{ + std::vector> hashes{{1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u}, {0u, 2u, 3u, 4u, 5u}}; + size_t const number_of_ub{hashes.size()}; + + seqan::hibf::config ibf_config{.input_fn = + [&](size_t const num, seqan::hibf::insert_iterator it) + { + for (auto const hash : hashes[num]) + it = hash; + }, + .number_of_user_bins = number_of_ub}; + + seqan::hibf::interleaved_bloom_filter only_config{ibf_config}; + seqan::hibf::interleaved_bloom_filter default_num_elements{ibf_config, 0u}; + seqan::hibf::interleaved_bloom_filter appropriate_num_elements{ibf_config, 10u}; + seqan::hibf::interleaved_bloom_filter larger_num_elements{ibf_config, 20u}; + + EXPECT_EQ(only_config, default_num_elements); + EXPECT_EQ(only_config, appropriate_num_elements); + EXPECT_NE(only_config, larger_num_elements); + + EXPECT_EQ(default_num_elements, appropriate_num_elements); + EXPECT_NE(default_num_elements, larger_num_elements); + + EXPECT_NE(appropriate_num_elements, larger_num_elements); +} + TEST(ibf_test, member_getter) { seqan::hibf::interleaved_bloom_filter ibf{seqan::hibf::bin_count{64u}, seqan::hibf::bin_size{1024u}};