Skip to content

Commit

Permalink
[MISC] Add max_bin_elements to IBF constructor
Browse files Browse the repository at this point in the history
Sometimes the maximum number of elements in any bin is already known in advance. For example, in Raptor, the maximum count for minimizer files is known without having to iterate over them.
  • Loading branch information
eseiler committed Oct 18, 2024
1 parent dbbfb3d commit ddb823f
Show file tree
Hide file tree
Showing 3 changed files with 58 additions and 14 deletions.
18 changes: 13 additions & 5 deletions include/hibf/interleaved_bloom_filter.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -207,13 +207,11 @@ class interleaved_bloom_filter : private seqan::hibf::bit_vector
interleaved_bloom_filter & operator=(interleaved_bloom_filter &&) noexcept = default; //!< Defaulted.
~interleaved_bloom_filter() = default; //!< Defaulted.

/*!\brief Construct an uncompressed Interleaved Bloom Filter.
/*!\brief Construct an Interleaved Bloom Filter.
* \param bins_ The number of bins.
* \param size The bitvector size.
* \param funs The number of hash functions. Default 2. At least 1, at most 5.
*
* \attention This constructor can only be used to construct **uncompressed** Interleaved Bloom Filters.
*
* \details
*
* ### Example
Expand All @@ -224,8 +222,18 @@ class interleaved_bloom_filter : private seqan::hibf::bit_vector
seqan::hibf::bin_size size,
seqan::hibf::hash_function_count funs = seqan::hibf::hash_function_count{2u});

//!\brief Construct from seqan::hibf::config.
interleaved_bloom_filter(config & configuration);
/*!\brief Construct an Interleaved Bloom Filter from a seqan::hibf::config.
 * \param configuration The seqan::hibf::config. It is validated and its defaults are set during construction.
 * \param max_bin_elements Optional, the maximum number of unique elements in any bin. Default 0.
 * \details
 *
 * If `max_bin_elements` is not passed, or `max_bin_elements` is 0, the maximum number of unique elements in any bin
 * will be determined automatically by evaluating `seqan::hibf::config::input_fn` for every user bin.
 *
 * If a non-zero value is passed, it is used as is and `seqan::hibf::config::input_fn` is not evaluated to
 * determine the maximum.
 *
 * `max_bin_elements` must be the maximum number of unique elements for any bin as evaluated with
 * `seqan::hibf::config::input_fn`.
 */
interleaved_bloom_filter(config & configuration, size_t const max_bin_elements = 0u);
//!\}

/*!\name Modifiers
Expand Down
26 changes: 17 additions & 9 deletions src/interleaved_bloom_filter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -43,20 +43,28 @@ interleaved_bloom_filter::interleaved_bloom_filter(seqan::hibf::bin_count bins_,
resize(technical_bins * bin_size_);
}

size_t max_bin_size(config & configuration)
size_t max_bin_size(config & configuration, size_t const max_bin_elements)
{
configuration.validate_and_set_defaults();

size_t max_size{};
robin_hood::unordered_flat_set<uint64_t> kmers;
#pragma omp parallel for schedule(dynamic) num_threads(configuration.threads) private(kmers)
for (size_t i = 0u; i < configuration.number_of_user_bins; ++i)

if (max_bin_elements == 0u)
{
kmers.clear();
configuration.input_fn(i, insert_iterator{kmers});
robin_hood::unordered_flat_set<uint64_t> kmers;
#pragma omp parallel for schedule(dynamic) num_threads(configuration.threads) private(kmers)
for (size_t i = 0u; i < configuration.number_of_user_bins; ++i)
{
kmers.clear();
configuration.input_fn(i, insert_iterator{kmers});

#pragma omp critical
max_size = std::max(max_size, kmers.size());
max_size = std::max(max_size, kmers.size());
}
}
else
{
max_size = max_bin_elements;
}

return build::bin_size_in_bits({.fpr = configuration.maximum_fpr, //
Expand All @@ -65,9 +73,9 @@ size_t max_bin_size(config & configuration)
}

// config validation is done by max_bin_size
interleaved_bloom_filter::interleaved_bloom_filter(config & configuration) :
interleaved_bloom_filter::interleaved_bloom_filter(config & configuration, size_t const max_bin_elements) :
interleaved_bloom_filter{seqan::hibf::bin_count{configuration.number_of_user_bins},
seqan::hibf::bin_size{max_bin_size(configuration)},
seqan::hibf::bin_size{max_bin_size(configuration, max_bin_elements)},
seqan::hibf::hash_function_count{configuration.number_of_hash_functions}}
{
// NOLINTNEXTLINE(clang-analyzer-deadcode.DeadStores)
Expand Down
28 changes: 28 additions & 0 deletions test/unit/hibf/interleaved_bloom_filter_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,34 @@ TEST(ibf_test, construction_from_config)
EXPECT_RANGE_EQ(agent.bulk_contains(0), expected_v0);
}

// Constructing from a config with an explicit `max_bin_elements` must match the automatic determination
// when the value is 0 or equals the true maximum, and must differ when a larger value is passed.
TEST(ibf_test, construction_from_config_with_max_bin_elements)
{
    // Two user bins; the first one holds the most values (10).
    std::vector<std::vector<size_t>> const user_bin_values{{1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u},
                                                           {0u, 2u, 3u, 4u, 5u}};

    // Feeds every value of the requested user bin into the insert iterator.
    auto const insert_values = [&user_bin_values](size_t const bin_index, seqan::hibf::insert_iterator inserter)
    {
        for (auto const value : user_bin_values[bin_index])
        {
            inserter = value;
        }
    };

    seqan::hibf::config ibf_config{.input_fn = insert_values, .number_of_user_bins = user_bin_values.size()};

    // Same config, four ways of specifying the maximum number of elements per bin.
    seqan::hibf::interleaved_bloom_filter only_config{ibf_config};
    seqan::hibf::interleaved_bloom_filter default_num_elements{ibf_config, 0u};
    seqan::hibf::interleaved_bloom_filter appropriate_num_elements{ibf_config, 10u};
    seqan::hibf::interleaved_bloom_filter larger_num_elements{ibf_config, 20u};

    // Omitting the argument, passing 0, and passing the true maximum are all equivalent.
    EXPECT_EQ(only_config, default_num_elements);
    EXPECT_EQ(only_config, appropriate_num_elements);
    // A larger maximum yields a different filter.
    EXPECT_NE(only_config, larger_num_elements);

    EXPECT_EQ(default_num_elements, appropriate_num_elements);
    EXPECT_NE(default_num_elements, larger_num_elements);

    EXPECT_NE(appropriate_num_elements, larger_num_elements);
}

TEST(ibf_test, member_getter)
{
seqan::hibf::interleaved_bloom_filter ibf{seqan::hibf::bin_count{64u}, seqan::hibf::bin_size{1024u}};
Expand Down

0 comments on commit ddb823f

Please sign in to comment.