Skip to content

Commit

Permalink
[FEATURE] Track occupancy in IBF
Browse files Browse the repository at this point in the history
  • Loading branch information
eseiler committed Dec 19, 2024
1 parent e7eb1e3 commit 5b8980e
Show file tree
Hide file tree
Showing 6 changed files with 97 additions and 63 deletions.
38 changes: 22 additions & 16 deletions include/hibf/interleaved_bloom_filter.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
#include <cereal/cereal.hpp> // for make_nvp
#include <cereal/macros.hpp> // for CEREAL_SERIALIZE_FUNCTION_NAME
#include <cereal/types/base_class.hpp> // for base_class
#include <cereal/types/vector.hpp> // for vector

#include <hibf/cereal/concepts.hpp> // for cereal_archive
#include <hibf/contrib/aligned_allocator.hpp> // for aligned_allocator
Expand Down Expand Up @@ -192,10 +193,6 @@ class interleaved_bloom_filter : private seqan::hibf::bit_vector
return h;
}

//!\brief Helper function to reduce code-duplication between emplace and emplace_exists.
template <bool check_exists>
inline auto emplace_impl(size_t const value, bin_index const bin) noexcept;

public:
class membership_agent_type; // documented upon definition below
template <std::integral value_t>
Expand All @@ -215,16 +212,18 @@ class interleaved_bloom_filter : private seqan::hibf::bit_vector
* \param bins_ The number of bins.
* \param size The bitvector size.
* \param funs The number of hash functions. Default 2. At least 1, at most 5.
* \param track_occupancy_ Whether to track the occupancy of the bins. Default `false`.
*
* \details
*
* ### Example
*
* \include test/snippet/ibf/interleaved_bloom_filter_constructor.cpp
*/
interleaved_bloom_filter(seqan::hibf::bin_count bins_,
seqan::hibf::bin_size size,
seqan::hibf::hash_function_count funs = seqan::hibf::hash_function_count{2u});
interleaved_bloom_filter(seqan::hibf::bin_count const bins_,
seqan::hibf::bin_size const size,
seqan::hibf::hash_function_count const funs = seqan::hibf::hash_function_count{2u},
bool const track_occupancy_ = false);

/*!\brief Construct an Interleaved Bloom Filter.
* \param configuration The seqan::hibf::config.
Expand All @@ -249,20 +248,14 @@ class interleaved_bloom_filter : private seqan::hibf::bit_vector
*
* \details
*
* If `track_occupancy` is set to `true`, the occupancy of the bin is tracked.
*
* ### Example
*
* \include test/snippet/ibf/interleaved_bloom_filter_emplace.cpp
*/
void emplace(size_t const value, bin_index const bin) noexcept;

/*!\brief Inserts a value into a specific bin and returns whether the value already existed.
* \param[in] value The raw numeric value to process.
* \param[in] bin The bin index to insert into.
* \returns `true` if the value already existed, `false` otherwise.
* \sa seqan::hibf::interleaved_bloom_filter::emplace
*/
[[nodiscard]] bool emplace_exists(size_t const value, bin_index const bin) noexcept;

/*!\brief Clears a specific bin.
* \param[in] bin The bin index to clear.
*
Expand Down Expand Up @@ -293,7 +286,7 @@ class interleaved_bloom_filter : private seqan::hibf::bit_vector
"The reference type of the range to clear must be seqan::hibf::bin_index.");
#ifndef NDEBUG
for (auto && bin : bin_range)
assert(bin.value < bins);
assert(bin.value < technical_bins);
#endif // NDEBUG

for (size_t offset = 0, i = 0; i < bin_size_; offset += technical_bins, ++i)
Expand Down Expand Up @@ -438,6 +431,17 @@ class interleaved_bloom_filter : private seqan::hibf::bit_vector
using base_t::data;
//!\}

/*!\brief Contains the number of unique values inserted into each bin.
* \details
* Only contains non-zero values if `track_occupancy` is true.
*
* A value is unique if inserting it into the IBF would set at least one previously unset bit.
*/
std::vector<size_t> occupancy{};

//!\brief Whether to track the occupancy of the bins.
bool track_occupancy{false};

/*!\cond DEV
* \brief The version of the HIBF.
*/
Expand All @@ -461,6 +465,8 @@ class interleaved_bloom_filter : private seqan::hibf::bit_vector
archive(bin_words);
archive(hash_funs);
archive(cereal::base_class<base_t>(this));
archive(occupancy);
archive(track_occupancy);
}
//!\endcond
};
Expand Down
4 changes: 3 additions & 1 deletion src/build/construct_ibf.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,9 @@ seqan::hibf::interleaved_bloom_filter construct_ibf(robin_hood::unordered_flat_s
local_index_allocation_timer.start();
seqan::hibf::interleaved_bloom_filter ibf{bin_count,
bin_size,
seqan::hibf::hash_function_count{data.config.number_of_hash_functions}};
seqan::hibf::hash_function_count{data.config.number_of_hash_functions},
data.config.empty_bin_fraction > 0.0};

local_index_allocation_timer.stop();
data.index_allocation_timer += local_index_allocation_timer;

Expand Down
24 changes: 22 additions & 2 deletions src/build/insert_into_ibf.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,14 +34,34 @@ void insert_into_ibf(robin_hood::unordered_flat_set<uint64_t> const & kmers,

serial_timer local_fill_ibf_timer{};
local_fill_ibf_timer.start();
for (auto chunk : kmers | seqan::stl::views::chunk(chunk_size))
auto chunk_view = seqan::stl::views::chunk(kmers, chunk_size);
for (auto && chunk : chunk_view)
{
assert(chunk_number < number_of_bins);
seqan::hibf::bin_index const bin_idx{bin_index + chunk_number};
++chunk_number;
for (size_t const value : chunk)
for (auto && value : chunk)
ibf.emplace(value, bin_idx);
}

assert(chunk_view.size() <= number_of_bins);
// Edge case: If there are not enough k-mers to emplace at least one value into each bin, set the occupancy of
// the leftover bins to 1.
// GCOVR_EXCL_START
if (ibf.track_occupancy && chunk_view.size() < number_of_bins)
{
size_t const diff = number_of_bins - chunk_view.size();
auto it = ibf.occupancy.begin() + bin_index + chunk_view.size();
assert(std::ranges::all_of(it,
it + diff,
[](size_t value)
{
return value == 0u;
}));
std::ranges::fill_n(it, diff, 1u);
}
// GCOVR_EXCL_STOP

local_fill_ibf_timer.stop();
fill_ibf_timer += local_fill_ibf_timer;
}
Expand Down
48 changes: 20 additions & 28 deletions src/interleaved_bloom_filter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,14 +28,15 @@ namespace seqan::hibf
# pragma GCC diagnostic ignored "-Wattributes"
#endif // HIBF_COMPILER_IS_GCC

interleaved_bloom_filter::interleaved_bloom_filter(seqan::hibf::bin_count bins_,
seqan::hibf::bin_size size,
seqan::hibf::hash_function_count funs)
interleaved_bloom_filter::interleaved_bloom_filter(seqan::hibf::bin_count const bins_,
seqan::hibf::bin_size const size,
seqan::hibf::hash_function_count const funs,
bool const track_occupancy_) :
bins{bins_.value},
bin_size_{size.value},
hash_funs{funs.value},
track_occupancy{track_occupancy_}
{
bins = bins_.value;
bin_size_ = size.value;
hash_funs = funs.value;

if (bins == 0)
throw std::logic_error{"The number of bins must be > 0."};
if (hash_funs == 0 || hash_funs > 5)
Expand All @@ -47,6 +48,7 @@ interleaved_bloom_filter::interleaved_bloom_filter(seqan::hibf::bin_count bins_,
bin_words = divide_and_ceil(bins, 64u);
technical_bins = bin_words * 64u;
resize(technical_bins * bin_size_);
occupancy.resize(technical_bins, 0u);
}

size_t find_biggest_bin(config const & configuration)
Expand Down Expand Up @@ -101,7 +103,8 @@ size_t max_bin_size(config & configuration, size_t const max_bin_elements)
interleaved_bloom_filter::interleaved_bloom_filter(config & configuration, size_t const max_bin_elements) :
interleaved_bloom_filter{seqan::hibf::bin_count{configuration.number_of_user_bins},
seqan::hibf::bin_size{max_bin_size(configuration, max_bin_elements)},
seqan::hibf::hash_function_count{configuration.number_of_hash_functions}}
seqan::hibf::hash_function_count{configuration.number_of_hash_functions},
configuration.empty_bin_fraction > 0.0}
{
size_t const chunk_size = std::clamp<size_t>(std::bit_ceil(bin_count() / configuration.threads), 8u, 64u);

Expand All @@ -112,39 +115,27 @@ interleaved_bloom_filter::interleaved_bloom_filter(config & configuration, size_
}
}

template <bool check_exists>
inline auto interleaved_bloom_filter::emplace_impl(size_t const value, bin_index const bin) noexcept
[[gnu::always_inline]] void interleaved_bloom_filter::emplace(size_t const value, bin_index const bin) noexcept
{
assert(bin.value < bins);

[[maybe_unused]] bool exists{true};
bool exists{track_occupancy};

for (size_t i = 0; i < hash_funs; ++i)
{
size_t idx = hash_and_fit(value, hash_seeds[i]);
idx += bin.value;
size_t const idx = hash_and_fit(value, hash_seeds[i]) + bin.value;
assert(idx < size());

// Constructing the reference twice for emplace_exists would impact performance.
// Construct the bit reference once and use it for both the read and the write.
// Constructing it twice when tracking occupancy would impact performance; without tracking there is no difference.
seqan::hibf::bit_vector::reference bit_reference{(*this)[idx]};
if constexpr (check_exists)
if (track_occupancy)
exists &= bit_reference;
bit_reference = 1;
bit_reference = true;
};

if constexpr (check_exists)
return exists;
};

[[gnu::always_inline]] void interleaved_bloom_filter::emplace(size_t const value, bin_index const bin) noexcept
{
return emplace_impl<false>(value, bin);
}

[[gnu::always_inline]] bool interleaved_bloom_filter::emplace_exists(size_t const value, bin_index const bin) noexcept
{
return emplace_impl<true>(value, bin);
if (track_occupancy && !exists)
++occupancy[bin.value];
}

void interleaved_bloom_filter::clear(bin_index const bin) noexcept
Expand Down Expand Up @@ -205,6 +196,7 @@ void interleaved_bloom_filter::increase_bin_number_to(seqan::hibf::bin_count con
bins = new_bins;
bin_words = new_bin_words;
technical_bins = new_technical_bins;
occupancy.resize(technical_bins, 0u);
}

[[gnu::always_inline]] bit_vector const &
Expand Down
17 changes: 6 additions & 11 deletions test/performance/ibf/interleaved_bloom_filter_benchmark.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@ inline benchmark::Counter elements_per_second(size_t const count)
return benchmark::Counter(count, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::OneK::kIs1000);
}

template <bool check_exists>
template <bool track_occupancy>
inline void emplace_benchmark_impl(::benchmark::State & state)
{
auto const & [values, original_ibf] = set_up(state);
Expand All @@ -102,23 +102,18 @@ inline void emplace_benchmark_impl(::benchmark::State & state)

seqan::hibf::interleaved_bloom_filter ibf{seqan::hibf::bin_count{original_ibf.bin_count()},
seqan::hibf::bin_size{original_ibf.bin_size()},
seqan::hibf::hash_function_count{original_ibf.hash_function_count()}};
seqan::hibf::hash_function_count{original_ibf.hash_function_count()},
track_occupancy};

for (auto _ : state)
{
size_t bin_index = 0u;
[[maybe_unused]] size_t result{};
for (auto && chunk : seqan::stl::views::chunk(values, chunk_size))
{
for (auto value : chunk)
if constexpr (check_exists)
result += ibf.emplace_exists(value, seqan::hibf::bin_index{bin_index});
else
ibf.emplace(value, seqan::hibf::bin_index{bin_index});
ibf.emplace(value, seqan::hibf::bin_index{bin_index});
++bin_index;
}
if constexpr (check_exists)
benchmark::DoNotOptimize(result);
}

state.counters["elements"] = elements_per_second(number_of_elements);
Expand All @@ -129,7 +124,7 @@ void emplace_benchmark(::benchmark::State & state)
emplace_benchmark_impl<false>(state);
}

void emplace_exists_benchmark(::benchmark::State & state)
void emplace_with_occupancy_benchmark(::benchmark::State & state)
{
emplace_benchmark_impl<true>(state);
}
Expand Down Expand Up @@ -209,7 +204,7 @@ void bulk_count_benchmark(::benchmark::State & state)
}

BENCHMARK(emplace_benchmark)->RangeMultiplier(2)->Range(64, 1024);
BENCHMARK(emplace_exists_benchmark)->RangeMultiplier(2)->Range(64, 1024);
BENCHMARK(emplace_with_occupancy_benchmark)->RangeMultiplier(2)->Range(64, 1024);
BENCHMARK(clear_benchmark)->RangeMultiplier(2)->Range(64, 1024);
BENCHMARK(clear_range_benchmark)->RangeMultiplier(2)->Range(64, 1024);
BENCHMARK(bulk_contains_benchmark)->RangeMultiplier(2)->Range(64, 1024);
Expand Down
29 changes: 24 additions & 5 deletions test/unit/hibf/interleaved_bloom_filter_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -196,26 +196,45 @@ TEST(ibf_test, emplace)
auto & res = agent.bulk_contains(hash);
EXPECT_RANGE_EQ(res, expected);
}

EXPECT_EQ(ibf.occupancy.size(), 64u);
EXPECT_TRUE(std::ranges::all_of(ibf.occupancy,
[](size_t const occ)
{
return occ == 0u;
}));
}

TEST(ibf_test, emplace_exists)
TEST(ibf_test, emplace_with_occupancy)
{
// 1. Construct and emplace
seqan::hibf::interleaved_bloom_filter ibf{seqan::hibf::bin_count{128u},
seqan::hibf::bin_size{512},
seqan::hibf::hash_function_count{2u}};
seqan::hibf::hash_function_count{2u},
true};

for (size_t bin_idx : std::views::iota(0, 64))
for (size_t hash : std::views::iota(0, 64))
ibf.emplace(hash, seqan::hibf::bin_index{bin_idx});

// 2. Test for correctness

auto agent = ibf.membership_agent();
std::vector<bool> expected(128);
std::fill(expected.begin(), expected.begin() + 64u, true);
for (size_t hash : std::views::iota(0, 64))
{
auto & res = agent.bulk_contains(hash);
EXPECT_RANGE_EQ(res, expected);
}

ASSERT_EQ(ibf.occupancy.size(), 128u);

for (size_t bin_idx : std::views::iota(0, 64))
for (size_t hash : std::views::iota(0, 64))
ASSERT_TRUE(ibf.emplace_exists(hash, seqan::hibf::bin_index{bin_idx}));
EXPECT_NE(ibf.occupancy[bin_idx], 0u);

for (size_t bin_idx : std::views::iota(64, 128))
ASSERT_FALSE(ibf.emplace_exists(0u, seqan::hibf::bin_index{bin_idx}));
EXPECT_EQ(ibf.occupancy[bin_idx], 0u);
}

TEST(ibf_test, clear)
Expand Down

0 comments on commit 5b8980e

Please sign in to comment.