Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[FEATURE] Track occupancy in IBF #257

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 28 additions & 2 deletions include/hibf/config.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ namespace seqan::hibf
* | General | seqan::hibf::config::threads | 1 | [RECOMMENDED_TO_ADAPT] |
* | Layout | seqan::hibf::config::sketch_bits | 12 | |
* | Layout | seqan::hibf::config::tmax | 0 | 0 indicates unset |
* | Layout | seqan::hibf::config::empty_bin_fraction | 0.0 | Dynamic Layout |
* | Layout | seqan::hibf::config::max_rearrangement_ratio | 0.5 | |
* | Layout | seqan::hibf::config::alpha | 1.2 | |
* | Layout | seqan::hibf::config::disable_estimate_union | false | |
Expand Down Expand Up @@ -230,6 +231,23 @@ struct config
*/
size_t tmax{};

/*!\brief The fraction of technical bins in the layout that are kept empty.
*
* \note Do not set this option unless you are developing an application that requires empty technical bins.
*
* Certain applications, e.g., dynamic indices, require empty technical bins in the layout. This option allows you
* to specify the fraction of tmax that should be empty bins.
* The empty bins will be present in each IBF of the generated layout.
*
* For example, if `tmax` is `64` and `empty_bin_fraction` is `0.10`, then 6 bins will be empty, i.e., not
* designated to contain any data. The resulting layout will be very similar to a layout with `tmax` set to `58`
* and no empty bins.
*
* Value must be in range [0.0,1.0).
* Recommendation: default value (0.0). This option is not recommended for general use.
*/
double empty_bin_fraction{};

/*!\brief A scaling factor to influence the amount of merged bins produced by the layout algorithm.
*
* The layout algorithm optimizes the space consumption of the resulting HIBF, but currently has no means of
Expand Down Expand Up @@ -302,6 +320,7 @@ struct config
* * seqan::hibf::config::threads must be greater than `0`.
* * seqan::hibf::config::sketch_bits must be in `[5,32]`.
* * seqan::hibf::config::tmax must be at most `18446744073709551552`.
* * seqan::hibf::config::empty_bin_fraction must be in `[0.0,1.0)`.
* * seqan::hibf::config::alpha must be positive.
* * seqan::hibf::config::max_rearrangement_ratio must be in `[0.0,1.0]`.
*
Expand All @@ -324,6 +343,7 @@ struct config
threads == other.threads &&
sketch_bits == other.sketch_bits &&
tmax == other.tmax &&
empty_bin_fraction == other.empty_bin_fraction &&
alpha == other.alpha &&
max_rearrangement_ratio == other.max_rearrangement_ratio &&
disable_estimate_union == other.disable_estimate_union &&
Expand All @@ -334,11 +354,13 @@ struct config
private:
friend class cereal::access;

static constexpr uint32_t version{2};

template <typename archive_t>
void serialize(archive_t & archive)
{
uint32_t version{1};
archive(CEREAL_NVP(version));
uint32_t parsed_version{version};
archive(cereal::make_nvp("version", parsed_version));

archive(CEREAL_NVP(number_of_user_bins));
archive(CEREAL_NVP(number_of_hash_functions));
Expand All @@ -348,6 +370,10 @@ struct config

archive(CEREAL_NVP(sketch_bits));
archive(CEREAL_NVP(tmax));

if (parsed_version > 1u)
archive(CEREAL_NVP(empty_bin_fraction));

archive(CEREAL_NVP(alpha));
archive(CEREAL_NVP(max_rearrangement_ratio));
archive(CEREAL_NVP(disable_estimate_union));
Expand Down
38 changes: 22 additions & 16 deletions include/hibf/interleaved_bloom_filter.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
#include <cereal/cereal.hpp> // for make_nvp
#include <cereal/macros.hpp> // for CEREAL_SERIALIZE_FUNCTION_NAME
#include <cereal/types/base_class.hpp> // for base_class
#include <cereal/types/vector.hpp> // for vector

#include <hibf/cereal/concepts.hpp> // for cereal_archive
#include <hibf/contrib/aligned_allocator.hpp> // for aligned_allocator
Expand Down Expand Up @@ -192,10 +193,6 @@ class interleaved_bloom_filter : private seqan::hibf::bit_vector
return h;
}

//!\brief Helper function to reduce code-duplication between emplace and emplace_exists.
template <bool check_exists>
inline auto emplace_impl(size_t const value, bin_index const bin) noexcept;

public:
class membership_agent_type; // documented upon definition below
template <std::integral value_t>
Expand All @@ -215,16 +212,18 @@ class interleaved_bloom_filter : private seqan::hibf::bit_vector
* \param bins_ The number of bins.
* \param size The bitvector size.
* \param funs The number of hash functions. Default 2. At least 1, at most 5.
* \param track_occupancy_ Whether to track the occupancy of the bins.
*
* \details
*
* ### Example
*
* \include test/snippet/ibf/interleaved_bloom_filter_constructor.cpp
*/
interleaved_bloom_filter(seqan::hibf::bin_count bins_,
seqan::hibf::bin_size size,
seqan::hibf::hash_function_count funs = seqan::hibf::hash_function_count{2u});
interleaved_bloom_filter(seqan::hibf::bin_count const bins_,
seqan::hibf::bin_size const size,
seqan::hibf::hash_function_count const funs = seqan::hibf::hash_function_count{2u},
bool const track_occupancy_ = false);

/*!\brief Construct an Interleaved Bloom Filter.
* \param configuration The seqan::hibf::config.
Expand All @@ -249,20 +248,14 @@ class interleaved_bloom_filter : private seqan::hibf::bit_vector
*
* \details
*
* If `track_occupancy` is set to `true`, the occupancy of the bin is tracked.
*
* ### Example
*
* \include test/snippet/ibf/interleaved_bloom_filter_emplace.cpp
*/
void emplace(size_t const value, bin_index const bin) noexcept;

/*!\brief Inserts a value into a specific bin and returns whether the value already existed.
* \param[in] value The raw numeric value to process.
* \param[in] bin The bin index to insert into.
* \returns `true` if the value already existed, `false` otherwise.
* \sa seqan::hibf::interleaved_bloom_filter::emplace
*/
[[nodiscard]] bool emplace_exists(size_t const value, bin_index const bin) noexcept;

/*!\brief Clears a specific bin.
* \param[in] bin The bin index to clear.
*
Expand Down Expand Up @@ -293,7 +286,7 @@ class interleaved_bloom_filter : private seqan::hibf::bit_vector
"The reference type of the range to clear must be seqan::hibf::bin_index.");
#ifndef NDEBUG
for (auto && bin : bin_range)
assert(bin.value < bins);
assert(bin.value < technical_bins);
#endif // NDEBUG

for (size_t offset = 0, i = 0; i < bin_size_; offset += technical_bins, ++i)
Expand Down Expand Up @@ -438,6 +431,17 @@ class interleaved_bloom_filter : private seqan::hibf::bit_vector
using base_t::data;
//!\}

/*!\brief Contains the number of unique values inserted into each bin.
* \details
* Only contains non-zero values if `track_occupancy` is true.
*
* A value is unique if inserting it into the IBF would set at least one previously unset bit.
*/
std::vector<size_t> occupancy{};

//!\brief Whether to track the occupancy of the bins.
bool track_occupancy{false};

/*!\cond DEV
* \brief The version of the HIBF.
*/
Expand All @@ -461,6 +465,8 @@ class interleaved_bloom_filter : private seqan::hibf::bit_vector
archive(bin_words);
archive(hash_funs);
archive(cereal::base_class<base_t>(this));
archive(occupancy);
archive(track_occupancy);
}
//!\endcond
};
Expand Down
14 changes: 7 additions & 7 deletions include/hibf/layout/hierarchical_binning.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,10 @@
#include <utility> // for pair
#include <vector> // for vector

#include <hibf/config.hpp> // for config
#include <hibf/layout/data_store.hpp> // for data_store
#include <hibf/platform.hpp> // for HIBF_WORKAROUND_GCC_BOGUS_MEMCPY
#include <hibf/config.hpp> // for config
#include <hibf/layout/data_store.hpp> // for data_store
#include <hibf/misc/subtract_empty_bins.hpp> // for subtract_empty_bins
#include <hibf/platform.hpp> // for HIBF_WORKAROUND_GCC_BOGUS_MEMCPY

namespace seqan::hibf::layout
{
Expand Down Expand Up @@ -68,10 +69,9 @@ class hierarchical_binning
config{config_},
data{std::addressof(data_)},
num_user_bins{data->positions.size()},
num_technical_bins{data->previous.empty() ? config.tmax : needed_technical_bins(num_user_bins)}
{
assert(data != nullptr);
}
num_technical_bins{data->previous.empty() ? subtract_empty_bins(config.tmax, config.empty_bin_fraction)
: needed_technical_bins(num_user_bins)}
{}

//!\brief Executes the hierarchical binning algorithm and layouts user bins into technical bins.
size_t execute();
Expand Down
33 changes: 33 additions & 0 deletions include/hibf/misc/subtract_empty_bins.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
// SPDX-FileCopyrightText: 2006-2024, Knut Reinert & Freie Universität Berlin
// SPDX-FileCopyrightText: 2016-2024, Knut Reinert & MPI für molekulare Genetik
// SPDX-License-Identifier: BSD-3-Clause

#pragma once

#include <algorithm>
#include <cassert>
#include <cstddef>
#include <limits>

#include <hibf/platform.hpp>

namespace seqan::hibf
{

/*!\brief Returns the number of technical bins available for use.
 * \param[in] tmax The total number of bins.
 * \param[in] fraction The fraction of the total number of bins that should be empty.
 * \returns `tmax` minus the number of empty bins.
 * \ingroup hibf
 * \details
 * The number of empty bins is `tmax * fraction`, truncated towards zero and then clamped to `[1, tmax - 2]`.
 * Hence, the result is always at least `2` whenever `tmax` is greater than `2`.
 *
 * `tmax` is returned unchanged if `tmax` is at most `2`, or if `fraction` is not greater than `0.0`
 * (this includes negative values and NaN).
 *
 * A `fraction` that is so large that `tmax * fraction` cannot be represented as `size_t` results in the maximum
 * number of empty bins, i.e., `2` is returned.
 * \sa https://godbolt.org/z/cMjbM39vj
 */
[[nodiscard]] constexpr size_t subtract_empty_bins(size_t const tmax, double const fraction) noexcept
{
    // There must be at least 2 technical bins available without empty bins.
    // Otherwise, there would only ever be one technical bin available.
    // `!(fraction > 0.0)` also catches negative values and NaN, for which no empty bins can be derived.
    if (!(fraction > 0.0) || tmax <= 2u)
        return tmax;

    size_t const max_empty_bins = tmax - 2u;
    double const requested_empty_bins = static_cast<double>(tmax) * fraction;

    // Converting a floating-point value to an integer type that cannot represent the truncated value is
    // undefined behaviour. Such a value exceeds the upper clamp bound anyway.
    if (requested_empty_bins >= static_cast<double>(std::numeric_limits<size_t>::max()))
        return tmax - max_empty_bins;

    size_t const number_of_empty_bins =
        std::clamp<size_t>(static_cast<size_t>(requested_empty_bins), 1u, max_empty_bins);
    return tmax - number_of_empty_bins;
}

} // namespace seqan::hibf
4 changes: 3 additions & 1 deletion src/build/construct_ibf.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,9 @@ seqan::hibf::interleaved_bloom_filter construct_ibf(robin_hood::unordered_flat_s
local_index_allocation_timer.start();
seqan::hibf::interleaved_bloom_filter ibf{bin_count,
bin_size,
seqan::hibf::hash_function_count{data.config.number_of_hash_functions}};
seqan::hibf::hash_function_count{data.config.number_of_hash_functions},
data.config.empty_bin_fraction > 0.0};
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
data.config.empty_bin_fraction > 0.0};
data.config.empty_bin_fraction > 0.0}; // track occupancy if handling empty bins


local_index_allocation_timer.stop();
data.index_allocation_timer += local_index_allocation_timer;

Expand Down
24 changes: 22 additions & 2 deletions src/build/insert_into_ibf.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,14 +34,34 @@ void insert_into_ibf(robin_hood::unordered_flat_set<uint64_t> const & kmers,

serial_timer local_fill_ibf_timer{};
local_fill_ibf_timer.start();
for (auto chunk : kmers | seqan::stl::views::chunk(chunk_size))
auto chunk_view = seqan::stl::views::chunk(kmers, chunk_size);
for (auto && chunk : chunk_view)
{
assert(chunk_number < number_of_bins);
seqan::hibf::bin_index const bin_idx{bin_index + chunk_number};
++chunk_number;
for (size_t const value : chunk)
for (auto && value : chunk)
ibf.emplace(value, bin_idx);
}

assert(chunk_view.size() <= number_of_bins);
// Edge case: If there are not enough k-mers to emplace at least one value into each bin, set the occupancy of
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
// Edge case: If there are not enough k-mers to emplace at least one value into each bin, set the occupancy of
// Small number edge case: If there are not enough k-mers to emplace at least one value into each bin, set the occupancy of

// the left over bins to 1.
// GCOVR_EXCL_START
if (ibf.track_occupancy && chunk_view.size() < number_of_bins)
{
size_t const diff = number_of_bins - chunk_view.size();
auto it = ibf.occupancy.begin() + bin_index + chunk_view.size();
assert(std::ranges::all_of(it,
it + diff,
[](size_t value)
{
return value == 0u;
}));
std::ranges::fill_n(it, diff, 1u);
}
// GCOVR_EXCL_STOP

local_fill_ibf_timer.stop();
fill_ibf_timer += local_fill_ibf_timer;
}
Expand Down
4 changes: 4 additions & 0 deletions src/config.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
#include <hibf/config.hpp> // for config
#include <hibf/layout/prefixes.hpp> // for meta_header, meta_hibf_config_end, meta_hibf_config_start
#include <hibf/misc/next_multiple_of_64.hpp> // for next_multiple_of_64
#include <hibf/misc/subtract_empty_bins.hpp> // for subtract_empty_bins

namespace seqan::hibf
{
Expand Down Expand Up @@ -113,6 +114,9 @@ void config::validate_and_set_defaults()
<< "anyway, so we increased your number of technical bins to " << tmax << ".\n";
}

if (empty_bin_fraction < 0.0 || empty_bin_fraction >= 1.0)
throw std::invalid_argument{"[HIBF CONFIG ERROR] config::empty_bin_fraction must be in [0.0,1.0)."};

if (alpha < 0.0)
throw std::invalid_argument{"[HIBF CONFIG ERROR] config::alpha must be positive."};

Expand Down
48 changes: 20 additions & 28 deletions src/interleaved_bloom_filter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,14 +28,15 @@ namespace seqan::hibf
# pragma GCC diagnostic ignored "-Wattributes"
#endif // HIBF_COMPILER_IS_GCC

interleaved_bloom_filter::interleaved_bloom_filter(seqan::hibf::bin_count bins_,
seqan::hibf::bin_size size,
seqan::hibf::hash_function_count funs)
interleaved_bloom_filter::interleaved_bloom_filter(seqan::hibf::bin_count const bins_,
seqan::hibf::bin_size const size,
seqan::hibf::hash_function_count const funs,
bool const track_occupancy_) :
bins{bins_.value},
bin_size_{size.value},
hash_funs{funs.value},
track_occupancy{track_occupancy_}
{
bins = bins_.value;
bin_size_ = size.value;
hash_funs = funs.value;

if (bins == 0)
throw std::logic_error{"The number of bins must be > 0."};
if (hash_funs == 0 || hash_funs > 5)
Expand All @@ -47,6 +48,7 @@ interleaved_bloom_filter::interleaved_bloom_filter(seqan::hibf::bin_count bins_,
bin_words = divide_and_ceil(bins, 64u);
technical_bins = bin_words * 64u;
resize(technical_bins * bin_size_);
occupancy.resize(technical_bins, 0u);
}

size_t find_biggest_bin(config const & configuration)
Expand Down Expand Up @@ -101,7 +103,8 @@ size_t max_bin_size(config & configuration, size_t const max_bin_elements)
interleaved_bloom_filter::interleaved_bloom_filter(config & configuration, size_t const max_bin_elements) :
interleaved_bloom_filter{seqan::hibf::bin_count{configuration.number_of_user_bins},
seqan::hibf::bin_size{max_bin_size(configuration, max_bin_elements)},
seqan::hibf::hash_function_count{configuration.number_of_hash_functions}}
seqan::hibf::hash_function_count{configuration.number_of_hash_functions},
configuration.empty_bin_fraction > 0.0}
{
size_t const chunk_size = std::clamp<size_t>(std::bit_ceil(bin_count() / configuration.threads), 8u, 64u);

Expand All @@ -112,39 +115,27 @@ interleaved_bloom_filter::interleaved_bloom_filter(config & configuration, size_
}
}

template <bool check_exists>
inline auto interleaved_bloom_filter::emplace_impl(size_t const value, bin_index const bin) noexcept
[[gnu::always_inline]] void interleaved_bloom_filter::emplace(size_t const value, bin_index const bin) noexcept
{
assert(bin.value < bins);

[[maybe_unused]] bool exists{true};
bool exists{track_occupancy};

for (size_t i = 0; i < hash_funs; ++i)
{
size_t idx = hash_and_fit(value, hash_seeds[i]);
idx += bin.value;
size_t const idx = hash_and_fit(value, hash_seeds[i]) + bin.value;
assert(idx < size());

// Constructing the reference twice for emplace_exists would impact performance.
// Constructing the reference twice for tracking occupancy would impact performance.
// No difference for emplace.
seqan::hibf::bit_vector::reference bit_reference{(*this)[idx]};
if constexpr (check_exists)
if (track_occupancy)
exists &= bit_reference;
bit_reference = 1;
bit_reference = true;
};

if constexpr (check_exists)
return exists;
};

[[gnu::always_inline]] void interleaved_bloom_filter::emplace(size_t const value, bin_index const bin) noexcept
{
return emplace_impl<false>(value, bin);
}

[[gnu::always_inline]] bool interleaved_bloom_filter::emplace_exists(size_t const value, bin_index const bin) noexcept
{
return emplace_impl<true>(value, bin);
if (track_occupancy && !exists)
++occupancy[bin.value];
}

void interleaved_bloom_filter::clear(bin_index const bin) noexcept
Expand Down Expand Up @@ -205,6 +196,7 @@ void interleaved_bloom_filter::increase_bin_number_to(seqan::hibf::bin_count con
bins = new_bins;
bin_words = new_bin_words;
technical_bins = new_technical_bins;
occupancy.resize(technical_bins, 0u);
}

[[gnu::always_inline]] bit_vector const &
Expand Down
Loading
Loading