Skip to content

Commit

Permalink
[FEATURE] Empty bins in layout
Browse files Browse the repository at this point in the history
  • Loading branch information
eseiler committed Dec 19, 2024
1 parent 6e89a81 commit 7c2b62e
Show file tree
Hide file tree
Showing 7 changed files with 181 additions and 13 deletions.
30 changes: 28 additions & 2 deletions include/hibf/config.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ namespace seqan::hibf
* | General | seqan::hibf::config::threads | 1 | [RECOMMENDED_TO_ADAPT] |
* | Layout | seqan::hibf::config::sketch_bits | 12 | |
* | Layout | seqan::hibf::config::tmax | 0 | 0 indicates unset |
* | Layout | seqan::hibf::config::empty_bin_fraction | 0.0 | Dynamic Layout |
* | Layout | seqan::hibf::config::max_rearrangement_ratio | 0.5 | |
* | Layout | seqan::hibf::config::alpha | 1.2 | |
* | Layout | seqan::hibf::config::disable_estimate_union | false | |
Expand Down Expand Up @@ -230,6 +231,23 @@ struct config
*/
size_t tmax{};

/*!\brief The fraction of technical bins (relative to `tmax`) that is kept empty in the layout.
*
* \note Do not set this option unless you are developing an application that requires empty technical bins.
*
* Certain applications, e.g., dynamic indices, require empty technical bins in the layout. This option allows you
* to specify the fraction of tmax that should be empty bins.
* The empty bins will be present in each IBF of the generated layout.
*
* For example, if `tmax` is `64` and `empty_bin_fraction` is `0.10`, then 6 bins will be empty, i.e., not
* designated to contain any data. The resulting layout will be very similar to a layout with `tmax` set to `58`
* and no empty bins.
*
* Value must be in range [0.0,1.0).
* Recommendation: default value (0.0). This option is not recommended for general use.
*/
double empty_bin_fraction{};

/*!\brief A scaling factor to influence the amount of merged bins produced by the layout algorithm.
*
* The layout algorithm optimizes the space consumption of the resulting HIBF, but currently has no means of
Expand Down Expand Up @@ -302,6 +320,7 @@ struct config
* * seqan::hibf::config::threads must be greater than `0`.
* * seqan::hibf::config::sketch_bits must be in `[5,32]`.
* * seqan::hibf::config::tmax must be at most `18446744073709551552`.
* * seqan::hibf::config::empty_bin_fraction must be in `[0.0,1.0)`.
* * seqan::hibf::config::alpha must be positive.
* * seqan::hibf::config::max_rearrangement_ratio must be in `[0.0,1.0]`.
*
Expand All @@ -324,6 +343,7 @@ struct config
threads == other.threads &&
sketch_bits == other.sketch_bits &&
tmax == other.tmax &&
empty_bin_fraction == other.empty_bin_fraction &&
alpha == other.alpha &&
max_rearrangement_ratio == other.max_rearrangement_ratio &&
disable_estimate_union == other.disable_estimate_union &&
Expand All @@ -334,11 +354,13 @@ struct config
private:
friend class cereal::access;

static constexpr uint32_t version{2};

template <typename archive_t>
void serialize(archive_t & archive)
{
uint32_t version{1};
archive(CEREAL_NVP(version));
uint32_t parsed_version{version};
archive(cereal::make_nvp("version", parsed_version));

archive(CEREAL_NVP(number_of_user_bins));
archive(CEREAL_NVP(number_of_hash_functions));
Expand All @@ -348,6 +370,10 @@ struct config

archive(CEREAL_NVP(sketch_bits));
archive(CEREAL_NVP(tmax));

if (parsed_version > 1u)
archive(CEREAL_NVP(empty_bin_fraction));

archive(CEREAL_NVP(alpha));
archive(CEREAL_NVP(max_rearrangement_ratio));
archive(CEREAL_NVP(disable_estimate_union));
Expand Down
14 changes: 7 additions & 7 deletions include/hibf/layout/hierarchical_binning.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,10 @@
#include <utility> // for pair
#include <vector> // for vector

#include <hibf/config.hpp> // for config
#include <hibf/layout/data_store.hpp> // for data_store
#include <hibf/platform.hpp> // for HIBF_WORKAROUND_GCC_BOGUS_MEMCPY
#include <hibf/config.hpp> // for config
#include <hibf/layout/data_store.hpp> // for data_store
#include <hibf/misc/subtract_empty_bins.hpp> // for subtract_empty_bins
#include <hibf/platform.hpp> // for HIBF_WORKAROUND_GCC_BOGUS_MEMCPY

namespace seqan::hibf::layout
{
Expand Down Expand Up @@ -68,10 +69,9 @@ class hierarchical_binning
config{config_},
data{std::addressof(data_)},
num_user_bins{data->positions.size()},
num_technical_bins{data->previous.empty() ? config.tmax : needed_technical_bins(num_user_bins)}
{
assert(data != nullptr);
}
num_technical_bins{data->previous.empty() ? subtract_empty_bins(config.tmax, config.empty_bin_fraction)
: needed_technical_bins(num_user_bins)}
{}

//!\brief Executes the hierarchical binning algorithm and layouts user bins into technical bins.
size_t execute();
Expand Down
33 changes: 33 additions & 0 deletions include/hibf/misc/subtract_empty_bins.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
// SPDX-FileCopyrightText: 2006-2024, Knut Reinert & Freie Universität Berlin
// SPDX-FileCopyrightText: 2016-2024, Knut Reinert & MPI für molekulare Genetik
// SPDX-License-Identifier: BSD-3-Clause

#pragma once

#include <algorithm>
#include <cassert>
#include <cstddef>

#include <hibf/platform.hpp>

namespace seqan::hibf
{

/*!\brief Returns the number of technical bins available for use.
 * \param[in] tmax The total number of bins.
 * \param[in] fraction The fraction of the total number of bins that should be empty.
 * \returns `tmax` minus the number of empty bins.
 * \ingroup hibf
 * \details
 * * If `fraction` is not greater than `0.0` (zero, negative, or NaN) or `tmax <= 2`, `tmax` is returned unchanged.
 * * Otherwise, the number of empty bins is `tmax * fraction` truncated towards zero and clamped to
 *   `[1, tmax - 2]`. Hence, at least one bin is empty and at least two bins remain available.
 *
 * The conversion from `double` to `size_t` is only performed when the value is representable. Products that are
 * too large for `size_t` (including infinity) are treated as "as many empty bins as possible".
 * \sa https://godbolt.org/z/cMjbM39vj
 */
[[nodiscard]] constexpr size_t subtract_empty_bins(size_t const tmax, double const fraction) noexcept
{
    // `!(fraction > 0.0)` is true for 0.0, -0.0, negative values and NaN: no empty bins are requested.
    // With `tmax <= 2`, reserving an empty bin would leave fewer than two bins available.
    if (!(fraction > 0.0) || tmax <= 2u)
        return tmax;

    // At least two technical bins must remain available.
    size_t const max_empty_bins = tmax - 2u;
    double const requested = static_cast<double>(tmax) * fraction;

    // Converting a double that does not fit into size_t is undefined behaviour.
    // Such a request exceeds `max_empty_bins` in any case, so it can be answered without converting.
    constexpr double size_limit = static_cast<double>(~size_t{0});
    if (requested >= size_limit)
        return tmax - max_empty_bins;

    size_t const number_of_empty_bins = std::clamp<size_t>(static_cast<size_t>(requested), 1u, max_empty_bins);
    return tmax - number_of_empty_bins;
}

} // namespace seqan::hibf
4 changes: 4 additions & 0 deletions src/config.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
#include <hibf/config.hpp> // for config
#include <hibf/layout/prefixes.hpp> // for meta_header, meta_hibf_config_end, meta_hibf_config_start
#include <hibf/misc/next_multiple_of_64.hpp> // for next_multiple_of_64
#include <hibf/misc/subtract_empty_bins.hpp> // for subtract_empty_bins

namespace seqan::hibf
{
Expand Down Expand Up @@ -113,6 +114,9 @@ void config::validate_and_set_defaults()
<< "anyway, so we increased your number of technical bins to " << tmax << ".\n";
}

if (empty_bin_fraction < 0.0 || empty_bin_fraction >= 1.0)
throw std::invalid_argument{"[HIBF CONFIG ERROR] config::empty_bin_fraction must be in [0.0,1.0)."};

if (alpha < 0.0)
throw std::invalid_argument{"[HIBF CONFIG ERROR] config::alpha must be positive."};

Expand Down
13 changes: 10 additions & 3 deletions src/layout/hierarchical_binning.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
#include <hibf/layout/simple_binning.hpp> // for simple_binning
#include <hibf/misc/divide_and_ceil.hpp> // for divide_and_ceil
#include <hibf/misc/next_multiple_of_64.hpp> // for next_multiple_of_64
#include <hibf/misc/subtract_empty_bins.hpp> // for subtract_empty_bins
#include <hibf/misc/timer.hpp> // for concurrent_timer
#include <hibf/platform.hpp> // for HIBF_WORKAROUND_GCC_BOGUS_MEMCPY
#include <hibf/sketch/hyperloglog.hpp> // for hyperloglog
Expand Down Expand Up @@ -79,7 +80,8 @@ size_t hierarchical_binning::execute()

// Returns the number of technical bins that may hold data when `requested_num_ub` user bins are requested:
// the result of next_multiple_of_64(requested_num_ub), capped at config.tmax, minus the bins that
// config.empty_bin_fraction reserves as empty.
[[nodiscard]] size_t hierarchical_binning::needed_technical_bins(size_t const requested_num_ub) const
{
    // Never use more technical bins than config.tmax allows.
    size_t const needed = std::min<size_t>(next_multiple_of_64(requested_num_ub), config.tmax);

    // Keep the configured share of those bins empty.
    return subtract_empty_bins(needed, config.empty_bin_fraction);
}

[[nodiscard]] size_t hierarchical_binning::max_merge_levels(size_t const num_ubs_in_merge) const
Expand Down Expand Up @@ -406,16 +408,21 @@ void hierarchical_binning::update_libf_data(data_store & libf_data, size_t const

// Computes the layout of the lower-level IBF described by `libf_data` and returns the id of its maximum
// technical bin (the value returned by the respective `execute()` call).
size_t hierarchical_binning::add_lower_level(data_store & libf_data) const
{
    size_t const number_of_user_bins = libf_data.positions.size();

    // now do the binning for the low-level IBF:
    if (number_of_user_bins > config.tmax)
    {
        // recursively call hierarchical binning if there are still too many UBs
        return hierarchical_binning{libf_data, config}.execute(); // return id of maximum technical bin
    }

    // use simple binning to distribute remaining UBs
    // Simple binning is not bound by config.tmax
    size_t const num_user_bins = next_multiple_of_64(number_of_user_bins);

    // Reserve the configured fraction of empty bins in this IBF as well.
    size_t const number_of_technical_bins = subtract_empty_bins(num_user_bins, config.empty_bin_fraction);
    return simple_binning{libf_data, number_of_technical_bins}.execute(); // return id of maximum technical bin
}

Expand Down
58 changes: 57 additions & 1 deletion test/unit/hibf/config_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -37,14 +37,15 @@ TEST(config_test, write_to)
std::string const expected_file{"@HIBF_CONFIG\n"
"@{\n"
"@ \"hibf_config\": {\n"
"@ \"version\": 1,\n"
"@ \"version\": 2,\n"
"@ \"number_of_user_bins\": 123456789,\n"
"@ \"number_of_hash_functions\": 4,\n"
"@ \"maximum_fpr\": 0.0001,\n"
"@ \"relaxed_fpr\": 0.3,\n"
"@ \"threads\": 31,\n"
"@ \"sketch_bits\": 8,\n"
"@ \"tmax\": 128,\n"
"@ \"empty_bin_fraction\": 0.0,\n"
"@ \"alpha\": 1.0,\n"
"@ \"max_rearrangement_ratio\": 0.333,\n"
"@ \"disable_estimate_union\": true,\n"
Expand All @@ -57,6 +58,45 @@ TEST(config_test, write_to)
}

TEST(config_test, read_from)
{
    // Serialized configuration in the current format (version 2), which contains "empty_bin_fraction".
    std::stringstream serialized_config{"@HIBF_CONFIG\n"
                                        "@{\n"
                                        "@ \"hibf_config\": {\n"
                                        "@ \"version\": 2,\n"
                                        "@ \"number_of_user_bins\": 123456789,\n"
                                        "@ \"number_of_hash_functions\": 4,\n"
                                        "@ \"maximum_fpr\": 0.0001,\n"
                                        "@ \"relaxed_fpr\": 0.3,\n"
                                        "@ \"threads\": 31,\n"
                                        "@ \"sketch_bits\": 8,\n"
                                        "@ \"tmax\": 128,\n"
                                        "@ \"empty_bin_fraction\": 0.5,\n"
                                        "@ \"alpha\": 1.0,\n"
                                        "@ \"max_rearrangement_ratio\": 0.333,\n"
                                        "@ \"disable_estimate_union\": true,\n"
                                        "@ \"disable_rearrangement\": false\n"
                                        "@ }\n"
                                        "@}\n"
                                        "@HIBF_CONFIG_END\n"};

    seqan::hibf::config parsed;
    parsed.read_from(serialized_config);

    // Every serialized member must be restored with the value given above.
    EXPECT_EQ(parsed.number_of_user_bins, 123456789);
    EXPECT_EQ(parsed.number_of_hash_functions, 4);
    EXPECT_EQ(parsed.maximum_fpr, 0.0001);
    EXPECT_EQ(parsed.relaxed_fpr, 0.3);
    EXPECT_EQ(parsed.threads, 31);
    EXPECT_EQ(parsed.sketch_bits, 8);
    EXPECT_EQ(parsed.tmax, 128);
    EXPECT_EQ(parsed.empty_bin_fraction, 0.5);
    EXPECT_EQ(parsed.alpha, 1.0);
    EXPECT_EQ(parsed.max_rearrangement_ratio, 0.333);
    EXPECT_EQ(parsed.disable_estimate_union, true);
    EXPECT_EQ(parsed.disable_rearrangement, false);
}

TEST(config_test, read_from_v1)
{
std::stringstream ss{"@HIBF_CONFIG\n"
"@{\n"
Expand Down Expand Up @@ -87,6 +127,7 @@ TEST(config_test, read_from)
EXPECT_EQ(configuration.threads, 31);
EXPECT_EQ(configuration.sketch_bits, 8);
EXPECT_EQ(configuration.tmax, 128);
EXPECT_EQ(configuration.empty_bin_fraction, 0.0);
EXPECT_EQ(configuration.alpha, 1.0);
EXPECT_EQ(configuration.max_rearrangement_ratio, 0.333);
EXPECT_EQ(configuration.disable_estimate_union, true);
Expand Down Expand Up @@ -293,6 +334,21 @@ TEST(config_test, validate_and_set_defaults)
"increased your number of technical bins to 64.\n");
}

// empty_bin_fraction must be in [0.0,1.0)
{
seqan::hibf::config configuration{.input_fn = dummy_input_fn,
.number_of_user_bins = 1u,
.empty_bin_fraction = -0.1};
EXPECT_THROW_MSG(configuration.validate_and_set_defaults(),
std::invalid_argument,
"[HIBF CONFIG ERROR] config::empty_bin_fraction must be in [0.0,1.0).");

configuration.empty_bin_fraction = 1.0;
EXPECT_THROW_MSG(configuration.validate_and_set_defaults(),
std::invalid_argument,
"[HIBF CONFIG ERROR] config::empty_bin_fraction must be in [0.0,1.0).");
}

// alpha must be positive
{
seqan::hibf::config configuration{.input_fn = dummy_input_fn, .number_of_user_bins = 1u, .alpha = -0.1};
Expand Down
42 changes: 42 additions & 0 deletions test/unit/hibf/layout/hierarchical_binning_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,48 @@ TEST(hierarchical_binning_test, small_example)
EXPECT_RANGE_EQ(hibf_layout.user_bins, expected_user_bins);
}

TEST(hierarchical_binning_test, small_example_with_empty_bins)
{
    seqan::hibf::config cfg;
    cfg.tmax = 5;
    cfg.empty_bin_fraction = 0.001;
    cfg.disable_estimate_union = true; // also disables rearrangement

    std::vector<size_t> counts{500, 1000, 500, 500, 500, 500, 500, 500};
    seqan::hibf::layout::layout computed_layout{};

    seqan::hibf::layout::data_store store{.hibf_layout = &computed_layout, .kmer_counts = &counts};

    store.fpr_correction =
        seqan::hibf::layout::compute_fpr_correction({.fpr = 0.05, .hash_count = 2, .t_max = cfg.tmax});
    store.relaxed_fpr_correction =
        seqan::hibf::layout::compute_relaxed_fpr_correction({.fpr = 0.05, .relaxed_fpr = 0.3, .hash_count = 2});

    seqan::hibf::layout::hierarchical_binning binning{store, cfg};
    EXPECT_EQ(binning.execute(), 3u); // #HIGH_LEVEL_IBF max_bin_id:3

    // Compared to small_example, the results should be almost identical:
    // * The max bins might differ because of the empty bin fraction.
    // * The layout structure should be the same, except for split bins differing by +-1.

    std::vector<seqan::hibf::layout::layout::max_bin> expected_max_bins{{{2}, 0}, {{3}, 43}};

    // clang-format off
    std::vector<seqan::hibf::layout::layout::user_bin> expected_user_bins{{{}, 0, 1, 7},
                                                                          {{}, 1, 1, 6},
                                                                          {{2}, 0, 22 - 1, 3},
                                                                          {{2}, 22 - 1, 21, 4},
                                                                          {{2}, 43 - 1, 21, 5},
                                                                          {{3}, 0, 42 + 1, 1},
                                                                          {{3}, 42 + 1, 11 - 1, 0},
                                                                          {{3}, 53, 11 - 1, 2}};
    // clang-format on

    EXPECT_RANGE_EQ(computed_layout.max_bins, expected_max_bins);
    EXPECT_RANGE_EQ(computed_layout.user_bins, expected_user_bins);
}

TEST(hierarchical_binning_test, another_example)
{
seqan::hibf::config config;
Expand Down

0 comments on commit 7c2b62e

Please sign in to comment.