From dfa82769704ed69430f9964007c30f5476c39966 Mon Sep 17 00:00:00 2001 From: Enrico Seiler Date: Fri, 20 Oct 2023 14:34:38 +0200 Subject: [PATCH] [MISC] Review --- include/hibf/config.hpp | 21 ++++++++++------- include/hibf/layout/hierarchical_binning.hpp | 12 +++++----- src/build/construct_ibf.cpp | 4 ++-- src/config.cpp | 8 +++++++ test/unit/hibf/config_test.cpp | 24 ++++++++++++++++++++ 5 files changed, 53 insertions(+), 16 deletions(-) diff --git a/include/hibf/config.hpp b/include/hibf/config.hpp index 45bc67e2..fb702ff4 100644 --- a/include/hibf/config.hpp +++ b/include/hibf/config.hpp @@ -151,19 +151,21 @@ struct config /*!\brief Merged bins can be allowed to have a higher FPR than split bins. * - * Merged bins in an HIBF layout will always be followed by one or more lower level IBFs that will have split bins + * Merged bins in an HIBF layout will always be followed by one or more lower-level IBFs that will have split bins * or single bins (split = 1) to recover the original user bins. Thus, the FPR of merged bins does not determine the - * maximum_false_positive_rate but is independent. Choosing a higher FPR for merged bins can lower the memory - * requirement but increases the runtime. Experiments show that the decrease in memory is significant while the - * the runtime suffers only slightly. The accuracy of the results is not affected by this parameter. + * seqan::hibf::config::maximum_false_positive_rate, but is independent. Choosing a higher FPR for merged bins can + * lower the memory requirement but increases the runtime. Experiments show that the decrease in memory is + * significant, while the runtime suffers only slightly. The accuracy of the results is not affected by this + * parameter. * * Value must be in range [0,1]. - * Value must larger than maximum_false_positive_rate. + * Value must be equal to or larger than seqan::hibf::config::maximum_false_positive_rate. * Recommendation: default value (0.3) * - * Note: For each IBF there is a limit of how high the FPR of merged bins can be. Namely, the FPR for merged bins - * can never decrease the IBF size more than what is needed to ensure the maximum_false_positive_rate for split - * bins. This means that choosing even higher values for this parameter, at some point, has no effect anymore. + * Note: For each IBF there is a limit to how high the FPR of merged bins can be. Specifically, the FPR for merged + * bins can never decrease the IBF size more than what is needed to ensure the + * seqan::hibf::config::maximum_false_positive_rate for split bins. This means that, at some point, choosing even + * higher values for this parameter will have no effect anymore. * * \sa [Bloom Filter Calculator](https://hur.st/bloomfilter/). */ @@ -286,6 +288,9 @@ struct config * Constrains: * * seqan::hibf::config::number_of_hash_functions must be in `[1,5]`. * * seqan::hibf::config::maximum_false_positive_rate must be in `[0.0,1.0]`. + * * seqan::hibf::config::allowed_merged_bin_false_positive_rate must be in `[0.0,1.0]`. + * * seqan::hibf::config::allowed_merged_bin_false_positive_rate must be equal to or larger than + * seqan::hibf::config::maximum_false_positive_rate. * * seqan::hibf::config::threads must be greater than `0`. * * seqan::hibf::config::sketch_bits must be in `[5,32]`. * * seqan::hibf::config::tmax must be at most `18446744073709551552`. diff --git a/include/hibf/layout/hierarchical_binning.hpp b/include/hibf/layout/hierarchical_binning.hpp index 7225199f..36d0b937 100644 --- a/include/hibf/layout/hierarchical_binning.hpp +++ b/include/hibf/layout/hierarchical_binning.hpp @@ -32,14 +32,14 @@ class hierarchical_binning //!\brief The number of technical bins requested by the user. size_t num_technical_bins{}; - // this struct simplifies passing the parameters needed for tracking the maxium technical bin + //!\brief Simplifies passing the parameters needed for tracking the maximum technical bin. struct top_level_max_tracker { - size_t max_id{}; // the id of the technical bin with maximal size - size_t max_size{}; // the maximum technical bin size seen so far - size_t max_split_id{}; // the id of the split bin with maximal size (if any) - size_t max_split_size{}; // the maximum split bin size seen so far - bool there_is_a_split_bin{false}; + size_t max_id{}; //!< The ID of the technical bin with maximal size. + size_t max_size{}; //!< The maximum technical bin size seen so far. + size_t max_split_id{}; //!< The ID of the split bin with maximal size (if any). + size_t max_split_size{}; //!< The maximum split bin size seen so far. + bool there_is_a_split_bin{false}; //!< Whether the maximum bin was a split bin at any point. }; public: diff --git a/src/build/construct_ibf.cpp b/src/build/construct_ibf.cpp index cd594861..0358222b 100644 --- a/src/build/construct_ibf.cpp +++ b/src/build/construct_ibf.cpp @@ -29,8 +29,8 @@ seqan::hibf::interleaved_bloom_filter construct_ibf(robin_hood::unordered_flat_s bool is_root) { size_t const kmers_per_bin{static_cast(std::ceil(static_cast(kmers.size()) / number_of_bins))}; - double const fpr = (ibf_node.children.empty() /* split */) ? data.config.maximum_false_positive_rate - : data.config.allowed_merged_bin_false_positive_rate; + double const fpr = (ibf_node.children.empty() /* is_split */) ? data.config.maximum_false_positive_rate + : data.config.allowed_merged_bin_false_positive_rate; double const bin_bits{static_cast( bin_size_in_bits({.fpr = fpr, .hash_count = data.config.number_of_hash_functions, .elements = kmers_per_bin}))}; seqan::hibf::bin_size const bin_size{ diff --git a/src/config.cpp b/src/config.cpp index c7c90007..01b5fd91 100644 --- a/src/config.cpp +++ b/src/config.cpp @@ -75,6 +75,14 @@ void config::validate_and_set_defaults() if (maximum_false_positive_rate < 0.0 || maximum_false_positive_rate > 1.0) throw std::invalid_argument{"[HIBF CONFIG ERROR] config::maximum_false_positive_rate must be in [0.0,1.0]."}; + if (allowed_merged_bin_false_positive_rate < 0.0 || allowed_merged_bin_false_positive_rate > 1.0) + throw std::invalid_argument{ + "[HIBF CONFIG ERROR] config::allowed_merged_bin_false_positive_rate must be in [0.0,1.0]."}; + + if (allowed_merged_bin_false_positive_rate < maximum_false_positive_rate) + throw std::invalid_argument{"[HIBF CONFIG ERROR] config::allowed_merged_bin_false_positive_rate must be " + "greater than or equal to config::maximum_false_positive_rate."}; + if (threads == 0u) throw std::invalid_argument{"[HIBF CONFIG ERROR] config::threads must be greater than 0."}; diff --git a/test/unit/hibf/config_test.cpp b/test/unit/hibf/config_test.cpp index 2bc1f215..4759e04f 100644 --- a/test/unit/hibf/config_test.cpp +++ b/test/unit/hibf/config_test.cpp @@ -191,6 +191,30 @@ TEST(config_test, validate_and_set_defaults) "[HIBF CONFIG ERROR] config::maximum_false_positive_rate must be in [0.0,1.0]."); } + // allowed_merged_bin_false_positive_rate must be in [0.0,1.0] + { + seqan::hibf::config configuration{.input_fn = dummy_input_fn, + .number_of_user_bins = 1u, + .allowed_merged_bin_false_positive_rate = -0.1}; + check_error_message(configuration, + "[HIBF CONFIG ERROR] config::allowed_merged_bin_false_positive_rate must be in [0.0,1.0]."); + + configuration.allowed_merged_bin_false_positive_rate = 1.1; + check_error_message(configuration, + "[HIBF CONFIG ERROR] config::allowed_merged_bin_false_positive_rate must be in [0.0,1.0]."); + } + + // allowed_merged_bin_false_positive_rate must equal to or greater than maximum_false_positive_rate + { + seqan::hibf::config configuration{.input_fn = dummy_input_fn, + .number_of_user_bins = 1u, + .maximum_false_positive_rate = 0.3, + .allowed_merged_bin_false_positive_rate = 0.2}; + check_error_message(configuration, + "[HIBF CONFIG ERROR] config::allowed_merged_bin_false_positive_rate must be " + "greater than or equal to config::maximum_false_positive_rate."); + } + // threads cannot be 0 { seqan::hibf::config configuration{.input_fn = dummy_input_fn, .number_of_user_bins = 1u, .threads = 0u};