Skip to content

Commit

Permalink
[MISC] Review
Browse files Browse the repository at this point in the history
  • Loading branch information
eseiler committed Oct 20, 2023
1 parent 6b7344d commit dfa8276
Show file tree
Hide file tree
Showing 5 changed files with 53 additions and 16 deletions.
21 changes: 13 additions & 8 deletions include/hibf/config.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -151,19 +151,21 @@ struct config

/*!\brief Merged bins can be allowed to have a higher FPR than split bins.
*
* Merged bins in an HIBF layout will always be followed by one or more lower level IBFs that will have split bins
* Merged bins in an HIBF layout will always be followed by one or more lower-level IBFs that will have split bins
* or single bins (split = 1) to recover the original user bins. Thus, the FPR of merged bins does not determine the
* maximum_false_positive_rate but is independent. Choosing a higher FPR for merged bins can lower the memory
* requirement but increases the runtime. Experiments show that the decrease in memory is significant while the
* the runtime suffers only slightly. The accuracy of the results is not affected by this parameter.
* seqan::hibf::config::maximum_false_positive_rate, but is independent. Choosing a higher FPR for merged bins can
* lower the memory requirement but increases the runtime. Experiments show that the decrease in memory is
* significant, while the runtime suffers only slightly. The accuracy of the results is not affected by this
* parameter.
*
* Value must be in range [0,1].
* Value must larger than maximum_false_positive_rate.
* Value must be equal to or larger than seqan::hibf::config::maximum_false_positive_rate.
* Recommendation: default value (0.3)
*
* Note: For each IBF there is a limit of how high the FPR of merged bins can be. Namely, the FPR for merged bins
* can never decrease the IBF size more than what is needed to ensure the maximum_false_positive_rate for split
* bins. This means that choosing even higher values for this parameter, at some point, has no effect anymore.
* Note: For each IBF there is a limit to how high the FPR of merged bins can be. Specifically, the FPR for merged
* bins can never decrease the IBF size more than what is needed to ensure the
* seqan::hibf::config::maximum_false_positive_rate for split bins. This means that, at some point, choosing even
* higher values for this parameter will have no effect anymore.
*
* \sa [Bloom Filter Calculator](https://hur.st/bloomfilter/).
*/
Expand Down Expand Up @@ -286,6 +288,9 @@ struct config
* Constrains:
* * seqan::hibf::config::number_of_hash_functions must be in `[1,5]`.
* * seqan::hibf::config::maximum_false_positive_rate must be in `[0.0,1.0]`.
* * seqan::hibf::config::allowed_merged_bin_false_positive_rate must be in `[0.0,1.0]`.
* * seqan::hibf::config::allowed_merged_bin_false_positive_rate must be equal to or larger than
* seqan::hibf::config::maximum_false_positive_rate.
* * seqan::hibf::config::threads must be greater than `0`.
* * seqan::hibf::config::sketch_bits must be in `[5,32]`.
* * seqan::hibf::config::tmax must be at most `18446744073709551552`.
Expand Down
12 changes: 6 additions & 6 deletions include/hibf/layout/hierarchical_binning.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,14 +32,14 @@ class hierarchical_binning
//!\brief The number of technical bins requested by the user.
size_t num_technical_bins{};

// this struct simplifies passing the parameters needed for tracking the maxium technical bin
//!\brief Simplifies passing the parameters needed for tracking the maximum technical bin.
struct top_level_max_tracker
{
size_t max_id{}; // the id of the technical bin with maximal size
size_t max_size{}; // the maximum technical bin size seen so far
size_t max_split_id{}; // the id of the split bin with maximal size (if any)
size_t max_split_size{}; // the maximum split bin size seen so far
bool there_is_a_split_bin{false};
size_t max_id{}; //!< The ID of the technical bin with maximal size.
size_t max_size{}; //!< The maximum technical bin size seen so far.
size_t max_split_id{}; //!< The ID of the split bin with maximal size (if any).
size_t max_split_size{}; //!< The maximum split bin size seen so far.
bool there_is_a_split_bin{false}; //!< Whether the maximum bin was a split bin at any point.
};

public:
Expand Down
4 changes: 2 additions & 2 deletions src/build/construct_ibf.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,8 @@ seqan::hibf::interleaved_bloom_filter construct_ibf(robin_hood::unordered_flat_s
bool is_root)
{
size_t const kmers_per_bin{static_cast<size_t>(std::ceil(static_cast<double>(kmers.size()) / number_of_bins))};
double const fpr = (ibf_node.children.empty() /* split */) ? data.config.maximum_false_positive_rate
: data.config.allowed_merged_bin_false_positive_rate;
double const fpr = (ibf_node.children.empty() /* is_split */) ? data.config.maximum_false_positive_rate
: data.config.allowed_merged_bin_false_positive_rate;
double const bin_bits{static_cast<double>(
bin_size_in_bits({.fpr = fpr, .hash_count = data.config.number_of_hash_functions, .elements = kmers_per_bin}))};
seqan::hibf::bin_size const bin_size{
Expand Down
8 changes: 8 additions & 0 deletions src/config.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,14 @@ void config::validate_and_set_defaults()
if (maximum_false_positive_rate < 0.0 || maximum_false_positive_rate > 1.0)
throw std::invalid_argument{"[HIBF CONFIG ERROR] config::maximum_false_positive_rate must be in [0.0,1.0]."};

if (allowed_merged_bin_false_positive_rate < 0.0 || allowed_merged_bin_false_positive_rate > 1.0)
throw std::invalid_argument{
"[HIBF CONFIG ERROR] config::allowed_merged_bin_false_positive_rate must be in [0.0,1.0]."};

if (allowed_merged_bin_false_positive_rate < maximum_false_positive_rate)
throw std::invalid_argument{"[HIBF CONFIG ERROR] config::allowed_merged_bin_false_positive_rate must be "
"greater than or equal to config::maximum_false_positive_rate."};

if (threads == 0u)
throw std::invalid_argument{"[HIBF CONFIG ERROR] config::threads must be greater than 0."};

Expand Down
24 changes: 24 additions & 0 deletions test/unit/hibf/config_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -191,6 +191,30 @@ TEST(config_test, validate_and_set_defaults)
"[HIBF CONFIG ERROR] config::maximum_false_positive_rate must be in [0.0,1.0].");
}

// allowed_merged_bin_false_positive_rate must be in [0.0,1.0]
{
seqan::hibf::config configuration{.input_fn = dummy_input_fn,
.number_of_user_bins = 1u,
.allowed_merged_bin_false_positive_rate = -0.1};
check_error_message(configuration,
"[HIBF CONFIG ERROR] config::allowed_merged_bin_false_positive_rate must be in [0.0,1.0].");

configuration.allowed_merged_bin_false_positive_rate = 1.1;
check_error_message(configuration,
"[HIBF CONFIG ERROR] config::allowed_merged_bin_false_positive_rate must be in [0.0,1.0].");
}

// allowed_merged_bin_false_positive_rate must equal to or greater than maximum_false_positive_rate
{
seqan::hibf::config configuration{.input_fn = dummy_input_fn,
.number_of_user_bins = 1u,
.maximum_false_positive_rate = 0.3,
.allowed_merged_bin_false_positive_rate = 0.2};
check_error_message(configuration,
"[HIBF CONFIG ERROR] config::allowed_merged_bin_false_positive_rate must be "
"greater than or equal to config::maximum_false_positive_rate.");
}

// threads cannot be 0
{
seqan::hibf::config configuration{.input_fn = dummy_input_fn, .number_of_user_bins = 1u, .threads = 0u};
Expand Down

0 comments on commit dfa8276

Please sign in to comment.