From ad0b2a112052c5b768b40fa9f6f9261f29ea2db3 Mon Sep 17 00:00:00 2001 From: Enrico Seiler Date: Tue, 8 Oct 2024 20:01:42 +0200 Subject: [PATCH 1/3] [MISC] Use floats for estimation --- src/estimate.cpp | 29 ++++++++++++++--------------- 1 file changed, 14 insertions(+), 15 deletions(-) diff --git a/src/estimate.cpp b/src/estimate.cpp index da526b3..00208dc 100644 --- a/src/estimate.cpp +++ b/src/estimate.cpp @@ -30,7 +30,7 @@ void check_ibf(min_arguments const & args, IBFType const & ibf, std::vector & estimations_i, seqan3::dna4_vector const & seq, - std::vector & prev_counts, + std::vector & prev_counts, exp_t const & expressions, uint16_t const level, std::vector const fprs, @@ -40,7 +40,7 @@ void check_ibf(min_arguments const & args, static constexpr bool multiple_expressions = std::same_as>>; // Count minimisers in ibf of current level - std::vector counter(ibf.bin_count()); + std::vector counter(ibf.bin_count()); uint64_t minimiser_count{}; auto agent = ibf.membership_agent(); for (auto minHash : seqan3::views::minimiser_hash(seq, args.shape, args.w_size, args.s)) @@ -50,7 +50,7 @@ void check_ibf(min_arguments const & args, } // Defines where the median should be - double const minimiser_pos = minimiser_count / 2.0; + float const minimiser_pos = minimiser_count / 2.0; // Check every experiment by going over the number of bins in the ibf. for (size_t bin = 0; bin < counter.size(); ++bin) @@ -59,14 +59,13 @@ void check_ibf(min_arguments const & args, continue; // Correction by substracting the expected number of false positives - double const expected_false_positives = minimiser_count * fprs[bin]; - double const corrected_count = counter[bin] - expected_false_positives; - double const normalized_count = corrected_count / (1.0 - fprs[bin]); - - // TODO: Rounding? - // `normalized_count` could be ceiled - // `estimate` further down could be ceiled - counter[bin] = normalized_count < 0.0 ? 0u : normalized_count; + counter[bin] = [&]() + { + float const expected_false_positives = minimiser_count * fprs[bin]; + float const corrected_count = counter[bin] - expected_false_positives; + float const normalized_count = std::max(0.0, corrected_count / (1.0 - fprs[bin])); + return normalized_count; + }(); // Check if considering previously seen minimisers and minimisers found at current level equal to or are greater // than the minimiser_pow, which gives the median position. @@ -86,8 +85,8 @@ void check_ibf(min_arguments const & args, assert(minimiser_pos > prev_counts[bin]); // Should never fail. This would mean that prev_counts[bin] was enough by itself and we should already // have estimated on the previous level. - assert(counter[bin] > 0u); - double const normalized_minimiser_pos = (minimiser_pos - prev_counts[bin]) / counter[bin]; + assert(counter[bin] > 0.0); + float const normalized_minimiser_pos = (minimiser_pos - prev_counts[bin]) / counter[bin]; // Actually calculate estimation, in the else case level stands for the prev_expression if constexpr (multiple_expressions) @@ -139,9 +138,9 @@ void estimate(estimate_ibf_arguments & args, { std::vector ids; std::vector seqs; - std::vector counter; + std::vector counter; std::vector counter_est; - std::vector> prev_counts; + std::vector> prev_counts; uint64_t prev_expression; std::vector> estimations; std::vector> expressions; From 7900e3d059b7f69baf7dc1c5ba5f11e43c201995 Mon Sep 17 00:00:00 2001 From: Enrico Seiler Date: Tue, 8 Oct 2024 20:02:01 +0200 Subject: [PATCH 2/3] [MISC] Ceil when computing bin size --- src/ibf.cpp | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/src/ibf.cpp b/src/ibf.cpp index bb3483d..cc49d5a 100644 --- a/src/ibf.cpp +++ b/src/ibf.cpp @@ -813,6 +813,13 @@ void ibf_helper(std::vector const & minimiser_files, // ^^ why divide? --> estimate the number of minimisers } + auto divide_and_ceil = [](uint64_t const dividend, uint64_t const divisor) -> uint64_t + { + assert(dividend >= 1u || divisor >= 1u); + assert(divisor != 0u); + return (dividend + divisor - 1u) / divisor; + }; + // If set_expression_thresholds_samplewise is not set the expressions as determined by the first file are used for // all files. if constexpr (samplewise) @@ -821,9 +828,9 @@ void ibf_helper(std::vector const & minimiser_files, for (int c = 0; c < ibf_args.number_expression_thresholds - 1; c++) { diff = diff * 2; - sizes[i].push_back(filesize / diff); + sizes[i].push_back(divide_and_ceil(filesize, diff)); } - sizes[i].push_back(filesize / diff); + sizes[i].push_back(divide_and_ceil(filesize, diff)); } else if constexpr (minimiser_files_given) { @@ -839,10 +846,10 @@ void ibf_helper(std::vector const & minimiser_files, float diff{1}; for (int c = 0; c < ibf_args.number_expression_thresholds - 1; c++) { - diff = ibf_args.expression_thresholds[c + 1] / ibf_args.expression_thresholds[c]; - sizes[i].push_back(filesize / diff); + diff = ibf_args.expression_thresholds[c + 1] / static_cast(ibf_args.expression_thresholds[c]); + sizes[i].push_back(std::ceil(filesize / diff)); } - sizes[i].push_back(filesize / diff); + sizes[i].push_back(std::ceil(filesize / diff)); } } @@ -857,7 +864,7 @@ void ibf_helper(std::vector const & minimiser_files, size += sizes[i][j]; } - if (size == 0u) + if (size == num_files) { throw std::invalid_argument{ std::string("[Error]. The chosen expression threshold is not well picked. If you use the automatic ") From f78ba23be2869fb1bd5268217f0428f64d8cca19 Mon Sep 17 00:00:00 2001 From: Enrico Seiler Date: Tue, 8 Oct 2024 20:02:10 +0200 Subject: [PATCH 3/3] [MISC] Adjust tests --- test/api/estimate_test.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/api/estimate_test.cpp b/test/api/estimate_test.cpp index 4a11614..541dbb4 100644 --- a/test/api/estimate_test.cpp +++ b/test/api/estimate_test.cpp @@ -252,7 +252,7 @@ TEST_F(estimate_test, example) std::ifstream output_file("Single_expression.out"); std::string line; - std::string expected{"GeneA\t9\t32\t"}; + std::string expected{"GeneA\t10\t32\t"}; if (output_file.is_open()) { while (std::getline(output_file, line)) @@ -287,7 +287,7 @@ TEST_F(estimate_test, example_multiple_threads) std::ifstream output_file("Multiple_expression.out"); std::string line; - std::string expected{"GeneA\t9\t32\t"}; + std::string expected{"GeneA\t10\t32\t"}; if (output_file.is_open()) { while (std::getline(output_file, line))