From ed7fc7126375f350d886bb09daa14f26b2b9d356 Mon Sep 17 00:00:00 2001 From: Svenja Mehringer Date: Mon, 12 Aug 2024 13:34:42 +0200 Subject: [PATCH] [MISC] Separate kmer counting from sketches. (#217) * [MISC] Separate kmer counting from sketches. * [MISC] automatic linting * fix test * [MISC] automatic linting * fix doc * fix benchmark * review updates * fix test --------- Co-authored-by: seqan-actions[bot] --- include/hibf/sketch/compute_sketches.hpp | 11 +---------- src/hierarchical_interleaved_bloom_filter.cpp | 9 ++++++--- src/sketch/compute_sketches.cpp | 17 +---------------- .../sketch/compute_sketches_benchmark.cpp | 5 ++--- test/unit/hibf/layout/compute_layout_test.cpp | 6 ++++-- test/unit/hibf/sketch/compute_sketches_test.cpp | 14 +------------- 6 files changed, 15 insertions(+), 47 deletions(-) diff --git a/include/hibf/sketch/compute_sketches.hpp b/include/hibf/sketch/compute_sketches.hpp index 65d2b747..3e8d972d 100644 --- a/include/hibf/sketch/compute_sketches.hpp +++ b/include/hibf/sketch/compute_sketches.hpp @@ -17,21 +17,12 @@ namespace seqan::hibf::sketch /*!\brief Computes the kmer_counts and sketches and stores them in the respective vectors for further use. * \ingroup hibf_layout * \param[in] config The configuration to compute the layout with. - * \param[in,out] kmer_counts The vector that will store the kmer counts (estimations). * \param[in,out] hll_sketches The vector that will store the sketches. */ -void compute_sketches(config const & config, - std::vector & kmer_counts, - std::vector & hll_sketches); - -//!\overload -void compute_sketches(config const & config, - std::vector & hll_sketches, - std::vector & minhash_sketches); +void compute_sketches(config const & config, std::vector & hll_sketches); //!\overload void compute_sketches(config const & config, - std::vector & kmer_counts, std::vector & hll_sketches, std::vector & minhash_sketches); diff --git a/src/hierarchical_interleaved_bloom_filter.cpp b/src/hierarchical_interleaved_bloom_filter.cpp index 29fa390b..d7aad4f0 100644 --- a/src/hierarchical_interleaved_bloom_filter.cpp +++ b/src/hierarchical_interleaved_bloom_filter.cpp @@ -32,7 +32,8 @@ #include #include // for concurrent_timer #include // for compute_sketches -#include // for hyperloglog +#include +#include // for hyperloglog namespace seqan::hibf { @@ -204,10 +205,12 @@ hierarchical_interleaved_bloom_filter::hierarchical_interleaved_bloom_filter(con { configuration.validate_and_set_defaults(); - std::vector kmer_counts{}; std::vector sketches{}; + std::vector kmer_counts{}; + layout_compute_sketches_timer.start(); - sketch::compute_sketches(configuration, kmer_counts, sketches); + sketch::compute_sketches(configuration, sketches); + hibf::sketch::estimate_kmer_counts(sketches, kmer_counts); layout_compute_sketches_timer.stop(); // If rearrangement is enabled, i.e. seqan::hibf::config::disable_rearrangement is false: diff --git a/src/sketch/compute_sketches.cpp b/src/sketch/compute_sketches.cpp index aade611c..b79f94b8 100644 --- a/src/sketch/compute_sketches.cpp +++ b/src/sketch/compute_sketches.cpp @@ -25,13 +25,10 @@ namespace seqan::hibf::sketch { -void compute_sketches(config const & config, - std::vector & kmer_counts, - std::vector & hll_sketches) +void compute_sketches(config const & config, std::vector & hll_sketches) { // compute hll_sketches hll_sketches.resize(config.number_of_user_bins); - kmer_counts.resize(config.number_of_user_bins); robin_hood::unordered_flat_set kmers; #pragma omp parallel for schedule(dynamic) num_threads(config.threads) private(kmers) @@ -47,8 +44,6 @@ void compute_sketches(config const & config, hll_sketches[i] = std::move(hll_sketch); } - - sketch::estimate_kmer_counts(hll_sketches, kmer_counts); } /*!\brief Encapsulates handling of too few kmers to compute minHash sketches. @@ -164,14 +159,4 @@ void compute_sketches(config const & config, too_few_kmers.check_and_throw(); } -void compute_sketches(config const & config, - std::vector & kmer_counts, - std::vector & hll_sketches, - std::vector & minhash_sketches) -{ - compute_sketches(config, hll_sketches, minhash_sketches); - kmer_counts.resize(config.number_of_user_bins); - sketch::estimate_kmer_counts(hll_sketches, kmer_counts); -} - } // namespace seqan::hibf::sketch diff --git a/test/performance/sketch/compute_sketches_benchmark.cpp b/test/performance/sketch/compute_sketches_benchmark.cpp index 5260e7de..7e6c2358 100644 --- a/test/performance/sketch/compute_sketches_benchmark.cpp +++ b/test/performance/sketch/compute_sketches_benchmark.cpp @@ -31,7 +31,6 @@ void compute_sketches(benchmark::State & state) it = i; }; - std::vector kmer_counts; [[maybe_unused]] std::vector minhash_sketches; std::vector hyperloglog_sketches; @@ -43,9 +42,9 @@ void compute_sketches(benchmark::State & state) for (auto _ : state) { if constexpr (sketch_t == sketch::MinHashes) - seqan::hibf::sketch::compute_sketches(config, kmer_counts, hyperloglog_sketches, minhash_sketches); + seqan::hibf::sketch::compute_sketches(config, hyperloglog_sketches, minhash_sketches); else - seqan::hibf::sketch::compute_sketches(config, kmer_counts, hyperloglog_sketches); + seqan::hibf::sketch::compute_sketches(config, hyperloglog_sketches); } } diff --git a/test/unit/hibf/layout/compute_layout_test.cpp b/test/unit/hibf/layout/compute_layout_test.cpp index d1f439a5..17a2c9cf 100644 --- a/test/unit/hibf/layout/compute_layout_test.cpp +++ b/test/unit/hibf/layout/compute_layout_test.cpp @@ -14,7 +14,8 @@ #include #include // for concurrent_timer #include // for compute_sketches -#include // for hyperloglog +#include +#include // for hyperloglog TEST(compute_layout, dispatch) { @@ -33,7 +34,8 @@ TEST(compute_layout, dispatch) std::vector sketches; std::vector kmer_counts; - seqan::hibf::sketch::compute_sketches(config, kmer_counts, sketches); + seqan::hibf::sketch::compute_sketches(config, sketches); + seqan::hibf::sketch::estimate_kmer_counts(sketches, kmer_counts); auto layout1 = seqan::hibf::layout::compute_layout(config, kmer_counts, sketches); diff --git a/test/unit/hibf/sketch/compute_sketches_test.cpp b/test/unit/hibf/sketch/compute_sketches_test.cpp index c5341481..065362c4 100644 --- a/test/unit/hibf/sketch/compute_sketches_test.cpp +++ b/test/unit/hibf/sketch/compute_sketches_test.cpp @@ -80,9 +80,8 @@ class compute_sketches_test : public ::testing::Test TEST_F(compute_sketches_test, hyperloglog_and_kmer_counts) { - seqan::hibf::sketch::compute_sketches(this->config, this->kmer_counts, this->hyperloglog_sketches); + seqan::hibf::sketch::compute_sketches(this->config, this->hyperloglog_sketches); - this->check_kmer_counts(); this->check_hyperloglog_sketches(); } @@ -94,17 +93,6 @@ TEST_F(compute_sketches_test, with_minHash) this->check_minhash_sketches(); } -TEST_F(compute_sketches_test, with_minHash_and_kmer_counts) -{ - seqan::hibf::sketch::compute_sketches(this->config, - this->kmer_counts, - this->hyperloglog_sketches, - this->minhash_sketches); - - this->check_hyperloglog_sketches(); - this->check_minhash_sketches(); -} - TEST_F(compute_sketches_test, too_few_hashes) { this->config.number_of_user_bins = 1;