Skip to content

Commit

Permalink
[MISC] Separate kmer counting from sketches. (#217)
Browse files (browse the repository at this point in the history)
* [MISC] Separate kmer counting from sketches.

* [MISC] automatic linting

* fix test

* [MISC] automatic linting

* fix doc

* fix benchmark

* review updates

* fix test

---------

Co-authored-by: seqan-actions[bot] <[email protected]>
  • Loading branch information
smehringer and seqan-actions authored Aug 12, 2024
1 parent 5c570cc commit ed7fc71
Show file tree
Hide file tree
Showing 6 changed files with 15 additions and 47 deletions.
11 changes: 1 addition & 10 deletions include/hibf/sketch/compute_sketches.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,21 +17,12 @@ namespace seqan::hibf::sketch
/*!\brief Computes the sketches and stores them in the respective vectors for further use.
* \ingroup hibf_layout
* \param[in] config The configuration to compute the layout with.
* \param[in,out] kmer_counts The vector that will store the kmer counts (estimations).
* \param[in,out] hll_sketches The vector that will store the sketches.
*/
void compute_sketches(config const & config,
std::vector<size_t> & kmer_counts,
std::vector<sketch::hyperloglog> & hll_sketches);

//!\overload
void compute_sketches(config const & config,
std::vector<sketch::hyperloglog> & hll_sketches,
std::vector<sketch::minhashes> & minhash_sketches);
void compute_sketches(config const & config, std::vector<sketch::hyperloglog> & hll_sketches);

//!\overload
void compute_sketches(config const & config,
std::vector<size_t> & kmer_counts,
std::vector<sketch::hyperloglog> & hll_sketches,
std::vector<sketch::minhashes> & minhash_sketches);

Expand Down
9 changes: 6 additions & 3 deletions src/hierarchical_interleaved_bloom_filter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,8 @@
#include <hibf/misc/iota_vector.hpp>
#include <hibf/misc/timer.hpp> // for concurrent_timer
#include <hibf/sketch/compute_sketches.hpp> // for compute_sketches
#include <hibf/sketch/hyperloglog.hpp> // for hyperloglog
#include <hibf/sketch/estimate_kmer_counts.hpp>
#include <hibf/sketch/hyperloglog.hpp> // for hyperloglog

namespace seqan::hibf
{
Expand Down Expand Up @@ -204,10 +205,12 @@ hierarchical_interleaved_bloom_filter::hierarchical_interleaved_bloom_filter(con
{
configuration.validate_and_set_defaults();

std::vector<size_t> kmer_counts{};
std::vector<sketch::hyperloglog> sketches{};
std::vector<size_t> kmer_counts{};

layout_compute_sketches_timer.start();
sketch::compute_sketches(configuration, kmer_counts, sketches);
sketch::compute_sketches(configuration, sketches);
hibf::sketch::estimate_kmer_counts(sketches, kmer_counts);
layout_compute_sketches_timer.stop();

// If rearrangement is enabled, i.e. seqan::hibf::config::disable_rearrangement is false:
Expand Down
17 changes: 1 addition & 16 deletions src/sketch/compute_sketches.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,13 +25,10 @@
namespace seqan::hibf::sketch
{

void compute_sketches(config const & config,
std::vector<size_t> & kmer_counts,
std::vector<sketch::hyperloglog> & hll_sketches)
void compute_sketches(config const & config, std::vector<sketch::hyperloglog> & hll_sketches)
{
// compute hll_sketches
hll_sketches.resize(config.number_of_user_bins);
kmer_counts.resize(config.number_of_user_bins);

robin_hood::unordered_flat_set<uint64_t> kmers;
#pragma omp parallel for schedule(dynamic) num_threads(config.threads) private(kmers)
Expand All @@ -47,8 +44,6 @@ void compute_sketches(config const & config,

hll_sketches[i] = std::move(hll_sketch);
}

sketch::estimate_kmer_counts(hll_sketches, kmer_counts);
}

/*!\brief Encapsulates handling of too few kmers to compute minHash sketches.
Expand Down Expand Up @@ -164,14 +159,4 @@ void compute_sketches(config const & config,
too_few_kmers.check_and_throw();
}

void compute_sketches(config const & config,
std::vector<size_t> & kmer_counts,
std::vector<sketch::hyperloglog> & hll_sketches,
std::vector<sketch::minhashes> & minhash_sketches)
{
compute_sketches(config, hll_sketches, minhash_sketches);
kmer_counts.resize(config.number_of_user_bins);
sketch::estimate_kmer_counts(hll_sketches, kmer_counts);
}

} // namespace seqan::hibf::sketch
5 changes: 2 additions & 3 deletions test/performance/sketch/compute_sketches_benchmark.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,6 @@ void compute_sketches(benchmark::State & state)
it = i;
};

std::vector<size_t> kmer_counts;
[[maybe_unused]] std::vector<seqan::hibf::sketch::minhashes> minhash_sketches;
std::vector<seqan::hibf::sketch::hyperloglog> hyperloglog_sketches;

Expand All @@ -43,9 +42,9 @@ void compute_sketches(benchmark::State & state)
for (auto _ : state)
{
if constexpr (sketch_t == sketch::MinHashes)
seqan::hibf::sketch::compute_sketches(config, kmer_counts, hyperloglog_sketches, minhash_sketches);
seqan::hibf::sketch::compute_sketches(config, hyperloglog_sketches, minhash_sketches);
else
seqan::hibf::sketch::compute_sketches(config, kmer_counts, hyperloglog_sketches);
seqan::hibf::sketch::compute_sketches(config, hyperloglog_sketches);
}
}

Expand Down
6 changes: 4 additions & 2 deletions test/unit/hibf/layout/compute_layout_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,8 @@
#include <hibf/misc/iota_vector.hpp>
#include <hibf/misc/timer.hpp> // for concurrent_timer
#include <hibf/sketch/compute_sketches.hpp> // for compute_sketches
#include <hibf/sketch/hyperloglog.hpp> // for hyperloglog
#include <hibf/sketch/estimate_kmer_counts.hpp>
#include <hibf/sketch/hyperloglog.hpp> // for hyperloglog

TEST(compute_layout, dispatch)
{
Expand All @@ -33,7 +34,8 @@ TEST(compute_layout, dispatch)
std::vector<seqan::hibf::sketch::hyperloglog> sketches;
std::vector<size_t> kmer_counts;

seqan::hibf::sketch::compute_sketches(config, kmer_counts, sketches);
seqan::hibf::sketch::compute_sketches(config, sketches);
seqan::hibf::sketch::estimate_kmer_counts(sketches, kmer_counts);

auto layout1 = seqan::hibf::layout::compute_layout(config, kmer_counts, sketches);

Expand Down
14 changes: 1 addition & 13 deletions test/unit/hibf/sketch/compute_sketches_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -80,9 +80,8 @@ class compute_sketches_test : public ::testing::Test

TEST_F(compute_sketches_test, hyperloglog_and_kmer_counts)
{
seqan::hibf::sketch::compute_sketches(this->config, this->kmer_counts, this->hyperloglog_sketches);
seqan::hibf::sketch::compute_sketches(this->config, this->hyperloglog_sketches);

this->check_kmer_counts();
this->check_hyperloglog_sketches();
}

Expand All @@ -94,17 +93,6 @@ TEST_F(compute_sketches_test, with_minHash)
this->check_minhash_sketches();
}

TEST_F(compute_sketches_test, with_minHash_and_kmer_counts)
{
seqan::hibf::sketch::compute_sketches(this->config,
this->kmer_counts,
this->hyperloglog_sketches,
this->minhash_sketches);

this->check_hyperloglog_sketches();
this->check_minhash_sketches();
}

TEST_F(compute_sketches_test, too_few_hashes)
{
this->config.number_of_user_bins = 1;
Expand Down

0 comments on commit ed7fc71

Please sign in to comment.