From 82e9039268622e7063cb05d886de7e9a81575956 Mon Sep 17 00:00:00 2001 From: Enrico Seiler Date: Fri, 12 Jul 2024 17:49:10 +0200 Subject: [PATCH] review --- include/hibf/sketch/minHashes.hpp | 67 +++++---- test/include/hibf/test/expect_range_eq.hpp | 2 +- test/performance/sketch/CMakeLists.txt | 1 + .../sketch/compute_sketches_benchmark.cpp | 49 +++++++ .../hibf/sketch/compute_sketches_test.cpp | 131 +++++++++--------- test/unit/hibf/sketch/minHashes_test.cpp | 83 ++++++----- 6 files changed, 191 insertions(+), 142 deletions(-) create mode 100644 test/performance/sketch/compute_sketches_benchmark.cpp diff --git a/include/hibf/sketch/minHashes.hpp b/include/hibf/sketch/minHashes.hpp index 07981535..64f16500 100644 --- a/include/hibf/sketch/minHashes.hpp +++ b/include/hibf/sketch/minHashes.hpp @@ -1,5 +1,5 @@ -// SPDX-FileCopyrightText: 2006-2023, Knut Reinert & Freie Universität Berlin -// SPDX-FileCopyrightText: 2016-2023, Knut Reinert & MPI für molekulare Genetik +// SPDX-FileCopyrightText: 2006-2024, Knut Reinert & Freie Universität Berlin +// SPDX-FileCopyrightText: 2016-2024, Knut Reinert & MPI für molekulare Genetik // SPDX-License-Identifier: BSD-3-Clause /*!\file @@ -7,6 +7,7 @@ * \brief Provides seqan::hibf::sketch::minHashes. */ +#include #include #include @@ -36,26 +37,24 @@ struct minHashes ~minHashes() = default; //!< Defaulted. //!\brief construct from a vector of the smallest values in a set (sorted ascending). - minHashes(std::vector const & smalles_values) + minHashes(std::vector const & smallest_values) { table.resize(num_sketches); - for (size_t i = 0; i < num_sketches; ++i) - { - table[i].reserve(sketch_size); - } + for (auto & elem : table) + elem.reserve(sketch_size); - for (uint64_t const hash : smalles_values) + for (uint64_t const hash : smallest_values) { - uint64_t const register_id{hash & register_id_mask}; - if (table[register_id].size() < sketch_size) - table[register_id].push_back(hash >> 4); + auto & hash_table = table[hash & register_id_mask]; + if (hash_table.size() < sketch_size) + hash_table.push_back(hash >> 4); } } - static inline uint64_t const register_id_mask{15}; /// ...00001111 - static inline size_t const num_sketches{16}; - static inline size_t const sketch_size{40}; + static constexpr uint64_t register_id_mask{15}; /// ...00001111 + static constexpr size_t num_sketches{16}; + static constexpr size_t sketch_size{40}; //!\brief A table of sketches. For LSH we need multiple sketches, stored in a table. std::vector> table{}; @@ -63,37 +62,37 @@ struct minHashes //!\brief Checks whether the minHash table is completely filled bool is_valid() const { - bool result{table.size() == num_sketches}; - for (auto const & minHash_sketch : table) - { - if (minHash_sketch.size() < sketch_size) - result = false; - } - return result; + return table.size() == num_sketches + && std::ranges::all_of(table, + [](auto const & minHash_sketch) + { + return minHash_sketch.size() == sketch_size; + }); } //!\brief Adds more minhash values to an existing but incomplete table. - void fill_up_sketches(std::span const & more_smalles_values) + void fill_up_sketches(std::span const & more_smallest_values) { - for (uint64_t const hash : more_smalles_values) + for (uint64_t const hash : more_smallest_values) { - uint64_t const register_id{hash & register_id_mask}; - assert(table[register_id].find(hash) == table[register_id].end()); // hashes should be unique - if (table[register_id].size() < sketch_size) - table[register_id].push_back(hash >> 4); + auto & hash_table = table[hash & register_id_mask]; + assert(std::ranges::find(hash_table, hash) == hash_table.end()); // hashes should be unique + if (hash_table.size() < sketch_size) + hash_table.push_back(hash >> 4); } } //!\brief Miscallenious function to update a heap with a new element, when preparing a heap for minHash sketches static void update_heap_with(std::vector & heap, uint64_t const k_hash) { - if (k_hash < heap[0]) // 0 because it is a max heap - { - // we do not need a guard (hash table) to check for duplications because `kmers` is already a set - std::pop_heap(heap.begin(), heap.end()); // max elements move to end of vector - heap.back() = k_hash; // replace last elements instead of physically popping and pushing - std::push_heap(heap.begin(), heap.end()); // last elements is rearranged in the heap to be pushed - } + // Do nothing if k_hash is bigger than the current biggest element in the (max) heap. + if (k_hash >= heap[0]) + return; + + // we do not need a guard (hash table) to check for duplications because `kmers` is already a set + std::ranges::pop_heap(heap); // max elements move to end of vector + heap.back() = k_hash; // replace last elements instead of physically popping and pushing + std::ranges::push_heap(heap); // last elements is rearranged in the heap to be pushed } }; diff --git a/test/include/hibf/test/expect_range_eq.hpp b/test/include/hibf/test/expect_range_eq.hpp index 13753b0c..91444268 100644 --- a/test/include/hibf/test/expect_range_eq.hpp +++ b/test/include/hibf/test/expect_range_eq.hpp @@ -42,7 +42,7 @@ void PrintTo(t const & value, std::ostream * out) namespace seqan::hibf::test { -#define EXPECT_RANGE_EQ(val1, val2) EXPECT_PRED_FORMAT2(::seqan::hibf::test::expect_range_eq{}, val1, val2); +#define EXPECT_RANGE_EQ(val1, val2) EXPECT_PRED_FORMAT2(::seqan::hibf::test::expect_range_eq{}, val1, val2) struct expect_range_eq { diff --git a/test/performance/sketch/CMakeLists.txt b/test/performance/sketch/CMakeLists.txt index 3e40ea3b..abe6afbe 100644 --- a/test/performance/sketch/CMakeLists.txt +++ b/test/performance/sketch/CMakeLists.txt @@ -2,4 +2,5 @@ # SPDX-FileCopyrightText: 2016-2024, Knut Reinert & MPI für molekulare Genetik # SPDX-License-Identifier: BSD-3-Clause +hibf_benchmark (compute_sketches_benchmark.cpp) hibf_benchmark (hyperloglog_benchmark.cpp) diff --git a/test/performance/sketch/compute_sketches_benchmark.cpp b/test/performance/sketch/compute_sketches_benchmark.cpp new file mode 100644 index 00000000..fda6a741 --- /dev/null +++ b/test/performance/sketch/compute_sketches_benchmark.cpp @@ -0,0 +1,49 @@ +// SPDX-FileCopyrightText: 2006-2024, Knut Reinert & Freie Universität Berlin +// SPDX-FileCopyrightText: 2016-2024, Knut Reinert & MPI für molekulare Genetik +// SPDX-License-Identifier: BSD-3-Clause + +#include + +#include + +enum class sketch : uint8_t +{ + Hyperloglog, + MinHashes +}; + +template +void compute_sketches(benchmark::State & state) +{ + auto create_hashes = [&](size_t const ub_id, seqan::hibf::insert_iterator it) + { + // 0 = [0, 10000] + // 1 = [10000, 20000] + // 1 = [20000, 30000] + for (size_t i = ub_id * 10000; i < (ub_id + 1) * 10000; ++i) + it = i; + }; + + std::vector kmer_counts; + [[maybe_unused]] std::vector minHash_sketches; + std::vector hyperloglog_sketches; + + seqan::hibf::config config{}; + config.number_of_user_bins = 16; + config.input_fn = create_hashes; + config.sketch_bits = 12; + + for (auto _ : state) + { + if constexpr (sketch_t == sketch::MinHashes) + seqan::hibf::sketch::compute_sketches_with_minhash(config, + kmer_counts, + hyperloglog_sketches, + minHash_sketches); + else + seqan::hibf::sketch::compute_sketches(config, kmer_counts, hyperloglog_sketches); + } +} + +BENCHMARK_TEMPLATE(compute_sketches, sketch::Hyperloglog); +BENCHMARK_TEMPLATE(compute_sketches, sketch::MinHashes); diff --git a/test/unit/hibf/sketch/compute_sketches_test.cpp b/test/unit/hibf/sketch/compute_sketches_test.cpp index 9d26419d..bb79c41e 100644 --- a/test/unit/hibf/sketch/compute_sketches_test.cpp +++ b/test/unit/hibf/sketch/compute_sketches_test.cpp @@ -1,5 +1,5 @@ -// SPDX-FileCopyrightText: 2006-2023, Knut Reinert & Freie Universität Berlin -// SPDX-FileCopyrightText: 2016-2023, Knut Reinert & MPI für molekulare Genetik +// SPDX-FileCopyrightText: 2006-2024, Knut Reinert & Freie Universität Berlin +// SPDX-FileCopyrightText: 2016-2024, Knut Reinert & MPI für molekulare Genetik // SPDX-License-Identifier: BSD-3-Clause #include // for Test, Message, TestPartResult, EXPECT_EQ, TestInfo, ASSERT_EQ @@ -10,80 +10,83 @@ #include // for allocator, vector #include +#include // for expect_range_eq, EXPECT_RANGE_EQ -TEST(compute_sketches_test, hyperloglog_and_kmer_counts) +class compute_sketches_test : public ::testing::Test { - auto create_hashes = [&](size_t const ub_id, seqan::hibf::insert_iterator it) - { - // 0 = [0, 10000] - // 1 = [10000, 20000] - // 1 = [20000, 30000] - for (size_t i = ub_id * 10000; i < (ub_id + 1) * 10000; ++i) - it = i; - }; - +public: std::vector kmer_counts; - std::vector sketches; - - seqan::hibf::config config{}; - config.number_of_user_bins = 3; - config.input_fn = create_hashes; - config.sketch_bits = 12; - - seqan::hibf::sketch::compute_sketches(config, kmer_counts, sketches); - - ASSERT_EQ(kmer_counts.size(), 3); - ASSERT_EQ(sketches.size(), 3); - - EXPECT_EQ(kmer_counts[0], 18405); - EXPECT_EQ(kmer_counts[1], 18429); - EXPECT_EQ(kmer_counts[2], 18427); - - EXPECT_DOUBLE_EQ(sketches[0].estimate(), 18405.11680625227); - EXPECT_DOUBLE_EQ(sketches[1].estimate(), 18429.448850770688); - EXPECT_DOUBLE_EQ(sketches[2].estimate(), 18427.74236590719); -} - -TEST(compute_sketches_test, with_minHash) -{ - auto create_hashes = [&](size_t const ub_id, seqan::hibf::insert_iterator it) - { - // 0 = [0, 10000] - // 1 = [10000, 20000] - // 1 = [20000, 30000] - for (size_t i = ub_id * 10000; i < (ub_id + 1) * 10000; ++i) - it = i; - }; - - std::vector minHash_sketches; std::vector hyperloglog_sketches; + std::vector minHash_sketches; + seqan::hibf::config config; - seqan::hibf::config config{}; - config.number_of_user_bins = 3; - config.input_fn = create_hashes; - config.sketch_bits = 12; - - seqan::hibf::sketch::compute_sketches_with_minhash(config, hyperloglog_sketches, minHash_sketches); - - ASSERT_EQ(minHash_sketches.size(), 3); - ASSERT_EQ(hyperloglog_sketches.size(), 3); + void SetUp() override + { + config.number_of_user_bins = 3; + config.sketch_bits = 12; + config.input_fn = [&](size_t const ub_id, seqan::hibf::insert_iterator it) + { + // 0 = [0, 10000] + // 1 = [10000, 20000] + // 1 = [20000, 30000] + for (size_t i = ub_id * 10000; i < (ub_id + 1) * 10000; ++i) + it = i; + }; + } - EXPECT_DOUBLE_EQ(hyperloglog_sketches[0].estimate(), 18405.11680625227); - EXPECT_DOUBLE_EQ(hyperloglog_sketches[1].estimate(), 18429.448850770688); - EXPECT_DOUBLE_EQ(hyperloglog_sketches[2].estimate(), 18427.74236590719); + void check_kmer_counts() + { + ASSERT_EQ(kmer_counts.size(), 3); + EXPECT_EQ(kmer_counts[0], 18405); + EXPECT_EQ(kmer_counts[1], 18429); + EXPECT_EQ(kmer_counts[2], 18427); + } - std::vector const start{0, 625, 1250}; - for (size_t ub_id = 0; ub_id < config.number_of_user_bins; ++ub_id) + void check_hyperloglog_sketches() { - ASSERT_TRUE(minHash_sketches[ub_id].is_valid()); + ASSERT_EQ(hyperloglog_sketches.size(), 3); + constexpr double abs_error = 0.00001; + EXPECT_NEAR(hyperloglog_sketches[0].estimate(), 18405.11680625227, abs_error); + EXPECT_NEAR(hyperloglog_sketches[1].estimate(), 18429.448850770688, abs_error); + EXPECT_NEAR(hyperloglog_sketches[2].estimate(), 18427.74236590719, abs_error); + } - for (size_t sidx = 0; sidx < seqan::hibf::sketch::minHashes::num_sketches; ++sidx) + void check_minHash_sketches() + { + ASSERT_EQ(minHash_sketches.size(), 3); + size_t start{}; + size_t ub_id{}; + for (auto & minHash : minHash_sketches) { - for (size_t hidx = 0; hidx < seqan::hibf::sketch::minHashes::sketch_size; ++hidx) + ASSERT_TRUE(minHash.is_valid()) << "ub_id: " << ub_id; + size_t sid{}; + for (auto & sketch : minHash.table) { - EXPECT_EQ(minHash_sketches[ub_id].table[sidx][hidx], start[ub_id] + hidx) - << "ub_id:" << ub_id << " sidx:" << sidx << " hidx:" << hidx; + ASSERT_EQ(sketch.size(), seqan::hibf::sketch::minHashes::sketch_size) + << "ub_id: " << ub_id << "sid: " << sid; + EXPECT_RANGE_EQ(sketch, std::views::iota(start, start + seqan::hibf::sketch::minHashes::sketch_size)) << "ub_id: " << ub_id << "sid: " << sid; + ++sid; } + start += 625; + ++ub_id; } } +}; + +TEST_F(compute_sketches_test, hyperloglog_and_kmer_counts) +{ + seqan::hibf::sketch::compute_sketches(this->config, this->kmer_counts, this->hyperloglog_sketches); + + this->check_kmer_counts(); + this->check_hyperloglog_sketches(); +} + +TEST_F(compute_sketches_test, with_minHash) +{ + seqan::hibf::sketch::compute_sketches_with_minhash(this->config, + this->hyperloglog_sketches, + this->minHash_sketches); + + this->check_hyperloglog_sketches(); + this->check_minHash_sketches(); } diff --git a/test/unit/hibf/sketch/minHashes_test.cpp b/test/unit/hibf/sketch/minHashes_test.cpp index f9673868..32f38974 100644 --- a/test/unit/hibf/sketch/minHashes_test.cpp +++ b/test/unit/hibf/sketch/minHashes_test.cpp @@ -1,5 +1,5 @@ -// SPDX-FileCopyrightText: 2006-2023, Knut Reinert & Freie Universität Berlin -// SPDX-FileCopyrightText: 2016-2023, Knut Reinert & MPI für molekulare Genetik +// SPDX-FileCopyrightText: 2006-2024, Knut Reinert & Freie Universität Berlin +// SPDX-FileCopyrightText: 2016-2024, Knut Reinert & MPI für molekulare Genetik // SPDX-License-Identifier: BSD-3-Clause #include // for Test, Message, TestInfo, TestPartResult, TEST, EXPECT_EQ @@ -8,77 +8,74 @@ #include // for minHashes #include // for expect_range_eq, EXPECT_RANGE_EQ -TEST(minHashes_test, static_update_heap_with) +TEST(minHashes_test, ctor) +{ + seqan::hibf::sketch::minHashes sketch{}; + + EXPECT_TRUE(sketch.table.empty()); + EXPECT_FALSE(sketch.is_valid()); +} + +TEST(minHashes_test, update_heap_with) { std::vector heap{3, 4, 5, 6, 7, 8}; - std::make_heap(heap.begin(), heap.end()); // make into proper heap + std::ranges::make_heap(heap); - // no change because 10 is larger than all values in the heap - std::vector const unchanged_result{heap}; + // No change because 10 is larger than all values in the heap + std::vector const original_heap{heap}; seqan::hibf::sketch::minHashes::update_heap_with(heap, 10); - EXPECT_RANGE_EQ(unchanged_result, heap); + EXPECT_RANGE_EQ(original_heap, heap); - // now heap changes + // Heap changes seqan::hibf::sketch::minHashes::update_heap_with(heap, 0); + EXPECT_FALSE(std::ranges::equal(original_heap, heap)); EXPECT_EQ(heap[0], 7u); // largest element is now 7 (not 8 anymore) } -TEST(minHashes_test, ctor) -{ - seqan::hibf::sketch::minHashes minHash_sketch{}; - - EXPECT_TRUE(minHash_sketch.table.empty()); - EXPECT_FALSE(minHash_sketch.is_valid()); -} - TEST(minHashes_test, ctor_sorted_list) { - std::vector const heap = seqan::hibf::iota_vector(1000); + std::vector const heap = seqan::hibf::iota_vector(1000); - seqan::hibf::sketch::minHashes minHash_sketch{heap}; + seqan::hibf::sketch::minHashes sketches{heap}; // check for correct construction - EXPECT_FALSE(minHash_sketch.table.empty()); - EXPECT_TRUE(minHash_sketch.is_valid()); + EXPECT_FALSE(sketches.table.empty()); + EXPECT_TRUE(sketches.is_valid()); - ASSERT_EQ(minHash_sketch.table.size(), seqan::hibf::sketch::minHashes::num_sketches); + ASSERT_EQ(sketches.table.size(), seqan::hibf::sketch::minHashes::num_sketches); - for (size_t sidx = 0; sidx < seqan::hibf::sketch::minHashes::num_sketches; ++sidx) + size_t sid{}; + for (auto & sketch : sketches.table) { - ASSERT_EQ(minHash_sketch.table[sidx].size(), seqan::hibf::sketch::minHashes::sketch_size); - - for (size_t hidx = 0; hidx < seqan::hibf::sketch::minHashes::num_sketches; ++hidx) - { - ASSERT_EQ(minHash_sketch.table[sidx][hidx], hidx); - } + ASSERT_EQ(sketch.size(), seqan::hibf::sketch::minHashes::sketch_size) << "sid: " << sid; + EXPECT_RANGE_EQ(sketch, std::views::iota(0u, seqan::hibf::sketch::minHashes::sketch_size)) << "sid: " << sid; + ++sid; } } TEST(minHashes_test, fill_up_sketches) { - std::vector const heap = seqan::hibf::iota_vector(10); + std::vector const heap = seqan::hibf::iota_vector(10); - seqan::hibf::sketch::minHashes minHash_sketch{heap}; // there will be too little values - EXPECT_FALSE(minHash_sketch.table.empty()); - ASSERT_FALSE(minHash_sketch.is_valid()); + seqan::hibf::sketch::minHashes sketches{heap}; // there will be too little values + EXPECT_FALSE(sketches.table.empty()); + ASSERT_FALSE(sketches.is_valid()); // recreate bigger heap - std::vector bigger_heap = seqan::hibf::iota_vector(1000); + std::vector bigger_heap = seqan::hibf::iota_vector(1000); std::span const bigger_heap_non_redundant(bigger_heap.begin() + heap.size(), bigger_heap.end()); - minHash_sketch.fill_up_sketches(bigger_heap_non_redundant); + sketches.fill_up_sketches(bigger_heap_non_redundant); // check if filling up worked correctly - ASSERT_TRUE(minHash_sketch.is_valid()); + ASSERT_TRUE(sketches.is_valid()); - ASSERT_EQ(minHash_sketch.table.size(), seqan::hibf::sketch::minHashes::num_sketches); + ASSERT_EQ(sketches.table.size(), seqan::hibf::sketch::minHashes::num_sketches); - for (size_t sidx = 0; sidx < seqan::hibf::sketch::minHashes::num_sketches; ++sidx) + size_t sid{}; + for (auto & sketch : sketches.table) { - ASSERT_EQ(minHash_sketch.table[sidx].size(), seqan::hibf::sketch::minHashes::sketch_size); - - for (size_t hidx = 0; hidx < seqan::hibf::sketch::minHashes::num_sketches; ++hidx) - { - ASSERT_EQ(minHash_sketch.table[sidx][hidx], hidx); - } + ASSERT_EQ(sketch.size(), seqan::hibf::sketch::minHashes::sketch_size) << "sid: " << sid; + EXPECT_RANGE_EQ(sketch, std::views::iota(0u, seqan::hibf::sketch::minHashes::sketch_size)) << "sid: " << sid; + ++sid; } }