review

seqan · Jul 12, 2024 · 82e9039 · 82e9039
1 parent 5d7acc9
commit 82e9039
Show file tree

Hide file tree

Showing 6 changed files with 191 additions and 142 deletions.
diff --git a/include/hibf/sketch/minHashes.hpp b/include/hibf/sketch/minHashes.hpp
@@ -1,12 +1,13 @@
-// SPDX-FileCopyrightText: 2006-2023, Knut Reinert & Freie Universität Berlin
-// SPDX-FileCopyrightText: 2016-2023, Knut Reinert & MPI für molekulare Genetik
+// SPDX-FileCopyrightText: 2006-2024, Knut Reinert & Freie Universität Berlin
+// SPDX-FileCopyrightText: 2016-2024, Knut Reinert & MPI für molekulare Genetik
 // SPDX-License-Identifier: BSD-3-Clause
 
 /*!\file
  * \author Svenja Mehringer <svenja.mehringer AT fu-berlin.de>
  * \brief Provides seqan::hibf::sketch::minHashes.
  */
 
+#include <algorithm>
 #include <span>
 #include <vector>
 
@@ -36,64 +37,62 @@ struct minHashes
     ~minHashes() = default;                             //!< Defaulted.
 
     //!\brief construct from a vector of the smallest values in a set (sorted ascending).
-    minHashes(std::vector<uint64_t> const & smalles_values)
+    minHashes(std::vector<uint64_t> const & smallest_values)
     {
         table.resize(num_sketches);
 
-        for (size_t i = 0; i < num_sketches; ++i)
-        {
-            table[i].reserve(sketch_size);
-        }
+        for (auto & elem : table)
+            elem.reserve(sketch_size);
 
-        for (uint64_t const hash : smalles_values)
+        for (uint64_t const hash : smallest_values)
         {
-            uint64_t const register_id{hash & register_id_mask};
-            if (table[register_id].size() < sketch_size)
-                table[register_id].push_back(hash >> 4);
+            auto & hash_table = table[hash & register_id_mask];
+            if (hash_table.size() < sketch_size)
+                hash_table.push_back(hash >> 4);
         }
     }
 
-    static inline uint64_t const register_id_mask{15}; /// ...00001111
-    static inline size_t const num_sketches{16};
-    static inline size_t const sketch_size{40};
+    static constexpr uint64_t register_id_mask{15}; /// ...00001111
+    static constexpr size_t num_sketches{16};
+    static constexpr size_t sketch_size{40};
 
     //!\brief A table of sketches. For LSH we need multiple sketches, stored in a table.
     std::vector<std::vector<uint64_t>> table{};
 
     //!\brief Checks whether the minHash table is completely filled
     bool is_valid() const
     {
-        bool result{table.size() == num_sketches};
-        for (auto const & minHash_sketch : table)
-        {
-            if (minHash_sketch.size() < sketch_size)
-                result = false;
-        }
-        return result;
+        return table.size() == num_sketches
+            && std::ranges::all_of(table,
+                                   [](auto const & minHash_sketch)
+                                   {
+                                       return minHash_sketch.size() == sketch_size;
+                                   });
     }
 
     //!\brief Adds more minhash values to an existing but incomplete table.
-    void fill_up_sketches(std::span<uint64_t> const & more_smalles_values)
+    void fill_up_sketches(std::span<uint64_t> const & more_smallest_values)
     {
-        for (uint64_t const hash : more_smalles_values)
+        for (uint64_t const hash : more_smallest_values)
         {
-            uint64_t const register_id{hash & register_id_mask};
-            assert(table[register_id].find(hash) == table[register_id].end()); // hashes should be unique
-            if (table[register_id].size() < sketch_size)
-                table[register_id].push_back(hash >> 4);
+            auto & hash_table = table[hash & register_id_mask];
+            assert(std::ranges::find(hash_table, hash) == hash_table.end()); // hashes should be unique
+            if (hash_table.size() < sketch_size)
+                hash_table.push_back(hash >> 4);
         }
     }
 
     //!\brief Miscallenious function to update a heap with a new element, when preparing a heap for minHash sketches
     static void update_heap_with(std::vector<uint64_t> & heap, uint64_t const k_hash)
     {
-        if (k_hash < heap[0]) // 0 because it is a max heap
-        {
-            // we do not need a guard (hash table) to check for duplications because `kmers` is already a set
-            std::pop_heap(heap.begin(), heap.end());  // max elements move to end of vector
-            heap.back() = k_hash;                     // replace last elements instead of physically popping and pushing
-            std::push_heap(heap.begin(), heap.end()); // last elements is rearranged in the heap to be pushed
-        }
+        // Do nothing if k_hash is bigger than the current biggest element in the (max) heap.
+        if (k_hash >= heap[0])
+            return;
+
+        // we do not need a guard (hash table) to check for duplications because `kmers` is already a set
+        std::ranges::pop_heap(heap);  // max elements move to end of vector
+        heap.back() = k_hash;         // replace last elements instead of physically popping and pushing
+        std::ranges::push_heap(heap); // last elements is rearranged in the heap to be pushed
     }
 };
 

diff --git a/test/include/hibf/test/expect_range_eq.hpp b/test/include/hibf/test/expect_range_eq.hpp
@@ -42,7 +42,7 @@ void PrintTo(t const & value, std::ostream * out)
 namespace seqan::hibf::test
 {
 
-#define EXPECT_RANGE_EQ(val1, val2) EXPECT_PRED_FORMAT2(::seqan::hibf::test::expect_range_eq{}, val1, val2);
+#define EXPECT_RANGE_EQ(val1, val2) EXPECT_PRED_FORMAT2(::seqan::hibf::test::expect_range_eq{}, val1, val2)
 
 struct expect_range_eq
 {

diff --git a/test/performance/sketch/CMakeLists.txt b/test/performance/sketch/CMakeLists.txt
@@ -2,4 +2,5 @@
 # SPDX-FileCopyrightText: 2016-2024, Knut Reinert & MPI für molekulare Genetik
 # SPDX-License-Identifier: BSD-3-Clause
 
+hibf_benchmark (compute_sketches_benchmark.cpp)
 hibf_benchmark (hyperloglog_benchmark.cpp)
diff --git a/test/performance/sketch/compute_sketches_benchmark.cpp b/test/performance/sketch/compute_sketches_benchmark.cpp
@@ -0,0 +1,49 @@
+// SPDX-FileCopyrightText: 2006-2024, Knut Reinert & Freie Universität Berlin
+// SPDX-FileCopyrightText: 2016-2024, Knut Reinert & MPI für molekulare Genetik
+// SPDX-License-Identifier: BSD-3-Clause
+
+#include <benchmark/benchmark.h>
+
+#include <hibf/sketch/compute_sketches.hpp>
+
+enum class sketch : uint8_t
+{
+    Hyperloglog,
+    MinHashes
+};
+
+template <sketch sketch_t>
+void compute_sketches(benchmark::State & state)
+{
+    auto create_hashes = [&](size_t const ub_id, seqan::hibf::insert_iterator it)
+    {
+        // 0 = [0, 10000]
+        // 1 = [10000, 20000]
+        // 1 = [20000, 30000]
+        for (size_t i = ub_id * 10000; i < (ub_id + 1) * 10000; ++i)
+            it = i;
+    };
+
+    std::vector<size_t> kmer_counts;
+    [[maybe_unused]] std::vector<seqan::hibf::sketch::minHashes> minHash_sketches;
+    std::vector<seqan::hibf::sketch::hyperloglog> hyperloglog_sketches;
+
+    seqan::hibf::config config{};
+    config.number_of_user_bins = 16;
+    config.input_fn = create_hashes;
+    config.sketch_bits = 12;
+
+    for (auto _ : state)
+    {
+        if constexpr (sketch_t == sketch::MinHashes)
+            seqan::hibf::sketch::compute_sketches_with_minhash(config,
+                                                               kmer_counts,
+                                                               hyperloglog_sketches,
+                                                               minHash_sketches);
+        else
+            seqan::hibf::sketch::compute_sketches(config, kmer_counts, hyperloglog_sketches);
+    }
+}
+
+BENCHMARK_TEMPLATE(compute_sketches, sketch::Hyperloglog);
+BENCHMARK_TEMPLATE(compute_sketches, sketch::MinHashes);
diff --git a/test/unit/hibf/sketch/compute_sketches_test.cpp b/test/unit/hibf/sketch/compute_sketches_test.cpp
@@ -1,5 +1,5 @@
-// SPDX-FileCopyrightText: 2006-2023, Knut Reinert & Freie Universität Berlin
-// SPDX-FileCopyrightText: 2016-2023, Knut Reinert & MPI für molekulare Genetik
+// SPDX-FileCopyrightText: 2006-2024, Knut Reinert & Freie Universität Berlin
+// SPDX-FileCopyrightText: 2016-2024, Knut Reinert & MPI für molekulare Genetik
 // SPDX-License-Identifier: BSD-3-Clause
 
 #include <gtest/gtest.h> // for Test, Message, TestPartResult, EXPECT_EQ, TestInfo, ASSERT_EQ
@@ -10,80 +10,83 @@
 #include <vector>    // for allocator, vector
 
 #include <hibf/sketch/compute_sketches.hpp>
+#include <hibf/test/expect_range_eq.hpp> // for expect_range_eq, EXPECT_RANGE_EQ
 
-TEST(compute_sketches_test, hyperloglog_and_kmer_counts)
+class compute_sketches_test : public ::testing::Test
 {
-    auto create_hashes = [&](size_t const ub_id, seqan::hibf::insert_iterator it)
-    {
-        // 0 = [0, 10000]
-        // 1 = [10000, 20000]
-        // 1 = [20000, 30000]
-        for (size_t i = ub_id * 10000; i < (ub_id + 1) * 10000; ++i)
-            it = i;
-    };
-
+public:
     std::vector<size_t> kmer_counts;
-    std::vector<seqan::hibf::sketch::hyperloglog> sketches;
-
-    seqan::hibf::config config{};
-    config.number_of_user_bins = 3;
-    config.input_fn = create_hashes;
-    config.sketch_bits = 12;
-
-    seqan::hibf::sketch::compute_sketches(config, kmer_counts, sketches);
-
-    ASSERT_EQ(kmer_counts.size(), 3);
-    ASSERT_EQ(sketches.size(), 3);
-
-    EXPECT_EQ(kmer_counts[0], 18405);
-    EXPECT_EQ(kmer_counts[1], 18429);
-    EXPECT_EQ(kmer_counts[2], 18427);
-
-    EXPECT_DOUBLE_EQ(sketches[0].estimate(), 18405.11680625227);
-    EXPECT_DOUBLE_EQ(sketches[1].estimate(), 18429.448850770688);
-    EXPECT_DOUBLE_EQ(sketches[2].estimate(), 18427.74236590719);
-}
-
-TEST(compute_sketches_test, with_minHash)
-{
-    auto create_hashes = [&](size_t const ub_id, seqan::hibf::insert_iterator it)
-    {
-        // 0 = [0, 10000]
-        // 1 = [10000, 20000]
-        // 1 = [20000, 30000]
-        for (size_t i = ub_id * 10000; i < (ub_id + 1) * 10000; ++i)
-            it = i;
-    };
-
-    std::vector<seqan::hibf::sketch::minHashes> minHash_sketches;
     std::vector<seqan::hibf::sketch::hyperloglog> hyperloglog_sketches;
+    std::vector<seqan::hibf::sketch::minHashes> minHash_sketches;
+    seqan::hibf::config config;
 
-    seqan::hibf::config config{};
-    config.number_of_user_bins = 3;
-    config.input_fn = create_hashes;
-    config.sketch_bits = 12;
-
-    seqan::hibf::sketch::compute_sketches_with_minhash(config, hyperloglog_sketches, minHash_sketches);
-
-    ASSERT_EQ(minHash_sketches.size(), 3);
-    ASSERT_EQ(hyperloglog_sketches.size(), 3);
+    void SetUp() override
+    {
+        config.number_of_user_bins = 3;
+        config.sketch_bits = 12;
+        config.input_fn = [&](size_t const ub_id, seqan::hibf::insert_iterator it)
+        {
+            // 0 = [0, 10000]
+            // 1 = [10000, 20000]
+            // 1 = [20000, 30000]
+            for (size_t i = ub_id * 10000; i < (ub_id + 1) * 10000; ++i)
+                it = i;
+        };
+    }
 
-    EXPECT_DOUBLE_EQ(hyperloglog_sketches[0].estimate(), 18405.11680625227);
-    EXPECT_DOUBLE_EQ(hyperloglog_sketches[1].estimate(), 18429.448850770688);
-    EXPECT_DOUBLE_EQ(hyperloglog_sketches[2].estimate(), 18427.74236590719);
+    void check_kmer_counts()
+    {
+        ASSERT_EQ(kmer_counts.size(), 3);
+        EXPECT_EQ(kmer_counts[0], 18405);
+        EXPECT_EQ(kmer_counts[1], 18429);
+        EXPECT_EQ(kmer_counts[2], 18427);
+    }
 
-    std::vector<size_t> const start{0, 625, 1250};
-    for (size_t ub_id = 0; ub_id < config.number_of_user_bins; ++ub_id)
+    void check_hyperloglog_sketches()
     {
-        ASSERT_TRUE(minHash_sketches[ub_id].is_valid());
+        ASSERT_EQ(hyperloglog_sketches.size(), 3);
+        constexpr double abs_error = 0.00001;
+        EXPECT_NEAR(hyperloglog_sketches[0].estimate(), 18405.11680625227, abs_error);
+        EXPECT_NEAR(hyperloglog_sketches[1].estimate(), 18429.448850770688, abs_error);
+        EXPECT_NEAR(hyperloglog_sketches[2].estimate(), 18427.74236590719, abs_error);
+    }
 
-        for (size_t sidx = 0; sidx < seqan::hibf::sketch::minHashes::num_sketches; ++sidx)
+    void check_minHash_sketches()
+    {
+        ASSERT_EQ(minHash_sketches.size(), 3);
+        size_t start{};
+        size_t ub_id{};
+        for (auto & minHash : minHash_sketches)
         {
-            for (size_t hidx = 0; hidx < seqan::hibf::sketch::minHashes::sketch_size; ++hidx)
+            ASSERT_TRUE(minHash.is_valid()) << "ub_id: " << ub_id;
+            size_t sid{};
+            for (auto & sketch : minHash.table)
             {
-                EXPECT_EQ(minHash_sketches[ub_id].table[sidx][hidx], start[ub_id] + hidx)
-                    << "ub_id:" << ub_id << " sidx:" << sidx << " hidx:" << hidx;
+                ASSERT_EQ(sketch.size(), seqan::hibf::sketch::minHashes::sketch_size)
+                    << "ub_id: " << ub_id << "sid: " << sid;
+                EXPECT_RANGE_EQ(sketch, std::views::iota(start, start + seqan::hibf::sketch::minHashes::sketch_size)) << "ub_id: " << ub_id << "sid: " << sid;
+                ++sid;
             }
+            start += 625;
+            ++ub_id;
         }
     }
+};
+
+TEST_F(compute_sketches_test, hyperloglog_and_kmer_counts)
+{
+    seqan::hibf::sketch::compute_sketches(this->config, this->kmer_counts, this->hyperloglog_sketches);
+
+    this->check_kmer_counts();
+    this->check_hyperloglog_sketches();
+}
+
+TEST_F(compute_sketches_test, with_minHash)
+{
+    seqan::hibf::sketch::compute_sketches_with_minhash(this->config,
+                                                       this->hyperloglog_sketches,
+                                                       this->minHash_sketches);
+
+    this->check_hyperloglog_sketches();
+    this->check_minHash_sketches();
 }