From e9bb7eb0eeebd96c0d280838c2c823ce0b6d25fd Mon Sep 17 00:00:00 2001
From: Enrico Seiler <enrico.seiler@hotmail.de>
Date: Fri, 17 Nov 2023 16:48:43 +0100
Subject: [PATCH] [TEST] HLL benchmark

---
 src/sketch/hyperloglog.cpp                    | 47 ++++++++++++--
 test/performance/sketch/CMakeLists.txt        |  5 ++
 .../sketch/hyperloglog_benchmark.cpp          | 64 +++++++++++++++++++
 3 files changed, 111 insertions(+), 5 deletions(-)
 create mode 100644 test/performance/sketch/CMakeLists.txt
 create mode 100644 test/performance/sketch/hyperloglog_benchmark.cpp
diff --git a/src/sketch/hyperloglog.cpp b/src/sketch/hyperloglog.cpp
index 11dc765d..26dfeb47 100644
--- a/src/sketch/hyperloglog.cpp
+++ b/src/sketch/hyperloglog.cpp
@@ -71,18 +71,55 @@ void hyperloglog::add(uint64_t const value)
 
 double hyperloglog::estimate() const
 {
-    float sum = 0.0;
+    simde__m256i const * const raw = reinterpret_cast<simde__m256i const *>(data.data());
+    simde__m256 packed_sum = simde_mm256_set1_ps(0.0f);
 
-#pragma omp simd
-    for (size_t i = 0; i < size; ++i)
-        sum += expectation_values[data[i]];
+    // We can do 256 bits = 32 bytes at once.
+    // We store `uint8_t`, so `size` is the size in bytes.
+    // Hence, we need to do `size / 32` iterations.
+    // The loop is equalivalent to:
+    // float sum{};
+    // for (size_t i = 0; i < size; ++i)
+    //    sum += expectation_values[data[i]];
+    for (size_t i = 0; i < size / 32; ++i)
+    {
+        // Points to the start of the 32-byte block we are processing.
+        uint8_t const * const current = reinterpret_cast<uint8_t const *>(raw + i);
+
+        // Sum up 32 values.
+        for (size_t j = 0; j <= 24; j += 8)
+        {
+            packed_sum = simde_mm256_add_ps(packed_sum,
+                                            simde_mm256_set_ps(expectation_values[*(current + j + 0)],
+                                                               expectation_values[*(current + j + 1)],
+                                                               expectation_values[*(current + j + 2)],
+                                                               expectation_values[*(current + j + 3)],
+                                                               expectation_values[*(current + j + 4)],
+                                                               expectation_values[*(current + j + 5)],
+                                                               expectation_values[*(current + j + 6)],
+                                                               expectation_values[*(current + j + 7)]));
+        }
+    }
+
+    float sum{};
+
+    // Sum up the 8 float values in packed_sum.
+    float const * const sum_it = reinterpret_cast<float const *>(&packed_sum);
+    for (size_t i = 0; i < 8; ++i)
+    {
+        sum += *(sum_it + i);
+    }
 
     double estimate = normalization_factor / sum;
 
     // Small value correction: linear counting of zeros
     if (estimate <= 2.5 * size)
     {
-        uint32_t const zeros = std::ranges::count(data, uint8_t{});
+        uint32_t zeros{};
+
+        for (size_t i = 0; i < size; ++i)
+            zeros += (data[i] == 0u);
+
         if (zeros != 0u)
             estimate = size * std::log(static_cast<double>(size) / zeros);
     }
diff --git a/test/performance/sketch/CMakeLists.txt b/test/performance/sketch/CMakeLists.txt
new file mode 100644
index 00000000..2e050806
--- /dev/null
+++ b/test/performance/sketch/CMakeLists.txt
@@ -0,0 +1,5 @@
+# SPDX-FileCopyrightText: 2006-2023, Knut Reinert & Freie Universität Berlin
+# SPDX-FileCopyrightText: 2016-2023, Knut Reinert & MPI für molekulare Genetik
+# SPDX-License-Identifier: BSD-3-Clause
+
+hibf_benchmark (hyperloglog_benchmark.cpp)
diff --git a/test/performance/sketch/hyperloglog_benchmark.cpp b/test/performance/sketch/hyperloglog_benchmark.cpp
new file mode 100644
index 00000000..a97f3794
--- /dev/null
+++ b/test/performance/sketch/hyperloglog_benchmark.cpp
@@ -0,0 +1,64 @@
+// SPDX-FileCopyrightText: 2006-2023, Knut Reinert & Freie Universität Berlin
+// SPDX-FileCopyrightText: 2016-2023, Knut Reinert & MPI für molekulare Genetik
+// SPDX-License-Identifier: BSD-3-Clause
+
+#include <benchmark/benchmark.h> // for State, Benchmark, Counter, ClobberMemory, BENCHMARK
+
+#include <random>
+
+#include <hibf/sketch/hyperloglog.hpp>
+
+static void add(benchmark::State & state)
+{
+    uint8_t const bits = static_cast<uint8_t>(state.range(0));
+    seqan::hibf::sketch::hyperloglog hll{bits};
+    for (auto _ : state)
+    {
+        hll.add(23);
+    }
+    benchmark::DoNotOptimize(hll);
+}
+
+static void merge(benchmark::State & state)
+{
+    uint8_t const bits = static_cast<uint8_t>(state.range(0));
+    seqan::hibf::sketch::hyperloglog hll{bits};
+    for (auto _ : state)
+    {
+        hll.merge(hll);
+    }
+    benchmark::DoNotOptimize(hll);
+}
+
+static void estimate(benchmark::State & state)
+{
+    uint8_t const bits = static_cast<uint8_t>(state.range(0));
+    seqan::hibf::sketch::hyperloglog hll{bits};
+    double estimate{};
+    for (auto _ : state)
+    {
+        estimate = hll.estimate();
+    }
+    benchmark::DoNotOptimize(hll);
+    benchmark::DoNotOptimize(estimate);
+}
+
+static void merge_and_estimate(benchmark::State & state)
+{
+    uint8_t const bits = static_cast<uint8_t>(state.range(0));
+    seqan::hibf::sketch::hyperloglog hll{bits};
+    double estimate{};
+    for (auto _ : state)
+    {
+        estimate = hll.merge_and_estimate(hll);
+    }
+    benchmark::DoNotOptimize(hll);
+    benchmark::DoNotOptimize(estimate);
+}
+
+BENCHMARK(add)->DenseRange(5, 12, 1);
+BENCHMARK(merge)->DenseRange(5, 12, 1);
+BENCHMARK(estimate)->DenseRange(5, 12, 1);
+BENCHMARK(merge_and_estimate)->DenseRange(5, 12, 1);
+
+BENCHMARK_MAIN();