From e9bb7eb0eeebd96c0d280838c2c823ce0b6d25fd Mon Sep 17 00:00:00 2001 From: Enrico Seiler Date: Fri, 17 Nov 2023 16:48:43 +0100 Subject: [PATCH] [TEST] HLL benchmark --- src/sketch/hyperloglog.cpp | 47 ++++++++++++-- test/performance/sketch/CMakeLists.txt | 5 ++ .../sketch/hyperloglog_benchmark.cpp | 64 +++++++++++++++++++ 3 files changed, 111 insertions(+), 5 deletions(-) create mode 100644 test/performance/sketch/CMakeLists.txt create mode 100644 test/performance/sketch/hyperloglog_benchmark.cpp diff --git a/src/sketch/hyperloglog.cpp b/src/sketch/hyperloglog.cpp index 11dc765d..26dfeb47 100644 --- a/src/sketch/hyperloglog.cpp +++ b/src/sketch/hyperloglog.cpp @@ -71,18 +71,55 @@ void hyperloglog::add(uint64_t const value) double hyperloglog::estimate() const { - float sum = 0.0; + simde__m256i const * const raw = reinterpret_cast(data.data()); + simde__m256 packed_sum = simde_mm256_set1_ps(0.0f); -#pragma omp simd - for (size_t i = 0; i < size; ++i) - sum += expectation_values[data[i]]; + // We can do 256 bits = 32 bytes at once. + // We store `uint8_t`, so `size` is the size in bytes. + // Hence, we need to do `size / 32` iterations. + // The loop is equalivalent to: + // float sum{}; + // for (size_t i = 0; i < size; ++i) + // sum += expectation_values[data[i]]; + for (size_t i = 0; i < size / 32; ++i) + { + // Points to the start of the 32-byte block we are processing. + uint8_t const * const current = reinterpret_cast(raw + i); + + // Sum up 32 values. + for (size_t j = 0; j <= 24; j += 8) + { + packed_sum = simde_mm256_add_ps(packed_sum, + simde_mm256_set_ps(expectation_values[*(current + j + 0)], + expectation_values[*(current + j + 1)], + expectation_values[*(current + j + 2)], + expectation_values[*(current + j + 3)], + expectation_values[*(current + j + 4)], + expectation_values[*(current + j + 5)], + expectation_values[*(current + j + 6)], + expectation_values[*(current + j + 7)])); + } + } + + float sum{}; + + // Sum up the 8 float values in packed_sum. + float const * const sum_it = reinterpret_cast(&packed_sum); + for (size_t i = 0; i < 8; ++i) + { + sum += *(sum_it + i); + } double estimate = normalization_factor / sum; // Small value correction: linear counting of zeros if (estimate <= 2.5 * size) { - uint32_t const zeros = std::ranges::count(data, uint8_t{}); + uint32_t zeros{}; + + for (size_t i = 0; i < size; ++i) + zeros += (data[i] == 0u); + if (zeros != 0u) estimate = size * std::log(static_cast(size) / zeros); } diff --git a/test/performance/sketch/CMakeLists.txt b/test/performance/sketch/CMakeLists.txt new file mode 100644 index 00000000..2e050806 --- /dev/null +++ b/test/performance/sketch/CMakeLists.txt @@ -0,0 +1,5 @@ +# SPDX-FileCopyrightText: 2006-2023, Knut Reinert & Freie Universität Berlin +# SPDX-FileCopyrightText: 2016-2023, Knut Reinert & MPI für molekulare Genetik +# SPDX-License-Identifier: BSD-3-Clause + +hibf_benchmark (hyperloglog_benchmark.cpp) diff --git a/test/performance/sketch/hyperloglog_benchmark.cpp b/test/performance/sketch/hyperloglog_benchmark.cpp new file mode 100644 index 00000000..a97f3794 --- /dev/null +++ b/test/performance/sketch/hyperloglog_benchmark.cpp @@ -0,0 +1,64 @@ +// SPDX-FileCopyrightText: 2006-2023, Knut Reinert & Freie Universität Berlin +// SPDX-FileCopyrightText: 2016-2023, Knut Reinert & MPI für molekulare Genetik +// SPDX-License-Identifier: BSD-3-Clause + +#include // for State, Benchmark, Counter, ClobberMemory, BENCHMARK + +#include + +#include + +static void add(benchmark::State & state) +{ + uint8_t const bits = static_cast(state.range(0)); + seqan::hibf::sketch::hyperloglog hll{bits}; + for (auto _ : state) + { + hll.add(23); + } + benchmark::DoNotOptimize(hll); +} + +static void merge(benchmark::State & state) +{ + uint8_t const bits = static_cast(state.range(0)); + seqan::hibf::sketch::hyperloglog hll{bits}; + for (auto _ : state) + { + hll.merge(hll); + } + benchmark::DoNotOptimize(hll); +} + +static void estimate(benchmark::State & state) +{ + uint8_t const bits = static_cast(state.range(0)); + seqan::hibf::sketch::hyperloglog hll{bits}; + double estimate{}; + for (auto _ : state) + { + estimate = hll.estimate(); + } + benchmark::DoNotOptimize(hll); + benchmark::DoNotOptimize(estimate); +} + +static void merge_and_estimate(benchmark::State & state) +{ + uint8_t const bits = static_cast(state.range(0)); + seqan::hibf::sketch::hyperloglog hll{bits}; + double estimate{}; + for (auto _ : state) + { + estimate = hll.merge_and_estimate(hll); + } + benchmark::DoNotOptimize(hll); + benchmark::DoNotOptimize(estimate); +} + +BENCHMARK(add)->DenseRange(5, 12, 1); +BENCHMARK(merge)->DenseRange(5, 12, 1); +BENCHMARK(estimate)->DenseRange(5, 12, 1); +BENCHMARK(merge_and_estimate)->DenseRange(5, 12, 1); + +BENCHMARK_MAIN();