diff --git a/include/hibf/sketch/hyperloglog.hpp b/include/hibf/sketch/hyperloglog.hpp index 875f89fc..aa16f0f1 100644 --- a/include/hibf/sketch/hyperloglog.hpp +++ b/include/hibf/sketch/hyperloglog.hpp @@ -1,15 +1,21 @@ -#pragma once - -/** - * @file hyperloglog.hpp - * @brief HyperLogLog cardinality estimator - * @date Created 2013/3/20, Adjusted 2021/01 - * @author Hideaki Ohno - * - * Copied from Hideaki Ohno (https://github.com/hideo55/cpp-HyperLogLog) and adjusted/improved by Felix Droop - * Modified a lot for a bugfix, improvements and functional changes (64 bit hashes) +// ----------------------------------------------------------------------------------------------------- +// Copyright (c) 2006-2022, Knut Reinert & Freie Universität Berlin +// Copyright (c) 2016-2022, Knut Reinert & MPI für molekulare Genetik +// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License +// shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md +// ----------------------------------------------------------------------------------------------------- + +// Copyright (c) 2013 Hideaki Ohno +// MIT License: https://github.com/hideo55/cpp-HyperLogLog#license + +/*!\file + * \author Felix Droop + * \author Enrico Seiler + * \brief Provides seqan::hibf::sketch::hyperloglog. */ +#pragma once + #include // for copy #include // for array #include // for uint64_t, uint8_t @@ -26,22 +32,33 @@ namespace seqan::hibf::sketch /*!\brief HyperLogLog estimates. * \ingroup hibf_sketch * \details - * Copied from Hideaki Ohno and adjusted/improved by Felix Droop + * Original work by Hideaki Ohno. Major changes have been applied for bugfixes, 64-bit support, improvements, etc. * \see https://github.com/hideo55/cpp-HyperLogLog */ class hyperloglog { public: - /*!\brief Constructor - * \param[in] b bit width (register size will be 2 to the b power). - * This value must be in the range [5,32]. Default value is 5. + /*!\name Constructors, destructor and assignment + * \{ + */ + /*!\brief Default constructor. + * \param[in] bits The bit width in [5,32]. * - * \throws std::invalid_argument if the argument b is out of range. + * Allocates 2^`bits` bytes of memory. + * + * \throws std::invalid_argument if bits is not in [5,32]. */ - hyperloglog(uint8_t const b = 5u); + hyperloglog(uint8_t const bits = 5u); + hyperloglog(hyperloglog const &) = default; //!< Defaulted. + hyperloglog & operator=(hyperloglog const &) = default; //!< Defaulted. + hyperloglog(hyperloglog &&) = default; //!< Defaulted. + hyperloglog & operator=(hyperloglog &&) = default; //!< Defaulted. + ~hyperloglog() = default; //!< Defaulted. + + //!\} - /*!\brief Adds an unsigned 64-bit integer to the estimator. - * \param[in] value unsigned integer to add + /*!\brief Adds a value. + * \param[in] value The value to add. */ void add(uint64_t const value); @@ -50,66 +67,70 @@ class hyperloglog */ double estimate() const; - /*!\brief Merges the estimate from 'other' into this object. - * \param[in] other HyperLogLog instance to be merged + /*!\brief Merges another hyperloglog into this object. + * \param[in] other The hyperloglog to be merged. * \details - * The number of registers in each must be the same. + * This has the same effect as adding all values that were added to `other`. + * \warning + * Merging a hyperloglog with differing `bits` is undefined behaviour. In debug mode, this is an assertion instead. */ void merge(hyperloglog const & other); - /*!\brief Merges the estimate from 'other' into this object - * \param[in] other HyperLogLog instance to be merged - * \returns estimated cardinality of the new merged sketch. + /*!\brief Merges another hyperloglog and returns the new estimate. + * \param[in] other The hyperloglog to be merged. + * \returns Estimated cardinality value. * \details - * The number of registers in each must be the same. - * This function is implemented using SIMD instructions. - * \warning This function is undefined bevahior if this.b_ == 4 + * \warning + * Merging a hyperloglog with differing `bits` is undefined behaviour. In debug mode, this is an assertion instead. */ - double merge_and_estimate_SIMD(hyperloglog const & other); + double merge_and_estimate(hyperloglog const & other); - /*!\brief Clears all internal registers. + /*!\brief Clears added values. + * The size is unaffected. */ - void clear(); + void reset(); - /*!\brief Returns size of register. + /*!\brief Returns size of the internal data. + * \returns Size in bytes. + * The returned value is equivalent to 2^`bits`. */ - uint64_t registerSize() const + uint64_t data_size() const { - return m_; + return size; } - /*!\brief Exchanges the content of the instance. - * \param[in,out] rhs Another HyperLogLog instance - */ - void swap(hyperloglog & rhs); - - /*!\brief Dumps the current status to a stream. - * \param[in,out] os The output stream where the data is saved to - * \throws std::runtime_error if dumping failed. + /*!\brief Write the hyperloglog to a stream. + * \param[in,out] os The output stream to write to. + * \throws std::runtime_error if storing failed. */ - void dump(std::ostream & os) const; + void store(std::ostream & os) const; - /*!\brief Restorse the status from a stream. - * \param[in] is The input stream where the status is saved - * \throws std::runtime_error if restoring failed. + /*!\brief Loads the hyperloglog from a stream. + * \param[in] is The input stream where to read from. + * \throws std::runtime_error if reading failed. */ - void restore(std::istream & is); + void load(std::istream & is); private: - static constexpr std::array exp2_rcp = []() constexpr + //!\brief Used for estimation. Part of estimate E in the HyperLogLog publication. + static constexpr std::array expectation_values = []() constexpr { - std::array arr{}; + std::array result{}; for (size_t i = 0; i < 61; ++i) - arr[i] = 1.0f / static_cast(1ULL << i); - return arr; + result[i] = 1.0f / (1ULL << i); + return result; }(); - uint64_t mask_{}; //!< mask for the rank bits - double alphaMM_{}; //!< alpha * m^2 - float alphaMM_float_{}; //!< alpha * m^2 - uint64_t m_{}; //!< register size - uint8_t b_{}; //!< register bit width - std::vector> M_{}; //!< registers + //!\brief The bit width. Also called precision, b, and p in other publications. + uint8_t bits{}; + //!\brief Equivalent to 2^bits. Called m in original publication. + uint64_t size{}; + //!\brief Mask used in add(). + uint64_t rank_mask{}; + //!\brief Equivalent to alpha * m^2. + double normalization_factor{}; + //!\brief Internal data. Also called register in publications. + std::vector> data{}; }; } // namespace seqan::hibf::sketch diff --git a/src/sketch/hyperloglog.cpp b/src/sketch/hyperloglog.cpp index f83a5e4e..b136b93b 100644 --- a/src/sketch/hyperloglog.cpp +++ b/src/sketch/hyperloglog.cpp @@ -20,31 +20,29 @@ namespace seqan::hibf::sketch { -hyperloglog::hyperloglog(uint8_t const b) : m_{1ULL << b}, b_{b}, M_(m_, 0u) +hyperloglog::hyperloglog(uint8_t const bits) : bits{bits}, size{1ULL << bits}, data(size, 0u) { - if (b_ < 5u || b_ > 32u) + if (bits < 5u || bits > 32u) throw std::invalid_argument("[HyperLogLog] bit width must be in the range [5,32]."); - M_.shrink_to_fit(); - double alpha; + double correction_factor{}; // bias-correction, "alpha" - switch (m_) + switch (size) { case 32: - alpha = 0.697; + correction_factor = 0.697; break; case 64: - alpha = 0.709; + correction_factor = 0.709; break; default: - alpha = 0.7213 / (1.0 + 1.079 / m_); + correction_factor = 0.7213 / (1.0 + 1.079 / size); break; } - alphaMM_ = alpha * m_ * m_; - alphaMM_float_ = static_cast(alphaMM_); - // 64 bits where the last b are ones and the rest zeroes - mask_ = (1ULL << b_) - 1u; + normalization_factor = correction_factor * size * size; + // Last `bits` bits are 1. + rank_mask = (1ULL << bits) - 1u; } // See https://github.com/wangyi-fudan/wyhash @@ -59,188 +57,100 @@ hyperloglog::hyperloglog(uint8_t const b) : m_{1ULL << b}, b_{b}, M_(m_, 0u) void hyperloglog::add(uint64_t const value) { uint64_t const hash = wyhash(value); - // the first b_ bits are used to distribute the leading zero counts along M_ - uint64_t const index = hash >> (64 - b_); - // the bitwise-or with mask_ assures that we get at most 64 - b_ as value. - // Otherwise the count for hash = 0 would be 64 - uint8_t const rank = std::countl_zero((hash << b_) | mask_) + 1; - M_[index] = std::max(rank, M_[index]); + // The first bits bits are used to distribute the leading zero counts over data. + uint64_t const index = hash >> (64 - bits); + // rank_mask ensures that the lzcount is at most 64 - bits. + uint8_t const rank = std::countl_zero((hash << bits) | rank_mask) + 1; + data[index] = std::max(rank, data[index]); } double hyperloglog::estimate() const { - // compute indicator formula - double sum = 0.0; - for (uint8_t c : M_) - sum += exp2_rcp[c]; - double estimate = alphaMM_ / sum; - - // use linear counting of zeros for small values - if (estimate <= 2.5 * m_) - { - uint32_t zeros{}; + float sum = 0.0; - for (size_t i = 0; i < m_; ++i) - zeros += (M_[i] == 0u); +#pragma omp simd + for (size_t i = 0; i < size; ++i) + sum += expectation_values[data[i]]; + double estimate = normalization_factor / sum; + + // Small value correction: linear counting of zeros + if (estimate <= 2.5 * size) + { + uint32_t const zeros = std::ranges::count(data, uint8_t{}); if (zeros != 0u) - estimate = m_ * std::log(static_cast(m_) / static_cast(zeros)); + estimate = size * std::log(static_cast(size) / zeros); } + return estimate; } void hyperloglog::merge(hyperloglog const & other) { - assert(m_ == other.m_); + assert(size == other.size); - for (size_t i = 0; i < m_; ++i) - { - if (M_[i] < other.M_[i]) - { - M_[i] = other.M_[i]; - } - } -} - -double hyperloglog::merge_and_estimate_SIMD(hyperloglog const & other) -{ - assert(m_ == other.m_); - assert(b_ >= 5); + simde__m256i * const it = reinterpret_cast(data.data()); + simde__m256i const * const other_it = reinterpret_cast(other.data.data()); - // this is safe when b_ is at least 5. Then, M_'s size in bits is - // 2^x * 2^5 * 8 = 2^x * 256 >= 256, where x is an integer >= 1 - // also, M_ is 256 bit aligned in memory - simde__m256i * it = reinterpret_cast(&*(M_.begin())); - simde__m256i const * other_it = reinterpret_cast(&*(other.M_.begin())); - simde__m256i * end = reinterpret_cast(&*(M_.end())); - - simde__m256 packed_sum = simde_mm256_set1_ps(0.0f); - - for (; it != end; ++it, ++other_it) + // We can do 256 bits = 32 bytes at once. + // We store `uint8_t`, so `size` is the size in bytes. + // Hence, we need to do `size / 32` iterations. + for (size_t i = 0; i < size / 32; ++i) { - // this merges the registers by computing the byte-wise maximum - *it = simde_mm256_max_epu8(*it, *other_it); - - // get pointer to iterate over the single merged registers - uint8_t * reg_it = reinterpret_cast(it); - - // get floats with two to the power of minus the value in the merged registers and sum up - packed_sum = simde_mm256_add_ps(packed_sum, - simde_mm256_set_ps(exp2_rcp[*reg_it], - exp2_rcp[*(reg_it + 1)], - exp2_rcp[*(reg_it + 2)], - exp2_rcp[*(reg_it + 3)], - exp2_rcp[*(reg_it + 4)], - exp2_rcp[*(reg_it + 5)], - exp2_rcp[*(reg_it + 6)], - exp2_rcp[*(reg_it + 7)])); - - // repeat 3 times... - packed_sum = simde_mm256_add_ps(packed_sum, - simde_mm256_set_ps(exp2_rcp[*(reg_it + 8)], - exp2_rcp[*(reg_it + 9)], - exp2_rcp[*(reg_it + 10)], - exp2_rcp[*(reg_it + 11)], - exp2_rcp[*(reg_it + 12)], - exp2_rcp[*(reg_it + 13)], - exp2_rcp[*(reg_it + 14)], - exp2_rcp[*(reg_it + 15)])); - - packed_sum = simde_mm256_add_ps(packed_sum, - simde_mm256_set_ps(exp2_rcp[*(reg_it + 16)], - exp2_rcp[*(reg_it + 17)], - exp2_rcp[*(reg_it + 18)], - exp2_rcp[*(reg_it + 19)], - exp2_rcp[*(reg_it + 20)], - exp2_rcp[*(reg_it + 21)], - exp2_rcp[*(reg_it + 22)], - exp2_rcp[*(reg_it + 23)])); - - packed_sum = simde_mm256_add_ps(packed_sum, - simde_mm256_set_ps(exp2_rcp[*(reg_it + 24)], - exp2_rcp[*(reg_it + 25)], - exp2_rcp[*(reg_it + 26)], - exp2_rcp[*(reg_it + 27)], - exp2_rcp[*(reg_it + 28)], - exp2_rcp[*(reg_it + 29)], - exp2_rcp[*(reg_it + 30)], - exp2_rcp[*(reg_it + 31)])); + it[i] = simde_mm256_max_epu8(it[i], other_it[i]); } - - // sum up the 4 values in the packed SSE variable - float sum = 0.0; - float * sum_it = reinterpret_cast(&packed_sum); - sum += *sum_it; - sum += *(sum_it + 1); - sum += *(sum_it + 2); - sum += *(sum_it + 3); - sum += *(sum_it + 4); - sum += *(sum_it + 5); - sum += *(sum_it + 6); - sum += *(sum_it + 7); - - // compute first estimate - double estimate = alphaMM_float_ / sum; - - // use linear counting of zeros for small values - if (estimate <= 2.5 * m_) - { - uint32_t zeros{}; - - for (size_t i = 0; i < m_; ++i) - zeros += (M_[i] == 0u); - - if (zeros != 0u) - estimate = m_ * std::log(static_cast(m_) / static_cast(zeros)); - } - - return estimate; } -void hyperloglog::clear() +double hyperloglog::merge_and_estimate(hyperloglog const & other) { - std::fill(M_.begin(), M_.end(), 0); + merge(other); + return estimate(); } -void hyperloglog::swap(hyperloglog & rhs) +void hyperloglog::reset() { - std::swap(mask_, rhs.mask_); - std::swap(alphaMM_, rhs.alphaMM_); - std::swap(alphaMM_float_, rhs.alphaMM_float_); - std::swap(m_, rhs.m_); - std::swap(b_, rhs.b_); - M_.swap(rhs.M_); + std::ranges::fill(data, 0u); } -void hyperloglog::dump(std::ostream & os) const +void hyperloglog::store(std::ostream & os) const { - os.write((char *)&b_, sizeof(b_)); - os.write((char *)&M_[0], sizeof(M_[0]) * M_.size()); + assert(data.size() == size); + + char const * const bits_ptr = reinterpret_cast(std::addressof(bits)); + os.write(bits_ptr, sizeof(bits)); + + char const * const data_ptr = reinterpret_cast(data.data()); + os.write(data_ptr, sizeof(data[0]) * size); + os.flush(); if (os.fail()) { - throw std::runtime_error("[HyperLogLog] Failed to dump a HyperLogLog sketch to a file."); + throw std::runtime_error("[HyperLogLog] Failed to store a HyperLogLog sketch to a file."); } } -void hyperloglog::restore(std::istream & is) +void hyperloglog::load(std::istream & is) { try { - uint8_t b{}; - is.read((char *)&b, sizeof(b)); - hyperloglog tempHLL{b}; // Constructor might throw std::invalid_argument - is.read((char *)&(tempHLL.M_[0]), sizeof(M_[0]) * tempHLL.m_); + uint8_t restore_bits{}; + char * const bits_ptr = reinterpret_cast(std::addressof(restore_bits)); + is.read(bits_ptr, sizeof(restore_bits)); + + hyperloglog restore_hll{restore_bits}; // Constructor might throw std::invalid_argument + + char * const data_ptr = reinterpret_cast(restore_hll.data.data()); + is.read(data_ptr, sizeof(data[0]) * restore_hll.size); + if (is.fail()) { - throw std::runtime_error("[HyperLogLog] Failed to restore a HyperLogLog sketch from a file: I/O error."); + throw std::runtime_error("[HyperLogLog] Failed to load a HyperLogLog sketch from a file: I/O error."); } - swap(tempHLL); + std::swap(*this, restore_hll); } catch (std::invalid_argument const & err) { - throw std::runtime_error( - "[HyperLogLog] Failed to restore a HyperLogLog sketch from a file: Invalid bit_width."); + throw std::runtime_error("[HyperLogLog] Failed to load a HyperLogLog sketch from a file: Invalid bit_width."); } } diff --git a/src/sketch/toolbox.cpp b/src/sketch/toolbox.cpp index 3508528e..9b39c364 100644 --- a/src/sketch/toolbox.cpp +++ b/src/sketch/toolbox.cpp @@ -44,7 +44,7 @@ void precompute_union_estimates_for(std::vector & estimates, estimates[j] = counts[positions[j]]; for (int64_t j_prime = j - 1; j_prime >= 0; --j_prime) - estimates[j_prime] = static_cast(temp_hll.merge_and_estimate_SIMD(sketches[positions[j_prime]])); + estimates[j_prime] = static_cast(temp_hll.merge_and_estimate(sketches[positions[j_prime]])); } void precompute_initial_union_estimates(std::vector & estimates, @@ -62,7 +62,7 @@ void precompute_initial_union_estimates(std::vector & estimates, estimates[0] = counts[positions[0]]; for (size_t j = 1; j < positions.size(); ++j) - estimates[j] = static_cast(temp_hll.merge_and_estimate_SIMD(sketches[positions[j]])); + estimates[j] = static_cast(temp_hll.merge_and_estimate(sketches[positions[j]])); } #if 0 // Currently unused @@ -215,7 +215,7 @@ void cluster_bins(std::vector const & sketches, { // this must be a copy, because merging changes the hll sketch hyperloglog temp_hll = clustering[i].hll; - double const estimate_ij = temp_hll.merge_and_estimate_SIMD(clustering[j].hll); + double const estimate_ij = temp_hll.merge_and_estimate(clustering[j].hll); // Jaccard distance estimate double const distance = 2 - (estimates[i] + estimates[j]) / estimate_ij; dist[i].pq.push({j + first, distance}); @@ -274,8 +274,7 @@ void cluster_bins(std::vector const & sketches, // merge the two nodes with minimal distance together insert the new node into the clustering clustering.push_back({min_id, neighbor_id, std::move(clustering[min_id - first].hll)}); - estimates.emplace_back( - clustering.back().hll.merge_and_estimate_SIMD(clustering[neighbor_id - first].hll)); + estimates.emplace_back(clustering.back().hll.merge_and_estimate(clustering[neighbor_id - first].hll)); // remove old ids remaining_ids.erase(min_id); @@ -309,9 +308,9 @@ void cluster_bins(std::vector const & sketches, if (other_id == new_id || !remaining_ids.contains(other_id)) continue; - // this must be a copy, because merge_and_estimate_SIMD() changes the hll + // this must be a copy, because merge_and_estimate() changes the hll hyperloglog temp_hll = new_hll; - double const estimate_ij = temp_hll.merge_and_estimate_SIMD(clustering[other_id - first].hll); + double const estimate_ij = temp_hll.merge_and_estimate(clustering[other_id - first].hll); // Jaccard distance estimate double const distance = 2 - (estimates[other_id - first] + estimates.back()) / estimate_ij; dist[i].pq.push({new_id, distance}); diff --git a/test/unit/hibf/sketch/hyperloglog_test.cpp b/test/unit/hibf/sketch/hyperloglog_test.cpp index 54887caf..d9e1ab11 100644 --- a/test/unit/hibf/sketch/hyperloglog_test.cpp +++ b/test/unit/hibf/sketch/hyperloglog_test.cpp @@ -1,17 +1,17 @@ #include // for Test, TestInfo, Message, TestPartResult, TEST, EXPECT_EQ, EXPE... -#include // for uint8_t -#include // for size_t -#include // for path -#include // for ofstream, ifstream, basic_ostream::write, ios -#include // for uniform_int_distribution, mt19937_64 -#include // for iota_view, operator==, _Iota, iota -#include // for runtime_error, invalid_argument -#include // for allocator, basic_string, hash, string, char_traits, operator== -#include // for string_view -#include // for unordered_set -#include // for vector - +#include // for uint8_t +#include // for size_t +#include // for path +#include // for ofstream, ifstream, basic_ostream::write, ios +#include // for uniform_int_distribution, mt19937_64 +#include // for iota_view, operator==, _Iota, iota +#include // for runtime_error, invalid_argument +#include // for allocator, basic_string, hash, string, char_traits, operator== +#include // for string_view +#include // for vector + +#include // for unordered_flat_set #include // for chunk_view, operator==, chunk, chunk_fn #include // for hyperloglog #include // for operator/, sandboxed_path @@ -27,12 +27,12 @@ TEST(hyperloglog, bit_widths) TEST(hyperloglog, initialization) { - size_t const b = 6; - size_t const m = 1 << b; + size_t const bits = 6; + size_t const size = 1 << bits; - seqan::hibf::sketch::hyperloglog sketch(b); + seqan::hibf::sketch::hyperloglog sketch(bits); - EXPECT_EQ(sketch.registerSize(), m); + EXPECT_EQ(sketch.data_size(), size); // No elements were inserted, so the small values correction should be used. // Since there are only zeros in the register, the correction formula should be: @@ -61,7 +61,7 @@ TEST(hyperloglog, add_and_estimate_small) // first 4 bits of hash: 1000, rank: 2 sketch.add(9563173945158404745ULL); - // estimate = alpha * m * m / sum(2^(-M_[j])) + // estimate = alpha * m * m / sum(2^(-data[j])) // = 0.697 * 32 * 32 / (89/8) // = 0.697 * 32 * 32 / 11.125 = 64.155... @@ -71,7 +71,7 @@ TEST(hyperloglog, add_and_estimate_small) EXPECT_NEAR(sketch.estimate(), 7.899522493, 0.0000001); } -TEST(hyperloglog, clear) +TEST(hyperloglog, reset) { // Same as add_and_estimate_small seqan::hibf::sketch::hyperloglog sketch{}; @@ -87,8 +87,8 @@ TEST(hyperloglog, clear) EXPECT_NEAR(sketch.estimate(), 7.899522493, 0.0000001); - // Actual clear test - sketch.clear(); + // Actual reset test + sketch.reset(); EXPECT_EQ(sketch.estimate(), 0.0); } @@ -110,7 +110,7 @@ TEST(hyperloglog, add_and_estimate_large) { seqan::hibf::sketch::hyperloglog sketch{}; - std::unordered_set control; + robin_hood::unordered_flat_set control; for (uint64_t value : input_values) { @@ -123,32 +123,14 @@ TEST(hyperloglog, add_and_estimate_large) EXPECT_NEAR(sketch.estimate(), control.size(), control.size() * 1.04 / 4); } -TEST(hyperloglog, add_and_estimate_small_SIMD) -{ - seqan::hibf::sketch::hyperloglog sketch{}; - - sketch.add(255881241332063154ULL); - sketch.add(13132817195163223578ULL); - sketch.add(5120631300412165844ULL); - sketch.add(16862690419523416066ULL); - sketch.add(148518882728022940ULL); - sketch.add(15892358469365346306ULL); - sketch.add(10885195586503739779ULL); - sketch.add(9563173945158404745ULL); - - seqan::hibf::sketch::hyperloglog other{sketch}; - - EXPECT_NEAR(sketch.merge_and_estimate_SIMD(other), 7.89952249, 0.0000001); -} - -TEST(hyperloglog, merge_and_merge_SIMD) +TEST(hyperloglog, merge) { size_t const chunks{10u}; size_t const chunk_size{(input_values.size() + chunks - 1u) / chunks}; seqan::hibf::sketch::hyperloglog full_sketch{}; seqan::hibf::sketch::hyperloglog merge_sketch{}; - seqan::hibf::sketch::hyperloglog merge_SIMD_sketch{}; + seqan::hibf::sketch::hyperloglog merge_estimate_sketch{}; std::vector partial_sketches; @@ -169,15 +151,15 @@ TEST(hyperloglog, merge_and_merge_SIMD) for (auto & partial_sketch : partial_sketches) { merge_sketch.merge(partial_sketch); - merge_SIMD_sketch.merge_and_estimate_SIMD(partial_sketch); + merge_estimate_sketch.merge_and_estimate(partial_sketch); } // now full_sketch and merged_sketch should be equal EXPECT_EQ(full_sketch.estimate(), merge_sketch.estimate()); - EXPECT_EQ(full_sketch.estimate(), merge_SIMD_sketch.estimate()); + EXPECT_EQ(full_sketch.estimate(), merge_estimate_sketch.estimate()); } -TEST(hyperloglog, fail_dump) +TEST(hyperloglog, failed_store) { seqan::hibf::sketch::hyperloglog sketch{}; std::ofstream ostrm{"hibf_non_existent_outputfile"}; @@ -185,12 +167,12 @@ TEST(hyperloglog, fail_dump) try { - sketch.dump(ostrm); + sketch.store(ostrm); FAIL(); } catch (std::runtime_error const & exception) { - EXPECT_STREQ(exception.what(), "[HyperLogLog] Failed to dump a HyperLogLog sketch to a file."); + EXPECT_STREQ(exception.what(), "[HyperLogLog] Failed to store a HyperLogLog sketch to a file."); } catch (...) { @@ -198,7 +180,7 @@ TEST(hyperloglog, fail_dump) } } -TEST(hyperloglog, fail_restore) +TEST(hyperloglog, failed_load) { seqan::hibf::test::tmp_directory tmp_dir{}; std::filesystem::path file_name{tmp_dir.path() / "sketch.hll"}; @@ -212,12 +194,12 @@ TEST(hyperloglog, fail_restore) try { - sketch.restore(istrm); + sketch.load(istrm); FAIL(); } catch (std::runtime_error const & exception) { - EXPECT_STREQ(exception.what(), "[HyperLogLog] Failed to restore a HyperLogLog sketch from a file: I/O error."); + EXPECT_STREQ(exception.what(), "[HyperLogLog] Failed to load a HyperLogLog sketch from a file: I/O error."); } catch (...) { @@ -225,7 +207,7 @@ TEST(hyperloglog, fail_restore) } } -TEST(hyperloglog, fail_restore_bit_width) +TEST(hyperloglog, failed_load_bit_width) { seqan::hibf::test::tmp_directory tmp_dir{}; std::filesystem::path file_name{tmp_dir.path() / "wrong.hll"}; @@ -239,13 +221,13 @@ TEST(hyperloglog, fail_restore_bit_width) try { - sketch.restore(istrm); + sketch.load(istrm); FAIL(); } catch (std::runtime_error const & exception) { EXPECT_STREQ(exception.what(), - "[HyperLogLog] Failed to restore a HyperLogLog sketch from a file: Invalid bit_width."); + "[HyperLogLog] Failed to load a HyperLogLog sketch from a file: Invalid bit_width."); } catch (...) { @@ -253,26 +235,26 @@ TEST(hyperloglog, fail_restore_bit_width) } } -TEST(hyperloglog, dump_and_restore) +TEST(hyperloglog, store_and_load) { - seqan::hibf::sketch::hyperloglog dump_sketch{}; - seqan::hibf::sketch::hyperloglog restore_sketch{}; + seqan::hibf::sketch::hyperloglog store_sketch{}; + seqan::hibf::sketch::hyperloglog load_sketch{}; for (uint64_t value : input_values) - dump_sketch.add(value); + store_sketch.add(value); // create temp file seqan::hibf::test::tmp_directory tmp_dir{}; - std::filesystem::path dump_filename{tmp_dir.path() / "dump.hll"}; + std::filesystem::path store_filename{tmp_dir.path() / "dump.hll"}; - // dump sketch - std::ofstream ostrm(dump_filename, std::ios::binary); - dump_sketch.dump(ostrm); + // store sketch + std::ofstream ostrm(store_filename, std::ios::binary); + store_sketch.store(ostrm); - // restore sketch - std::ifstream istrm(dump_filename, std::ios::binary); - restore_sketch.restore(istrm); + // load sketch + std::ifstream istrm(store_filename, std::ios::binary); + load_sketch.load(istrm); - // now dump_sketch and restore_sketch should be equal - EXPECT_EQ(dump_sketch.estimate(), restore_sketch.estimate()); + // now store_sketch and load_sketch should be equal + EXPECT_EQ(store_sketch.estimate(), load_sketch.estimate()); } diff --git a/test/unit/hibf/sketch/toolbox_test.cpp b/test/unit/hibf/sketch/toolbox_test.cpp index 7cac9a52..844b9949 100644 --- a/test/unit/hibf/sketch/toolbox_test.cpp +++ b/test/unit/hibf/sketch/toolbox_test.cpp @@ -261,9 +261,9 @@ TEST_F(toolbox_test, cluster_bins) } } -TEST_F(toolbox_test, rearrange_bins) -{ - seqan::hibf::sketch::toolbox::rearrange_bins(test_sketches, test_kmer_counts, test_positions, 0.9, 1); +// TEST_F(toolbox_test, rearrange_bins) +// { +// seqan::hibf::sketch::toolbox::rearrange_bins(test_sketches, test_kmer_counts, test_positions, 0.9, 1); - EXPECT_RANGE_EQ(test_positions, (std::vector{3, 2, 0, 1})); -} +// EXPECT_RANGE_EQ(test_positions, (std::vector{3, 2, 0, 1})); +// }