Skip to content

Commit

Permalink
review
Browse files Browse the repository at this point in the history
  • Loading branch information
eseiler committed Jul 12, 2024
1 parent 5d7acc9 commit 82e9039
Show file tree
Hide file tree
Showing 6 changed files with 191 additions and 142 deletions.
67 changes: 33 additions & 34 deletions include/hibf/sketch/minHashes.hpp
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
// SPDX-FileCopyrightText: 2006-2023, Knut Reinert & Freie Universität Berlin
// SPDX-FileCopyrightText: 2016-2023, Knut Reinert & MPI für molekulare Genetik
// SPDX-FileCopyrightText: 2006-2024, Knut Reinert & Freie Universität Berlin
// SPDX-FileCopyrightText: 2016-2024, Knut Reinert & MPI für molekulare Genetik
// SPDX-License-Identifier: BSD-3-Clause

/*!\file
* \author Svenja Mehringer <svenja.mehringer AT fu-berlin.de>
* \brief Provides seqan::hibf::sketch::minHashes.
*/

#include <algorithm>
#include <span>
#include <vector>

Expand Down Expand Up @@ -36,64 +37,62 @@ struct minHashes
~minHashes() = default; //!< Defaulted.

//!\brief construct from a vector of the smallest values in a set (sorted ascending).
minHashes(std::vector<uint64_t> const & smalles_values)
minHashes(std::vector<uint64_t> const & smallest_values)
{
table.resize(num_sketches);

for (size_t i = 0; i < num_sketches; ++i)
{
table[i].reserve(sketch_size);
}
for (auto & elem : table)
elem.reserve(sketch_size);

for (uint64_t const hash : smalles_values)
for (uint64_t const hash : smallest_values)
{
uint64_t const register_id{hash & register_id_mask};
if (table[register_id].size() < sketch_size)
table[register_id].push_back(hash >> 4);
auto & hash_table = table[hash & register_id_mask];
if (hash_table.size() < sketch_size)
hash_table.push_back(hash >> 4);
}
}

static inline uint64_t const register_id_mask{15}; /// ...00001111
static inline size_t const num_sketches{16};
static inline size_t const sketch_size{40};
static constexpr uint64_t register_id_mask{15}; /// ...00001111
static constexpr size_t num_sketches{16};
static constexpr size_t sketch_size{40};

//!\brief A table of sketches. For LSH we need multiple sketches, stored in a table.
std::vector<std::vector<uint64_t>> table{};

//!\brief Checks whether the minHash table is completely filled
bool is_valid() const
{
bool result{table.size() == num_sketches};
for (auto const & minHash_sketch : table)
{
if (minHash_sketch.size() < sketch_size)
result = false;
}
return result;
return table.size() == num_sketches
&& std::ranges::all_of(table,
[](auto const & minHash_sketch)
{
return minHash_sketch.size() == sketch_size;
});
}

//!\brief Adds more minhash values to an existing but incomplete table.
void fill_up_sketches(std::span<uint64_t> const & more_smalles_values)
void fill_up_sketches(std::span<uint64_t> const & more_smallest_values)
{
for (uint64_t const hash : more_smalles_values)
for (uint64_t const hash : more_smallest_values)
{
uint64_t const register_id{hash & register_id_mask};
assert(table[register_id].find(hash) == table[register_id].end()); // hashes should be unique
if (table[register_id].size() < sketch_size)
table[register_id].push_back(hash >> 4);
auto & hash_table = table[hash & register_id_mask];
assert(std::ranges::find(hash_table, hash) == hash_table.end()); // hashes should be unique
if (hash_table.size() < sketch_size)
hash_table.push_back(hash >> 4);
}
}

//!\brief Miscallenious function to update a heap with a new element, when preparing a heap for minHash sketches
static void update_heap_with(std::vector<uint64_t> & heap, uint64_t const k_hash)
{
if (k_hash < heap[0]) // 0 because it is a max heap
{
// we do not need a guard (hash table) to check for duplications because `kmers` is already a set
std::pop_heap(heap.begin(), heap.end()); // max elements move to end of vector
heap.back() = k_hash; // replace last elements instead of physically popping and pushing
std::push_heap(heap.begin(), heap.end()); // last elements is rearranged in the heap to be pushed
}
// Do nothing if k_hash is bigger than the current biggest element in the (max) heap.
if (k_hash >= heap[0])
return;

// we do not need a guard (hash table) to check for duplications because `kmers` is already a set
std::ranges::pop_heap(heap); // max elements move to end of vector
heap.back() = k_hash; // replace last elements instead of physically popping and pushing
std::ranges::push_heap(heap); // last elements is rearranged in the heap to be pushed
}
};

Expand Down
2 changes: 1 addition & 1 deletion test/include/hibf/test/expect_range_eq.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ void PrintTo(t const & value, std::ostream * out)
namespace seqan::hibf::test
{

#define EXPECT_RANGE_EQ(val1, val2) EXPECT_PRED_FORMAT2(::seqan::hibf::test::expect_range_eq{}, val1, val2);
#define EXPECT_RANGE_EQ(val1, val2) EXPECT_PRED_FORMAT2(::seqan::hibf::test::expect_range_eq{}, val1, val2)

struct expect_range_eq
{
Expand Down
1 change: 1 addition & 0 deletions test/performance/sketch/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,5 @@
# SPDX-FileCopyrightText: 2016-2024, Knut Reinert & MPI für molekulare Genetik
# SPDX-License-Identifier: BSD-3-Clause

hibf_benchmark (compute_sketches_benchmark.cpp)
hibf_benchmark (hyperloglog_benchmark.cpp)
49 changes: 49 additions & 0 deletions test/performance/sketch/compute_sketches_benchmark.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
// SPDX-FileCopyrightText: 2006-2024, Knut Reinert & Freie Universität Berlin
// SPDX-FileCopyrightText: 2016-2024, Knut Reinert & MPI für molekulare Genetik
// SPDX-License-Identifier: BSD-3-Clause

#include <benchmark/benchmark.h>

#include <hibf/sketch/compute_sketches.hpp>

enum class sketch : uint8_t
{
Hyperloglog,
MinHashes
};

template <sketch sketch_t>
void compute_sketches(benchmark::State & state)
{
auto create_hashes = [&](size_t const ub_id, seqan::hibf::insert_iterator it)
{
// 0 = [0, 10000]
// 1 = [10000, 20000]
// 1 = [20000, 30000]
for (size_t i = ub_id * 10000; i < (ub_id + 1) * 10000; ++i)
it = i;
};

std::vector<size_t> kmer_counts;
[[maybe_unused]] std::vector<seqan::hibf::sketch::minHashes> minHash_sketches;
std::vector<seqan::hibf::sketch::hyperloglog> hyperloglog_sketches;

seqan::hibf::config config{};
config.number_of_user_bins = 16;
config.input_fn = create_hashes;
config.sketch_bits = 12;

for (auto _ : state)
{
if constexpr (sketch_t == sketch::MinHashes)
seqan::hibf::sketch::compute_sketches_with_minhash(config,
kmer_counts,
hyperloglog_sketches,
minHash_sketches);
else
seqan::hibf::sketch::compute_sketches(config, kmer_counts, hyperloglog_sketches);
}
}

BENCHMARK_TEMPLATE(compute_sketches, sketch::Hyperloglog);
BENCHMARK_TEMPLATE(compute_sketches, sketch::MinHashes);
131 changes: 67 additions & 64 deletions test/unit/hibf/sketch/compute_sketches_test.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
// SPDX-FileCopyrightText: 2006-2023, Knut Reinert & Freie Universität Berlin
// SPDX-FileCopyrightText: 2016-2023, Knut Reinert & MPI für molekulare Genetik
// SPDX-FileCopyrightText: 2006-2024, Knut Reinert & Freie Universität Berlin
// SPDX-FileCopyrightText: 2016-2024, Knut Reinert & MPI für molekulare Genetik
// SPDX-License-Identifier: BSD-3-Clause

#include <gtest/gtest.h> // for Test, Message, TestPartResult, EXPECT_EQ, TestInfo, ASSERT_EQ
Expand All @@ -10,80 +10,83 @@
#include <vector> // for allocator, vector

#include <hibf/sketch/compute_sketches.hpp>
#include <hibf/test/expect_range_eq.hpp> // for expect_range_eq, EXPECT_RANGE_EQ

TEST(compute_sketches_test, hyperloglog_and_kmer_counts)
class compute_sketches_test : public ::testing::Test
{
auto create_hashes = [&](size_t const ub_id, seqan::hibf::insert_iterator it)
{
// 0 = [0, 10000]
// 1 = [10000, 20000]
// 1 = [20000, 30000]
for (size_t i = ub_id * 10000; i < (ub_id + 1) * 10000; ++i)
it = i;
};

public:
std::vector<size_t> kmer_counts;
std::vector<seqan::hibf::sketch::hyperloglog> sketches;

seqan::hibf::config config{};
config.number_of_user_bins = 3;
config.input_fn = create_hashes;
config.sketch_bits = 12;

seqan::hibf::sketch::compute_sketches(config, kmer_counts, sketches);

ASSERT_EQ(kmer_counts.size(), 3);
ASSERT_EQ(sketches.size(), 3);

EXPECT_EQ(kmer_counts[0], 18405);
EXPECT_EQ(kmer_counts[1], 18429);
EXPECT_EQ(kmer_counts[2], 18427);

EXPECT_DOUBLE_EQ(sketches[0].estimate(), 18405.11680625227);
EXPECT_DOUBLE_EQ(sketches[1].estimate(), 18429.448850770688);
EXPECT_DOUBLE_EQ(sketches[2].estimate(), 18427.74236590719);
}

TEST(compute_sketches_test, with_minHash)
{
auto create_hashes = [&](size_t const ub_id, seqan::hibf::insert_iterator it)
{
// 0 = [0, 10000]
// 1 = [10000, 20000]
// 1 = [20000, 30000]
for (size_t i = ub_id * 10000; i < (ub_id + 1) * 10000; ++i)
it = i;
};

std::vector<seqan::hibf::sketch::minHashes> minHash_sketches;
std::vector<seqan::hibf::sketch::hyperloglog> hyperloglog_sketches;
std::vector<seqan::hibf::sketch::minHashes> minHash_sketches;
seqan::hibf::config config;

seqan::hibf::config config{};
config.number_of_user_bins = 3;
config.input_fn = create_hashes;
config.sketch_bits = 12;

seqan::hibf::sketch::compute_sketches_with_minhash(config, hyperloglog_sketches, minHash_sketches);

ASSERT_EQ(minHash_sketches.size(), 3);
ASSERT_EQ(hyperloglog_sketches.size(), 3);
void SetUp() override
{
config.number_of_user_bins = 3;
config.sketch_bits = 12;
config.input_fn = [&](size_t const ub_id, seqan::hibf::insert_iterator it)
{
// 0 = [0, 10000]
// 1 = [10000, 20000]
// 1 = [20000, 30000]
for (size_t i = ub_id * 10000; i < (ub_id + 1) * 10000; ++i)
it = i;
};
}

EXPECT_DOUBLE_EQ(hyperloglog_sketches[0].estimate(), 18405.11680625227);
EXPECT_DOUBLE_EQ(hyperloglog_sketches[1].estimate(), 18429.448850770688);
EXPECT_DOUBLE_EQ(hyperloglog_sketches[2].estimate(), 18427.74236590719);
void check_kmer_counts()
{
ASSERT_EQ(kmer_counts.size(), 3);
EXPECT_EQ(kmer_counts[0], 18405);
EXPECT_EQ(kmer_counts[1], 18429);
EXPECT_EQ(kmer_counts[2], 18427);
}

std::vector<size_t> const start{0, 625, 1250};
for (size_t ub_id = 0; ub_id < config.number_of_user_bins; ++ub_id)
void check_hyperloglog_sketches()
{
ASSERT_TRUE(minHash_sketches[ub_id].is_valid());
ASSERT_EQ(hyperloglog_sketches.size(), 3);
constexpr double abs_error = 0.00001;
EXPECT_NEAR(hyperloglog_sketches[0].estimate(), 18405.11680625227, abs_error);
EXPECT_NEAR(hyperloglog_sketches[1].estimate(), 18429.448850770688, abs_error);
EXPECT_NEAR(hyperloglog_sketches[2].estimate(), 18427.74236590719, abs_error);
}

for (size_t sidx = 0; sidx < seqan::hibf::sketch::minHashes::num_sketches; ++sidx)
void check_minHash_sketches()
{
ASSERT_EQ(minHash_sketches.size(), 3);
size_t start{};
size_t ub_id{};
for (auto & minHash : minHash_sketches)
{
for (size_t hidx = 0; hidx < seqan::hibf::sketch::minHashes::sketch_size; ++hidx)
ASSERT_TRUE(minHash.is_valid()) << "ub_id: " << ub_id;
size_t sid{};
for (auto & sketch : minHash.table)
{
EXPECT_EQ(minHash_sketches[ub_id].table[sidx][hidx], start[ub_id] + hidx)
<< "ub_id:" << ub_id << " sidx:" << sidx << " hidx:" << hidx;
ASSERT_EQ(sketch.size(), seqan::hibf::sketch::minHashes::sketch_size)
<< "ub_id: " << ub_id << "sid: " << sid;
EXPECT_RANGE_EQ(sketch, std::views::iota(start, start + seqan::hibf::sketch::minHashes::sketch_size)) << "ub_id: " << ub_id << "sid: " << sid;
++sid;
}
start += 625;
++ub_id;
}
}
};

TEST_F(compute_sketches_test, hyperloglog_and_kmer_counts)
{
seqan::hibf::sketch::compute_sketches(this->config, this->kmer_counts, this->hyperloglog_sketches);

this->check_kmer_counts();
this->check_hyperloglog_sketches();
}

TEST_F(compute_sketches_test, with_minHash)
{
seqan::hibf::sketch::compute_sketches_with_minhash(this->config,
this->hyperloglog_sketches,
this->minHash_sketches);

this->check_hyperloglog_sketches();
this->check_minHash_sketches();
}
Loading

0 comments on commit 82e9039

Please sign in to comment.