Skip to content

Commit

Permalink
cpp
Browse files Browse the repository at this point in the history
  • Loading branch information
eseiler committed Jul 12, 2024
1 parent bec9e77 commit 4e2cf17
Show file tree
Hide file tree
Showing 3 changed files with 78 additions and 55 deletions.
71 changes: 16 additions & 55 deletions include/hibf/sketch/minHashes.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,8 @@
* \brief Provides seqan::hibf::sketch::minHashes.
*/

#include <algorithm>
#pragma once

#include <cstdint>
#include <span>
#include <vector>
Expand All @@ -29,6 +30,13 @@ namespace seqan::hibf::sketch
*/
struct minHashes
{
static constexpr uint64_t register_id_mask{15}; /// ...00001111
static constexpr size_t num_sketches{16};
static constexpr size_t sketch_size{40};

//!\brief A table of sketches. For LSH we need multiple sketches, stored in a table.
std::vector<std::vector<uint64_t>> table{};

/*!\name Constructors, destructor and assignment
* \{
*/
Expand All @@ -38,65 +46,18 @@ struct minHashes
minHashes(minHashes &&) = default; //!< Defaulted.

Check warning on line 46 in include/hibf/sketch/minHashes.hpp

View check run for this annotation

Codecov / codecov/patch

include/hibf/sketch/minHashes.hpp#L46

Added line #L46 was not covered by tests
minHashes & operator=(minHashes &&) = default; //!< Defaulted.
~minHashes() = default; //!< Defaulted.

//!\brief construct from a vector of the smallest values in a set (sorted ascending).
minHashes(std::vector<uint64_t> const & smallest_values)
{
table.resize(num_sketches);

for (auto & elem : table)
elem.reserve(sketch_size);
minHashes(std::vector<uint64_t> const & smallest_values);
//!\}

for (uint64_t const hash : smallest_values)
{
auto & hash_table = table[hash & register_id_mask];
if (hash_table.size() < sketch_size)
hash_table.push_back(hash >> 4);
}
}

static constexpr uint64_t register_id_mask{15}; /// ...00001111
static constexpr size_t num_sketches{16};
static constexpr size_t sketch_size{40};

//!\brief A table of sketches. For LSH we need multiple sketches, stored in a table.
std::vector<std::vector<uint64_t>> table{};

//!\brief Checks whether the minHash table is completely filled
bool is_valid() const
{
return table.size() == num_sketches
&& std::ranges::all_of(table,
[](auto const & minHash_sketch)
{
return minHash_sketch.size() == sketch_size;
});
}
//!\brief Checks whether the minHash table is completely filled.
bool is_valid() const;

//!\brief Adds more minhash values to an existing but incomplete table.
void fill_up_sketches(std::span<uint64_t> const & more_smallest_values)
{
for (uint64_t const hash : more_smallest_values)
{
auto & hash_table = table[hash & register_id_mask];
assert(std::ranges::find(hash_table, hash) == hash_table.end()); // hashes should be unique
if (hash_table.size() < sketch_size)
hash_table.push_back(hash >> 4);
}
}

//!\brief Miscellaneous function to update a heap with a new element, when preparing a heap for minHash sketches
static void update_heap_with(std::vector<uint64_t> & heap, uint64_t const k_hash)
{
// Do nothing if k_hash is bigger than the current biggest element in the (max) heap.
if (k_hash >= heap[0])
return;
void fill_up_sketches(std::span<uint64_t> const & more_smallest_values);

// we do not need a guard (hash table) to check for duplications because `kmers` is already a set
std::ranges::pop_heap(heap); // max elements move to end of vector
heap.back() = k_hash; // replace last elements instead of physically popping and pushing
std::ranges::push_heap(heap); // last elements is rearranged in the heap to be pushed
}
//!\brief Miscellaneous function to update a heap with a new element, when preparing a heap for minHash sketches.
static void update_heap_with(std::vector<uint64_t> & heap, uint64_t const k_hash);
};

} // namespace seqan::hibf::sketch
1 change: 1 addition & 0 deletions src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ set (HIBF_SOURCE_FILES
misc/print.cpp
sketch/toolbox.cpp
sketch/hyperloglog.cpp
sketch/minHashes.cpp
build/insert_into_ibf.cpp
build/compute_kmers.cpp
build/construct_ibf.cpp)
Expand Down
61 changes: 61 additions & 0 deletions src/sketch/minHashes.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
// SPDX-FileCopyrightText: 2006-2024, Knut Reinert & Freie Universität Berlin
// SPDX-FileCopyrightText: 2016-2024, Knut Reinert & MPI für molekulare Genetik
// SPDX-License-Identifier: BSD-3-Clause

#include <algorithm>
#include <cassert>

#include <hibf/sketch/minHashes.hpp>

namespace seqan::hibf::sketch
{

/*!\brief Builds the sketch table from a list of hash values.
 * \param[in] smallest_values The hash values to distribute over the registers (the header documents them as the
 *                            smallest values of a set, sorted ascending).
 *
 * \details
 * The table is sized to `num_sketches` registers, each with capacity for `sketch_size` entries. For every input
 * value, the bits selected by `register_id_mask` pick the register and the value shifted right by 4 is appended,
 * unless that register already holds `sketch_size` entries.
 */
minHashes::minHashes(std::vector<uint64_t> const & smallest_values)
{
    table.resize(num_sketches);

    // Allocate each register up front so the appends below never reallocate.
    std::ranges::for_each(table,
                          [](std::vector<uint64_t> & sketch)
                          {
                              sketch.reserve(sketch_size);
                          });

    for (uint64_t const value : smallest_values)
    {
        std::vector<uint64_t> & sketch = table[value & register_id_mask];

        // A full register ignores any further values that map to it.
        if (sketch.size() >= sketch_size)
            continue;

        // Store the value without the bits that were used to select the register.
        sketch.push_back(value >> 4);
    }
}

/*!\brief Checks whether the minHash table is completely filled.
 * \returns `true` if the table has exactly `num_sketches` registers and every register holds exactly
 *          `sketch_size` values; `false` otherwise.
 */
bool minHashes::is_valid() const
{
    if (table.size() != num_sketches)
        return false;

    // A single under- or over-filled register makes the whole table invalid.
    for (auto const & sketch : table)
    {
        if (sketch.size() != sketch_size)
            return false;
    }

    return true;
}

/*!\brief Adds more minhash values to an existing but incomplete table.
 * \param[in] more_smallest_values Additional hash values to distribute over the registers.
 *
 * \details
 * For every input value, the bits selected by `register_id_mask` pick the register and the value shifted right
 * by 4 is appended. Registers that already hold `sketch_size` values are left unchanged.
 *
 * The table must already contain its registers (e.g. built via the constructor taking a vector); this is checked
 * in debug builds only.
 */
void minHashes::fill_up_sketches(std::span<uint64_t> const & more_smallest_values)
{
    // Every index the mask can produce must exist, otherwise the register lookup below is out of bounds.
    assert(table.size() > register_id_mask);

    for (uint64_t const hash : more_smallest_values)
    {
        auto & hash_table = table[hash & register_id_mask];
        uint64_t const stored_value = hash >> 4;

        // Hashes should be unique. Registers hold the shifted value, so the shifted value is what must be
        // searched for; comparing against the unshifted hash would not detect a duplicate.
        assert(std::ranges::find(hash_table, stored_value) == hash_table.end());

        if (hash_table.size() < sketch_size)
            hash_table.push_back(stored_value);
    }
}

/*!\brief Miscellaneous function to update a heap with a new element, when preparing a heap for minHash sketches.
 * \param[in,out] heap   A max heap (largest element at the front); its size is never changed by this function.
 * \param[in]     k_hash The candidate value.
 *
 * \details
 * If `k_hash` is smaller than the current maximum, the maximum is replaced by `k_hash` and the heap property is
 * restored. Otherwise the heap is left untouched. An empty heap has no maximum to replace and is also left
 * untouched.
 *
 * No duplicate check is performed here; the original note states that the incoming k-mer hashes already form a
 * set.
 */
void minHashes::update_heap_with(std::vector<uint64_t> & heap, uint64_t const k_hash)
{
    // Do nothing if the heap is empty (reading its front would be undefined behaviour) or if k_hash is not
    // smaller than the current biggest element in the (max) heap.
    if (heap.empty() || k_hash >= heap.front())
        return;

    std::ranges::pop_heap(heap);  // max element moves to the end of the vector
    heap.back() = k_hash;         // overwrite it instead of physically popping and pushing
    std::ranges::push_heap(heap); // sift the new last element into its place in the heap
}

} // namespace seqan::hibf::sketch

0 comments on commit 4e2cf17

Please sign in to comment.