Skip to content

Commit

Permalink
[FEATURE] separate sketching and layouting. (#129)
Browse files Browse the repository at this point in the history
* [FEATURE] separate sketching and layouting.

* [MISC] automatic linting

* fix

* doc

* [MISC] automatic linting

---------

Co-authored-by: seqan-actions[bot] <[email protected]>
  • Loading branch information
smehringer and seqan-actions authored Oct 12, 2023
1 parent 49c2bd5 commit 570264f
Show file tree
Hide file tree
Showing 6 changed files with 84 additions and 42 deletions.
16 changes: 7 additions & 9 deletions include/hibf/layout/compute_layout.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,17 +10,15 @@
namespace seqan::hibf::layout
{

/*!\brief Computes the layout and stores the kmer_counts and sketches in the respective vectors for further use.
/*!\brief Computes the layout.
* \ingroup hibf_layout
* \param config The configuration to compute the layout with.
* \param[in,out] kmer_counts The vector that will store the kmer counts (estimations).
* \param[in,out] sketches The vector that will store the sketches.
* \param[in] config The configuration to compute the layout with.
* \param[in] kmer_counts The kmer counts (estimations) of the user bins, e.g. as computed by sketch::compute_sketches.
* \param[in] sketches The sketches of the user bins, e.g. as computed by sketch::compute_sketches.
* \returns layout
*/
layout
compute_layout(config const & config, std::vector<size_t> & kmer_counts, std::vector<sketch::hyperloglog> & sketches);

//!\overload
layout compute_layout(config const & config);
layout compute_layout(config const & config,
std::vector<size_t> const & kmer_counts,
std::vector<sketch::hyperloglog> const & sketches);

} // namespace seqan::hibf::layout
22 changes: 22 additions & 0 deletions include/hibf/sketch/compute_sketches.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
#pragma once

#include <cstddef> // for size_t
#include <vector> // for vector

#include <hibf/config.hpp> // for config
#include <hibf/sketch/hyperloglog.hpp> // for hyperloglog

namespace seqan::hibf::sketch
{

/*!\brief Computes the kmer_counts and sketches and stores them in the respective vectors for further use.
* \ingroup hibf_layout
* \param[in] config The configuration to compute the sketches with.
* \param[in,out] kmer_counts The vector that will store the kmer counts (estimations); resized to one entry per user bin.
* \param[in,out] sketches The vector that will store the sketches; resized to one entry per user bin.
*/
void compute_sketches(config const & config,
std::vector<size_t> & kmer_counts,
std::vector<sketch::hyperloglog> & sketches);

} // namespace seqan::hibf::sketch
1 change: 1 addition & 0 deletions src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ set (HIBF_SOURCE_FILES
layout/layout.cpp
layout/compute_fpr_correction.cpp
layout/compute_layout.cpp
sketch/compute_sketches.cpp
layout/graph.cpp
layout/hierarchical_binning.cpp
misc/print.cpp
Expand Down
8 changes: 7 additions & 1 deletion src/hierarchical_interleaved_bloom_filter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
#include <hibf/layout/graph.hpp> // for graph
#include <hibf/layout/layout.hpp> // for layout
#include <hibf/misc/timer.hpp> // for timer
#include <hibf/sketch/compute_sketches.hpp> // for compute_sketches

namespace seqan::hibf
{
Expand Down Expand Up @@ -197,7 +198,12 @@ void build_index(hierarchical_interleaved_bloom_filter & hibf,
hierarchical_interleaved_bloom_filter::hierarchical_interleaved_bloom_filter(config & configuration)
{
configuration.validate_and_set_defaults();
auto layout = layout::compute_layout(configuration);

std::vector<size_t> kmer_counts{};
std::vector<sketch::hyperloglog> sketches{};
sketch::compute_sketches(configuration, kmer_counts, sketches);

auto layout = layout::compute_layout(configuration, kmer_counts, sketches);
build_index(*this, configuration, layout);
}

Expand Down
35 changes: 3 additions & 32 deletions src/layout/compute_layout.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,9 @@
namespace seqan::hibf::layout
{

layout
compute_layout(config const & config, std::vector<size_t> & kmer_counts, std::vector<sketch::hyperloglog> & sketches)
layout compute_layout(config const & config,
std::vector<size_t> const & kmer_counts,
std::vector<sketch::hyperloglog> const & sketches)
{
layout resulting_layout{};

Expand All @@ -30,28 +31,6 @@ compute_layout(config const & config, std::vector<size_t> & kmer_counts, std::ve
std::stringstream output_buffer;
std::stringstream header_buffer;

// compute sketches
sketches.resize(config.number_of_user_bins);
kmer_counts.resize(config.number_of_user_bins);

robin_hood::unordered_flat_set<uint64_t> kmers;
#pragma omp parallel for schedule(dynamic) num_threads(config.threads) private(kmers)
for (size_t i = 0; i < config.number_of_user_bins; ++i)
{
seqan::hibf::sketch::hyperloglog sketch(config.sketch_bits);

kmers.clear();
config.input_fn(i, std::inserter(kmers, kmers.begin()));

for (auto k_hash : kmers)
sketch.add(k_hash);

// #pragma omp critical
sketches[i] = sketch;
}

sketch::estimate_kmer_counts(sketches, kmer_counts);

data_store store{.false_positive_rate = config.maximum_false_positive_rate,
.hibf_layout = &resulting_layout,
.kmer_counts = std::addressof(kmer_counts),
Expand All @@ -74,12 +53,4 @@ compute_layout(config const & config, std::vector<size_t> & kmer_counts, std::ve
return *store.hibf_layout;
}

layout compute_layout(config const & config)
{
std::vector<size_t> kmer_counts{};
std::vector<sketch::hyperloglog> sketches{};

return compute_layout(config, kmer_counts, sketches);
}

} // namespace seqan::hibf::layout
44 changes: 44 additions & 0 deletions src/sketch/compute_sketches.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
#include <algorithm> // for __sort_fn, sort
#include <cinttypes> // for uint64_t
#include <cstddef> // for size_t
#include <functional> // for identity, function
#include <iterator> // for inserter
#include <sstream> // for basic_stringstream, stringstream
#include <utility> // for addressof
#include <vector> // for vector

#include <hibf/contrib/robin_hood.hpp> // for unordered_flat_set
#include <hibf/sketch/compute_sketches.hpp> // for compute_sketches
#include <hibf/sketch/estimate_kmer_counts.hpp> // for estimate_kmer_counts

namespace seqan::hibf::sketch
{

/*!\brief Builds one HyperLogLog sketch per user bin and derives the kmer count estimates from them.
 * \param[in] config The configuration; `number_of_user_bins`, `threads`, `sketch_bits` and `input_fn` are read.
 * \param[in,out] kmer_counts Resized to `config.number_of_user_bins`, then handed to `estimate_kmer_counts`
 *                            together with the sketches.
 * \param[in,out] sketches Resized to `config.number_of_user_bins`; element `i` receives the sketch of user bin `i`.
 */
void compute_sketches(config const & config,
                      std::vector<size_t> & kmer_counts,
                      std::vector<sketch::hyperloglog> & sketches)
{
    // One slot per user bin; allocated up front so the parallel loop only writes to existing elements.
    sketches.resize(config.number_of_user_bins);
    kmer_counts.resize(config.number_of_user_bins);

    // Scratch set for the hashes of a single user bin. The `private` clause gives every thread
    // its own default-constructed instance, which is cleared and reused across iterations.
    robin_hood::unordered_flat_set<uint64_t> kmers;
#pragma omp parallel for schedule(dynamic) num_threads(config.threads) private(kmers)
    for (size_t i = 0; i < config.number_of_user_bins; ++i)
    {
        seqan::hibf::sketch::hyperloglog sketch(config.sketch_bits);

        kmers.clear();
        config.input_fn(i, std::inserter(kmers, kmers.begin()));

        for (auto const k_hash : kmers)
            sketch.add(k_hash);

        // Each iteration writes only to its own slot `i`, so no critical section is required.
        // The local sketch is not used afterwards, hence it is moved instead of copied.
        sketches[i] = std::move(sketch);
    }

    sketch::estimate_kmer_counts(sketches, kmer_counts);
}

} // namespace seqan::hibf::sketch

1 comment on commit 570264f

@vercel
Copy link

@vercel vercel bot commented on 570264f Oct 12, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Successfully deployed to the following URLs:

hibf – ./

hibf.vercel.app
hibf-git-main-seqan.vercel.app
hibf-seqan.vercel.app

Please sign in to comment.