diff --git a/include/hibf/layout/compute_layout.hpp b/include/hibf/layout/compute_layout.hpp index 37652e24..b7ece6d8 100644 --- a/include/hibf/layout/compute_layout.hpp +++ b/include/hibf/layout/compute_layout.hpp @@ -10,17 +10,15 @@ namespace seqan::hibf::layout { -/*!\brief Computes the layout and stores the kmer_counts and sketches in the respective vectors for further use. +/*!\brief Computes the layout. * \ingroup hibf_layout - * \param config The configuration to compute the layout with. - * \param[in,out] kmer_counts The vector that will store the kmer counts (estimations). - * \param[in,out] sketches The vector that will store the sketches. + * \param[in] config The configuration to compute the layout with. + * \param[in] kmer_counts The vector that stores the kmer counts (estimations). + * \param[in] sketches The vector that stores the sketches. * \returns layout */ -layout -compute_layout(config const & config, std::vector & kmer_counts, std::vector & sketches); - -//!\overload -layout compute_layout(config const & config); +layout compute_layout(config const & config, + std::vector const & kmer_counts, + std::vector const & sketches); } // namespace seqan::hibf::layout diff --git a/include/hibf/sketch/compute_sketches.hpp b/include/hibf/sketch/compute_sketches.hpp new file mode 100644 index 00000000..bb610bf5 --- /dev/null +++ b/include/hibf/sketch/compute_sketches.hpp @@ -0,0 +1,22 @@ +#pragma once + +#include // for size_t +#include // for vector + +#include // for config +#include // for hyperloglog + +namespace seqan::hibf::sketch +{ + +/*!\brief Computes the kmer_counts and sketches and stores them in the respective vectors for further use. + * \ingroup hibf_layout + * \param[in] config The configuration to compute the sketches with. + * \param[in,out] kmer_counts The vector that will store the kmer counts (estimations). + * \param[in,out] sketches The vector that will store the sketches. 
+ */ +void compute_sketches(config const & config, + std::vector & kmer_counts, + std::vector & sketches); + +} // namespace seqan::hibf::sketch diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 234b9358..21ade80f 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -6,6 +6,7 @@ set (HIBF_SOURCE_FILES layout/layout.cpp layout/compute_fpr_correction.cpp layout/compute_layout.cpp + sketch/compute_sketches.cpp layout/graph.cpp layout/hierarchical_binning.cpp misc/print.cpp diff --git a/src/hierarchical_interleaved_bloom_filter.cpp b/src/hierarchical_interleaved_bloom_filter.cpp index b30bef75..d7c8df3b 100644 --- a/src/hierarchical_interleaved_bloom_filter.cpp +++ b/src/hierarchical_interleaved_bloom_filter.cpp @@ -30,6 +30,7 @@ #include // for graph #include // for layout #include // for timer +#include // for compute_sketches namespace seqan::hibf { @@ -197,7 +198,12 @@ void build_index(hierarchical_interleaved_bloom_filter & hibf, hierarchical_interleaved_bloom_filter::hierarchical_interleaved_bloom_filter(config & configuration) { configuration.validate_and_set_defaults(); - auto layout = layout::compute_layout(configuration); + + std::vector kmer_counts{}; + std::vector sketches{}; + sketch::compute_sketches(configuration, kmer_counts, sketches); + + auto layout = layout::compute_layout(configuration, kmer_counts, sketches); build_index(*this, configuration, layout); } diff --git a/src/layout/compute_layout.cpp b/src/layout/compute_layout.cpp index 9f669fea..c316a315 100644 --- a/src/layout/compute_layout.cpp +++ b/src/layout/compute_layout.cpp @@ -20,8 +20,9 @@ namespace seqan::hibf::layout { -layout -compute_layout(config const & config, std::vector & kmer_counts, std::vector & sketches) +layout compute_layout(config const & config, + std::vector const & kmer_counts, + std::vector const & sketches) { layout resulting_layout{}; @@ -30,28 +31,6 @@ compute_layout(config const & config, std::vector & kmer_counts, std::ve std::stringstream 
output_buffer; std::stringstream header_buffer; - // compute sketches - sketches.resize(config.number_of_user_bins); - kmer_counts.resize(config.number_of_user_bins); - - robin_hood::unordered_flat_set kmers; -#pragma omp parallel for schedule(dynamic) num_threads(config.threads) private(kmers) - for (size_t i = 0; i < config.number_of_user_bins; ++i) - { - seqan::hibf::sketch::hyperloglog sketch(config.sketch_bits); - - kmers.clear(); - config.input_fn(i, std::inserter(kmers, kmers.begin())); - - for (auto k_hash : kmers) - sketch.add(k_hash); - - // #pragma omp critical - sketches[i] = sketch; - } - - sketch::estimate_kmer_counts(sketches, kmer_counts); - data_store store{.false_positive_rate = config.maximum_false_positive_rate, .hibf_layout = &resulting_layout, .kmer_counts = std::addressof(kmer_counts), @@ -74,12 +53,4 @@ compute_layout(config const & config, std::vector & kmer_counts, std::ve return *store.hibf_layout; } -layout compute_layout(config const & config) -{ - std::vector kmer_counts{}; - std::vector sketches{}; - - return compute_layout(config, kmer_counts, sketches); -} - } // namespace seqan::hibf::layout diff --git a/src/sketch/compute_sketches.cpp b/src/sketch/compute_sketches.cpp new file mode 100644 index 00000000..b076eb3c --- /dev/null +++ b/src/sketch/compute_sketches.cpp @@ -0,0 +1,44 @@ +#include // for __sort_fn, sort (NOTE(review): no sort call in this file - include looks stale) +#include // for uint64_t +#include // for size_t +#include // for identity, function +#include // for inserter +#include // for basic_stringstream, stringstream (NOTE(review): no stringstream in this file - include looks stale) +#include // for addressof (NOTE(review): no addressof call in this file - include looks stale) +#include // for vector + +#include // for unordered_flat_set +#include // for compute_sketches +#include // for estimate_kmer_counts + +namespace seqan::hibf::sketch +{ + +void compute_sketches(config const & config, + std::vector & kmer_counts, + std::vector & sketches) +{ + // compute sketches: one sketch and one kmer count slot per user bin + sketches.resize(config.number_of_user_bins); + kmer_counts.resize(config.number_of_user_bins); + + robin_hood::unordered_flat_set kmers; 
+#pragma omp parallel for schedule(dynamic) num_threads(config.threads) private(kmers) + for (size_t i = 0; i < config.number_of_user_bins; ++i) + { + seqan::hibf::sketch::hyperloglog sketch(config.sketch_bits); + + kmers.clear(); + config.input_fn(i, std::inserter(kmers, kmers.begin())); + + for (auto k_hash : kmers) + sketch.add(k_hash); + + // no omp critical: each iteration assigns only its own sketches[i] + sketches[i] = sketch; + } + + sketch::estimate_kmer_counts(sketches, kmer_counts); +} + +} // namespace seqan::hibf::sketch