From 0c0d7a8bf46f8d1f9a877ba6761748a7eabeeb53 Mon Sep 17 00:00:00 2001 From: Enrico Seiler Date: Fri, 6 Sep 2024 15:25:20 +0200 Subject: [PATCH] [MISC] Rename filename_indices and refactor ctor --- include/hibf/build/update_user_bins.hpp | 7 +++-- .../hierarchical_interleaved_bloom_filter.hpp | 21 +++++++-------- src/hierarchical_interleaved_bloom_filter.cpp | 26 +++++++++++-------- 3 files changed, 30 insertions(+), 24 deletions(-) diff --git a/include/hibf/build/update_user_bins.hpp b/include/hibf/build/update_user_bins.hpp index 6a8866c8..e343534e 100644 --- a/include/hibf/build/update_user_bins.hpp +++ b/include/hibf/build/update_user_bins.hpp @@ -16,9 +16,12 @@ namespace seqan::hibf::build /*!\brief Updates user bins stored in HIBF. * \ingroup hibf_build */ -inline void update_user_bins(std::vector & filename_indices, layout::layout::user_bin const & record) +inline void update_user_bins(std::vector & technical_bin_to_user_bin_id, + layout::layout::user_bin const & record) { - std::fill_n(filename_indices.begin() + record.storage_TB_id, record.number_of_technical_bins, record.idx); + std::fill_n(technical_bin_to_user_bin_id.begin() + record.storage_TB_id, + record.number_of_technical_bins, + record.idx); } } // namespace seqan::hibf::build diff --git a/include/hibf/hierarchical_interleaved_bloom_filter.hpp b/include/hibf/hierarchical_interleaved_bloom_filter.hpp index f06cb33a..bc654889 100644 --- a/include/hibf/hierarchical_interleaved_bloom_filter.hpp +++ b/include/hibf/hierarchical_interleaved_bloom_filter.hpp @@ -141,7 +141,6 @@ class hierarchical_interleaved_bloom_filter size_t number_of_user_bins{}; /*!\brief Manages membership queries for the seqan::hibf::hierarchical_interleaved_bloom_filter. - * \see seqan::hibf::hierarchical_interleaved_bloom_filter::user_bins::filename_of_user_bin * \details * In contrast to the seqan::hibf::interleaved_bloom_filter, the result will consist of indices of user bins. */ @@ -288,19 +287,19 @@ class hierarchical_interleaved_bloom_filter::membership_agent_type { sum += result[bin]; - auto const current_filename_index = hibf_ptr->ibf_bin_to_user_bin_id[ibf_idx][bin]; + auto const user_bin_id = hibf_ptr->ibf_bin_to_user_bin_id[ibf_idx][bin]; - if (current_filename_index == bin_kind::merged) // merged bin + if (user_bin_id == bin_kind::merged) // merged bin { if (sum >= threshold) membership_for_impl(values, hibf_ptr->next_ibf_id[ibf_idx][bin], threshold); sum = 0u; } - else if (bin + 1u == result.size() || // last bin - current_filename_index != hibf_ptr->ibf_bin_to_user_bin_id[ibf_idx][bin + 1]) // end of split bin + else if (bin + 1u == result.size() || // last bin + user_bin_id != hibf_ptr->ibf_bin_to_user_bin_id[ibf_idx][bin + 1]) // end of split bin { if (sum >= threshold) - result_buffer.emplace_back(current_filename_index); + result_buffer.emplace_back(user_bin_id); sum = 0u; } } @@ -423,19 +422,19 @@ class hierarchical_interleaved_bloom_filter::counting_agent_type for (size_t bin{}; bin < result.size(); ++bin) { sum += result[bin]; - auto const current_filename_index = hibf_ptr->ibf_bin_to_user_bin_id[ibf_idx][bin]; + auto const user_bin_id = hibf_ptr->ibf_bin_to_user_bin_id[ibf_idx][bin]; - if (current_filename_index == bin_kind::merged) // merged bin + if (user_bin_id == bin_kind::merged) // merged bin { if (sum >= threshold) bulk_count_impl(values, hibf_ptr->next_ibf_id[ibf_idx][bin], threshold); sum = 0u; } - else if (bin + 1u == result.size() || // last bin - current_filename_index != hibf_ptr->ibf_bin_to_user_bin_id[ibf_idx][bin + 1]) // end of split bin + else if (bin + 1u == result.size() || // last bin + user_bin_id != hibf_ptr->ibf_bin_to_user_bin_id[ibf_idx][bin + 1]) // end of split bin { if (sum >= threshold) - result_buffer[current_filename_index] = sum; + result_buffer[user_bin_id] = sum; sum = 0u; } } diff --git a/src/hierarchical_interleaved_bloom_filter.cpp b/src/hierarchical_interleaved_bloom_filter.cpp index 6142bc56..fbfe28d7 100644 --- a/src/hierarchical_interleaved_bloom_filter.cpp +++ b/src/hierarchical_interleaved_bloom_filter.cpp @@ -46,8 +46,16 @@ size_t hierarchical_build(hierarchical_interleaved_bloom_filter & hibf, { size_t const ibf_pos{data.request_ibf_idx()}; - std::vector ibf_positions(current_node.number_of_technical_bins, ibf_pos); - std::vector filename_indices(current_node.number_of_technical_bins, bin_kind::merged); + auto & next_ibf_id = hibf.next_ibf_id[ibf_pos]; + assert(next_ibf_id.empty()); + next_ibf_id.resize(current_node.number_of_technical_bins, ibf_pos); + + auto & technical_bin_to_user_bin_id = hibf.ibf_bin_to_user_bin_id[ibf_pos]; + assert(technical_bin_to_user_bin_id.empty()); + technical_bin_to_user_bin_id.resize(current_node.number_of_technical_bins, bin_kind::merged); + + auto & ibf = hibf.ibf_vector[ibf_pos]; + robin_hood::unordered_flat_set kmers{}; auto initialise_max_bin_kmers = [&]() -> size_t @@ -55,7 +63,7 @@ size_t hierarchical_build(hierarchical_interleaved_bloom_filter & hibf, if (current_node.max_bin_is_merged()) { // recursively initialize favourite child first - ibf_positions[current_node.max_bin_index] = + next_ibf_id[current_node.max_bin_index] = hierarchical_build(hibf, kmers, current_node.children[current_node.favourite_child_idx.value()], @@ -68,7 +76,7 @@ size_t hierarchical_build(hierarchical_interleaved_bloom_filter & hibf, // we assume that the max record is at the beginning of the list of remaining records. auto const & record = current_node.remaining_records[0]; build::compute_kmers(kmers, data, record); - build::update_user_bins(filename_indices, record); + build::update_user_bins(technical_bin_to_user_bin_id, record); return record.number_of_technical_bins; } @@ -76,7 +84,7 @@ size_t hierarchical_build(hierarchical_interleaved_bloom_filter & hibf, // initialize lower level IBF size_t const max_bin_tbs = initialise_max_bin_kmers(); - auto && ibf = construct_ibf(parent_kmers, kmers, max_bin_tbs, current_node, data, is_root); + ibf = construct_ibf(parent_kmers, kmers, max_bin_tbs, current_node, data, is_root); kmers.clear(); // reduce memory peak // parse all other children (merged bins) of the current ibf @@ -121,7 +129,7 @@ size_t hierarchical_build(hierarchical_interleaved_bloom_filter & hibf, { size_t const mutex_id{parent_bin_index / 64}; std::lock_guard guard{local_ibf_mutex[mutex_id]}; - ibf_positions[parent_bin_index] = ibf_pos; + next_ibf_id[parent_bin_index] = ibf_pos; build::insert_into_ibf(kmers, 1, parent_bin_index, ibf, data.fill_ibf_timer); if (!is_root) build::update_parent_kmers(parent_kmers, kmers, data.merge_kmers_timer); @@ -153,14 +161,10 @@ size_t hierarchical_build(hierarchical_interleaved_bloom_filter & hibf, build::update_parent_kmers(parent_kmers, kmers, data.merge_kmers_timer); } - build::update_user_bins(filename_indices, record); + build::update_user_bins(technical_bin_to_user_bin_id, record); kmers.clear(); } - hibf.ibf_vector[ibf_pos] = std::move(ibf); - hibf.next_ibf_id[ibf_pos] = std::move(ibf_positions); - hibf.ibf_bin_to_user_bin_id[ibf_pos] = std::move(filename_indices); - return ibf_pos; }