diff --git a/include/hibf/hierarchical_interleaved_bloom_filter.hpp b/include/hibf/hierarchical_interleaved_bloom_filter.hpp index 123019c1..ca29e4d4 100644 --- a/include/hibf/hierarchical_interleaved_bloom_filter.hpp +++ b/include/hibf/hierarchical_interleaved_bloom_filter.hpp @@ -125,6 +125,17 @@ namespace seqan::hibf class hierarchical_interleaved_bloom_filter { public: + /*!\brief Computes `number_of_user_bins` from `ibf_bin_to_user_bin_id`. + * \todo `number_of_user_bins` can be serialised once RAPTOR_OLD_HIBF is removed. + * \private + */ + void set_number_of_user_bins(); + + /*!\brief The number of user bins. Used for counting. + * \private + */ + size_t number_of_user_bins{}; + /*!\brief Manages membership queries for the seqan::hibf::hierarchical_interleaved_bloom_filter. * \see seqan::hibf::hierarchical_interleaved_bloom_filter::user_bins::filename_of_user_bin * \details @@ -132,6 +143,10 @@ class hierarchical_interleaved_bloom_filter */ class membership_agent_type; + //!\brief Manages counting ranges of values for the seqan::hibf::hierarchical_interleaved_bloom_filter. + template + class counting_agent_type; + /*!\name Constructors, destructor and assignment * \{ */ @@ -206,6 +221,12 @@ class hierarchical_interleaved_bloom_filter //!\brief Returns a membership_agent to be used for counting. membership_agent_type membership_agent() const; + /*!\brief Returns a counting_agent_type to be used for counting. + * \tparam value_t The type to use for the counters; must model std::integral. + */ + template + counting_agent_type counting_agent() const; + /*!\cond DEV * \brief Serialisation support function. * \tparam archive_t Type of `archive`; must satisfy seqan::hibf::cereal_archive. @@ -215,15 +236,28 @@ class hierarchical_interleaved_bloom_filter * \sa https://docs.seqan.de/seqan/3.2.0/group__io.html#serialisation */ template - void CEREAL_SERIALIZE_FUNCTION_NAME(archive_t & archive) + void CEREAL_SAVE_FUNCTION_NAME(archive_t & archive) + { + archive(ibf_vector); + archive(next_ibf_id); +#ifdef RAPTOR_OLD_HIBF // Temporary compatibility with Raptor's HIBF. Also resolve set_number_of_user_bins todo. + std::vector filenames{}; + archive(filenames); +#endif + archive(ibf_bin_to_user_bin_id); + } + + template + void CEREAL_LOAD_FUNCTION_NAME(archive_t & archive) { archive(ibf_vector); archive(next_ibf_id); -#ifdef RAPTOR_OLD_HIBF // Temporary compatibility with Raptor's HIBF. +#ifdef RAPTOR_OLD_HIBF // Temporary compatibility with Raptor's HIBF. Also resolve set_number_of_user_bins todo. std::vector filenames{}; archive(filenames); #endif archive(ibf_bin_to_user_bin_id); + set_number_of_user_bins(); // Resolve set_number_of_user_bins todo. } /*!\name Timer @@ -373,4 +407,126 @@ hierarchical_interleaved_bloom_filter::membership_agent() const return typename hierarchical_interleaved_bloom_filter::membership_agent_type{*this}; } +template +class hierarchical_interleaved_bloom_filter::counting_agent_type +{ +private: + //!\brief A pointer to the augmented hierarchical_interleaved_bloom_filter. + hierarchical_interleaved_bloom_filter const * const hibf_ptr{nullptr}; + + //!\brief Helper for recursive bulk counting. + template + void bulk_count_impl(value_range_t && values, int64_t const ibf_idx, size_t const threshold) + { + auto agent = hibf_ptr->ibf_vector[ibf_idx].template counting_agent(); + auto & result = agent.bulk_count(values); + + value_t sum{}; + + for (size_t bin{}; bin < result.size(); ++bin) + { + sum += result[bin]; + auto const current_filename_index = hibf_ptr->ibf_bin_to_user_bin_id[ibf_idx][bin]; + + if (current_filename_index < 0) // merged bin + { + if (sum >= threshold) + bulk_count_impl(values, hibf_ptr->next_ibf_id[ibf_idx][bin], threshold); + sum = 0u; + } + else if (bin + 1u == result.size() || // last bin + current_filename_index != hibf_ptr->ibf_bin_to_user_bin_id[ibf_idx][bin + 1]) // end of split bin + { + if (sum >= threshold) + result_buffer[current_filename_index] = sum; + sum = 0u; + } + } + } + + //!\brief Stores the result of bulk_count(). + counting_vector result_buffer; + +public: + /*!\name Constructors, destructor and assignment + * \{ + */ + counting_agent_type() = default; //!< Defaulted. + counting_agent_type(counting_agent_type const &) = default; //!< Defaulted. + counting_agent_type & operator=(counting_agent_type const &) = delete; //!< Deleted. hibf_ptr is const. + counting_agent_type(counting_agent_type &&) = default; //!< Defaulted. + counting_agent_type & operator=(counting_agent_type &&) = delete; //!< Deleted. hibf_ptr is const. + ~counting_agent_type() = default; //!< Defaulted. + + /*!\brief Construct a counting_agent_type for an existing hierarchical_interleaved_bloom_filter. + * \private + * \param hibf The hierarchical_interleaved_bloom_filter. + */ + explicit counting_agent_type(hierarchical_interleaved_bloom_filter const & hibf) : + hibf_ptr(std::addressof(hibf)), + result_buffer(hibf_ptr->number_of_user_bins) + {} + //!\} + + /*!\name Counting + * \{ + */ + /*!\brief Counts the occurrences in each bin for all values in a range. + * \tparam value_range_t The type of the range of values. Must model std::ranges::forward_range. The reference type + * must model std::unsigned_integral. + * \param[in] values The range of values to process. + * \param[in] threshold Only determine counts when the count is at least (>=) this high. Must be greater than 0. + * \returns A reference to a seqan::hibf::counting_vector that has a count value for each user bin. + * + * \attention The result of this function must always be bound via reference, e.g. `auto &`, to prevent copying. + * \attention Sequential calls to this function invalidate the previously returned reference. + * + * \details + * + * ### Performance + * + * Providing a threshold can significantly speed up the query as the hierarchical structure + * may avoid to recurse into every part of the HIBF. + * + * Counts that do not exceed the threshold will be reported as `0`. + * + * ### Thread safety + * + * Concurrent invocations of this function are not thread safe, please create a + * seqan::hibf::hierarchical_interleaved_bloom_filter::counting_agent_type for each thread. + */ + template + [[nodiscard]] counting_vector const & bulk_count(value_range_t && values, + size_t const threshold) & noexcept + { + assert(hibf_ptr != nullptr); + assert(threshold > 0u); + assert(result_buffer.size() == hibf_ptr->number_of_user_bins); + + static_assert(std::ranges::forward_range, "The values must model forward_range."); + static_assert(std::unsigned_integral>, + "An individual value must be an unsigned integral."); + + std::ranges::fill(result_buffer, static_cast(0)); + + bulk_count_impl(values, 0, threshold); + + return result_buffer; + } + + // `bulk_count` cannot be called on a temporary, since the object the returned reference points to + // is immediately destroyed. + template + [[nodiscard]] counting_vector const & bulk_count(value_range_t && values, + size_t const threshold) && noexcept = delete; + //!\} +}; + +template +inline hierarchical_interleaved_bloom_filter::counting_agent_type +hierarchical_interleaved_bloom_filter::counting_agent() const +{ + return typename hierarchical_interleaved_bloom_filter::counting_agent_type{*this}; +} + } // namespace seqan::hibf diff --git a/src/hierarchical_interleaved_bloom_filter.cpp b/src/hierarchical_interleaved_bloom_filter.cpp index 53d98f20..a0422d96 100644 --- a/src/hierarchical_interleaved_bloom_filter.cpp +++ b/src/hierarchical_interleaved_bloom_filter.cpp @@ -34,6 +34,14 @@ namespace seqan::hibf { +void hierarchical_interleaved_bloom_filter::set_number_of_user_bins() +{ + int64_t max_bin_id{-1}; + for (auto const & ibf : ibf_bin_to_user_bin_id) + max_bin_id = std::max(max_bin_id, std::ranges::max(ibf)); + number_of_user_bins = static_cast(++max_bin_id); +} + size_t hierarchical_build(hierarchical_interleaved_bloom_filter & hibf, robin_hood::unordered_flat_set & parent_kmers, layout::graph::node const & current_node, @@ -205,7 +213,18 @@ hierarchical_interleaved_bloom_filter::hierarchical_interleaved_bloom_filter(con std::vector sketches{}; sketch::compute_sketches(configuration, kmer_counts, sketches); + // If rearrangement is enabled, i.e. seqan::hibf::config::disable_rearrangement is false: + // `min_id == none` in seqan::hibf::sketch::toolbox::cluster_bins -> std::out_of_range "key not found" + // Otherwise: + // seqan::hibf::interleaved_bloom_filter constructor -> std::logic_error "The size of a bin must be > 0." + assert(std::ranges::none_of(kmer_counts, + [](size_t const count) + { + return count == 0u; + })); + auto layout = layout::compute_layout(configuration, kmer_counts, sketches); + number_of_user_bins = configuration.number_of_user_bins; build_index(*this, configuration, layout); } @@ -214,6 +233,7 @@ hierarchical_interleaved_bloom_filter::hierarchical_interleaved_bloom_filter(con layout::layout const & layout) { configuration.validate_and_set_defaults(); + number_of_user_bins = configuration.number_of_user_bins; build_index(*this, configuration, layout); } diff --git a/test/unit/hibf/hierarchical_interleaved_bloom_filter_test.cpp b/test/unit/hibf/hierarchical_interleaved_bloom_filter_test.cpp index 7b674fb5..7980e1dd 100644 --- a/test/unit/hibf/hierarchical_interleaved_bloom_filter_test.cpp +++ b/test/unit/hibf/hierarchical_interleaved_bloom_filter_test.cpp @@ -36,6 +36,27 @@ TEST(hibf_test, small_example_with_direct_hashes) auto & result = agent.membership_for(query, 2u); agent.sort_results(); EXPECT_RANGE_EQ(result, (std::vector{0u, 1u})); + EXPECT_EQ(config.number_of_user_bins, hibf.number_of_user_bins); +} + +TEST(hibf_test, set_number_of_user_bins) +{ + seqan::hibf::config config{.input_fn = + [&](size_t const, seqan::hibf::insert_iterator it) + { + it = 5u; + }, + .number_of_user_bins = 73}; + + seqan::hibf::hierarchical_interleaved_bloom_filter hibf{config}; + EXPECT_EQ(config.number_of_user_bins, hibf.number_of_user_bins); + + hibf.set_number_of_user_bins(); + EXPECT_EQ(config.number_of_user_bins, hibf.number_of_user_bins); + + hibf.ibf_bin_to_user_bin_id.emplace_back(1u, 73); + hibf.set_number_of_user_bins(); + EXPECT_EQ(config.number_of_user_bins + 1u, hibf.number_of_user_bins); } TEST(hibf_test, build_from_layout) @@ -87,6 +108,7 @@ TEST(hibf_test, build_from_layout) auto & result = agent.membership_for(query, 2u); agent.sort_results(); EXPECT_RANGE_EQ(result, (std::vector{0u, 1u})); + EXPECT_EQ(configuration.number_of_user_bins, hibf.number_of_user_bins); } TEST(hibf_test, three_level_hibf) @@ -130,18 +152,19 @@ TEST(hibf_test, three_level_hibf) EXPECT_EQ(unique_result.size(), 1u); EXPECT_EQ(unique_result[0], ub_id); } + EXPECT_EQ(config.number_of_user_bins, hibf.number_of_user_bins); } -TEST(hibf_test, unevenly_sized_and_unique_user_bins) +// std::pow(number, 2) does conversion to double and back to size_t. +// If used more often, consider porting pow with integer overloads from +// https://github.com/seqan/seqan3/blob/8b7d02cb8695369b7baeb4b3042ae7c864b67b8c/include/seqan3/utility/math.hpp +inline size_t squared(size_t const number) { - // std::pow(number, 2) does conversion do double and back to size_t. - // If used more often, consider porting pow with integer overloads from - // https://github.com/seqan/seqan3/blob/8b7d02cb8695369b7baeb4b3042ae7c864b67b8c/include/seqan3/utility/math.hpp - auto squared = [](size_t const number) - { - return number * number; - }; + return number * number; +} +TEST(hibf_test, unevenly_sized_and_unique_user_bins) +{ // Simulate 500 user bins of exponentially increasing size // No overlap between the user bins happens. seqan::hibf::config config{.input_fn = @@ -171,6 +194,7 @@ TEST(hibf_test, unevenly_sized_and_unique_user_bins) EXPECT_EQ(unique_result.size(), 1u); EXPECT_EQ(unique_result[0], ub_id); } + EXPECT_EQ(config.number_of_user_bins, hibf.number_of_user_bins); } TEST(hibf_test, evenly_sized_and_highly_similar_user_bins) @@ -205,85 +229,69 @@ TEST(hibf_test, evenly_sized_and_highly_similar_user_bins) agent.sort_results(); EXPECT_RANGE_EQ(similar_result, (std::views::iota(ub_id - 6u, ub_id + 1u))); } + EXPECT_EQ(config.number_of_user_bins, hibf.number_of_user_bins); } -// #ifdef HIBF_HAS_SEQAN3 - -// #include -// #include -// #include -// #include -// #include -// #include - -// TEST(hibf_seqan3_test, input_sequences_of_sequences) -// { -// using namespace seqan::hibf::literals; - -// auto kmer_transformation = seqan::hibf::views::kmer_hash(seqan::hibf::ungapped{2u}); - -// // range of range of sequences -// std::vector>> seqs{{"AAAGGGGGGC"_dna4}, {"TTTTTT"_dna4}}; - -// seqan::hibf::config config -// { -// .input = seqs | seqan::hibf::views::deep{kmer_transformation}, -// .rearrange_user_bins = false -// }; - -// seqan::hibf::hierarchical_interleaved_bloom_filter hibf{config}; - -// auto agent = hibf.membership_agent(); - -// std::vector query{"AAGG"_dna4}; -// auto query_kmers = query | kmer_transformation; - -// auto result = agent.membership_for(query_kmers, 1); - -// seqan::hibf::debug_stream << result << std::endl; -// } - -// TEST(hibf_seqan3_test, input_files) -// { -// seqan::hibf::test::tmp_directory tmp{}; -// std::filesystem::path f1{tmp.path() / "f1.fa"}; -// std::filesystem::path f2{tmp.path() / "f2.fa"}; - -// { -// std::ofstream out{f1}; -// out << ">seq1\nAAAGGGGGGC\n"; - -// std::ofstream out2{f2}; -// out2 << ">seq1\nTTTTTT\n"; -// } - -// auto transform = seqan::hibf::views::kmer_hash(seqan::hibf::ungapped{2u}); - -// // range of range of sequences -// std::vector filenames{f1.string(), f2.string()}; -// auto file_range = filenames | std::views::transform([&transform](auto const & f) -// { -// auto record_transform = std::views::transform([&transform](auto && rec){ return rec.sequence() | transform; }); -// return seqan::hibf::detail::all(seqan::hibf::sequence_file_input{f}) | record_transform; -// }); - -// seqan::hibf::config config -// { -// .input = file_range, -// .rearrange_user_bins = false -// }; +TEST(hibf_test, counting_agent_same_bins) +{ + // Simulate 500 bins, with the same 500 elements each + seqan::hibf::config config{.input_fn = + [&](size_t const, seqan::hibf::insert_iterator it) + { + for (size_t i = 0u; i < 500u; ++i) + it = i; + }, + .number_of_user_bins = 500, + .maximum_fpr = 0.001, + .threads = 4, + .tmax = 64, + .disable_estimate_union = true, + .disable_rearrangement = true}; -// seqan::hibf::hierarchical_interleaved_bloom_filter hibf{config}; + seqan::hibf::hierarchical_interleaved_bloom_filter hibf{config}; + auto agent = hibf.template counting_agent(); -// auto agent = hibf.membership_agent(); + auto query = std::views::iota(0u, 500u); + auto & result = agent.bulk_count(query, 1u); -// using namespace seqan::hibf::literals; + for (size_t ub_id = 0u; ub_id < config.number_of_user_bins; ++ub_id) + { + ASSERT_GE(result[ub_id], 500u) << "ub_id: " << ub_id; + // Split bins might have FP + ASSERT_LE(result[ub_id], 502u) << "ub_id: " << ub_id; + } +} -// std::vector query{"AAGG"_dna4}; +TEST(hibf_test, counting_agent_different_bins) +{ + // Simulate 500 user bins of exponentially increasing size + // No overlap between the user bins happens. + seqan::hibf::config config{.input_fn = + [&](size_t const ub_id, seqan::hibf::insert_iterator it) + { + size_t const start = squared(ub_id + 1u); + size_t const end = squared(ub_id + 2u) - 1u; + for (size_t i = start; i < end; ++i) + it = i; + }, + .number_of_user_bins = 500, + .maximum_fpr = 0.001, + .threads = 4, + .tmax = 64, + .disable_estimate_union = true, + .disable_rearrangement = true}; -// auto result = agent.membership_for(query | transform, 1); + seqan::hibf::hierarchical_interleaved_bloom_filter hibf{config}; + auto agent = hibf.template counting_agent(); -// seqan::hibf::debug_stream << result << std::endl; // prints [0] since query is found in user bin 0 -// } + auto query = std::views::iota(1u, 251'000u); // 501^2 - 1 + auto & result = agent.bulk_count(query, 1u); -// #endif // HIBF_HAS_SEQAN3 + for (size_t ub_id = 0u; ub_id < config.number_of_user_bins; ++ub_id) + { + // Hashes are consecutive integer values and are hence a bad input for the HIBF. + // Only checking for minimum count. + size_t const min_count = 2 * ub_id + 2; // size of [ squared(ub_id + 1u), ..., squared(ub_id + 2u) - 1u ) + ASSERT_GE(result[ub_id], min_count) << "ub_id: " << ub_id; + } +}