Skip to content

Commit

Permalink
[FEATURE] Add hibf counting interface
Browse files Browse the repository at this point in the history
  • Loading branch information
eseiler committed Oct 27, 2023
1 parent 4589dca commit 6b6dfc8
Show file tree
Hide file tree
Showing 2 changed files with 196 additions and 81 deletions.
132 changes: 132 additions & 0 deletions include/hibf/hierarchical_interleaved_bloom_filter.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,10 @@ class hierarchical_interleaved_bloom_filter
*/
class membership_agent_type;

//!\brief Manages counting ranges of values for the seqan::hibf::hierarchical_interleaved_bloom_filter.
template <std::integral value_t>
class counting_agent_type;

/*!\name Constructors, destructor and assignment
* \{
*/
Expand Down Expand Up @@ -217,6 +221,12 @@ class hierarchical_interleaved_bloom_filter
//!\brief Returns a membership_agent to be used for counting.
membership_agent_type membership_agent() const;

/*!\brief Returns a counting_agent_type to be used for counting.
* \tparam value_t The type to use for the counters; must model std::integral.
*/
template <std::integral value_t = uint16_t>
counting_agent_type<value_t> counting_agent() const;

/*!\cond DEV
* \brief Serialisation support function.
* \tparam archive_t Type of `archive`; must satisfy seqan::hibf::cereal_archive.
Expand Down Expand Up @@ -397,4 +407,126 @@ hierarchical_interleaved_bloom_filter::membership_agent() const
return typename hierarchical_interleaved_bloom_filter::membership_agent_type{*this};
}

template <std::integral value_t>
class hierarchical_interleaved_bloom_filter::counting_agent_type
{
private:
//!\brief A pointer to the augmented hierarchical_interleaved_bloom_filter.
hierarchical_interleaved_bloom_filter const * const hibf_ptr{nullptr};

//!\brief Helper for recursive bulk counting.
template <std::ranges::forward_range value_range_t>
void bulk_count_impl(value_range_t && values, int64_t const ibf_idx, size_t const threshold)
{
auto agent = hibf_ptr->ibf_vector[ibf_idx].template counting_agent<value_t>();
auto & result = agent.bulk_count(values);

value_t sum{};

for (size_t bin{}; bin < result.size(); ++bin)
{
sum += result[bin];
auto const current_filename_index = hibf_ptr->ibf_bin_to_user_bin_id[ibf_idx][bin];

if (current_filename_index < 0) // merged bin
{
if (sum >= threshold)
bulk_count_impl(values, hibf_ptr->next_ibf_id[ibf_idx][bin], threshold);
sum = 0u;
}
else if (bin + 1u == result.size() || // last bin
current_filename_index != hibf_ptr->ibf_bin_to_user_bin_id[ibf_idx][bin + 1]) // end of split bin
{
if (sum >= threshold)
result_buffer[current_filename_index] = sum;
sum = 0u;
}
}
}

//!\brief Stores the result of bulk_count().
counting_vector<value_t> result_buffer;

public:
/*!\name Constructors, destructor and assignment
* \{
*/
counting_agent_type() = default; //!< Defaulted.
counting_agent_type(counting_agent_type const &) = default; //!< Defaulted.
counting_agent_type & operator=(counting_agent_type const &) = delete; //!< Deleted. hibf_ptr is const.
counting_agent_type(counting_agent_type &&) = default; //!< Defaulted.
counting_agent_type & operator=(counting_agent_type &&) = delete; //!< Deleted. hibf_ptr is const.
~counting_agent_type() = default; //!< Defaulted.

/*!\brief Construct a counting_agent_type for an existing hierarchical_interleaved_bloom_filter.
* \private
* \param hibf The hierarchical_interleaved_bloom_filter.
*/
explicit counting_agent_type(hierarchical_interleaved_bloom_filter const & hibf) :
hibf_ptr(std::addressof(hibf)),
result_buffer(hibf_ptr->number_of_user_bins)
{}
//!\}

/*!\name Counting
* \{
*/
/*!\brief Counts the occurrences in each bin for all values in a range.
* \tparam value_range_t The type of the range of values. Must model std::ranges::forward_range. The reference type
* must model std::unsigned_integral.
* \param[in] values The range of values to process.
* \param[in] threshold Only determine counts when the count is at least (>=) this high. Must be greater than 0.
* \returns A reference to a seqan::hibf::counting_vector that has a count value for each user bin.
*
* \attention The result of this function must always be bound via reference, e.g. `auto &`, to prevent copying.
* \attention Sequential calls to this function invalidate the previously returned reference.
*
* \details
*
* ### Performance
*
* Providing a threshold can significantly speed up the query as the hierarchical structure
* may avoid to recurse into every part of the HIBF.
*
* Counts that do not exceed the threshold will be reported as `0`.
*
* ### Thread safety
*
* Concurrent invocations of this function are not thread safe, please create a
* seqan::hibf::hierarchical_interleaved_bloom_filter::counting_agent_type for each thread.
*/
template <std::ranges::forward_range value_range_t>
[[nodiscard]] counting_vector<value_t> const & bulk_count(value_range_t && values,
size_t const threshold) & noexcept
{
assert(hibf_ptr != nullptr);
assert(threshold > 0u);
assert(result_buffer.size() == hibf_ptr->number_of_user_bins);

static_assert(std::ranges::forward_range<value_range_t>, "The values must model forward_range.");
static_assert(std::unsigned_integral<std::ranges::range_value_t<value_range_t>>,
"An individual value must be an unsigned integral.");

std::ranges::fill(result_buffer, static_cast<value_t>(0));

bulk_count_impl(values, 0, threshold);

return result_buffer;
}

// `bulk_count` cannot be called on a temporary, since the object the returned reference points to
// is immediately destroyed.
template <std::ranges::range value_range_t>
[[nodiscard]] counting_vector<value_t> const & bulk_count(value_range_t && values,
size_t const threshold) && noexcept = delete;
//!\}
};

template <std::integral value_t>
inline hierarchical_interleaved_bloom_filter::counting_agent_type<value_t>
hierarchical_interleaved_bloom_filter::counting_agent() const
{
return typename hierarchical_interleaved_bloom_filter::counting_agent_type<value_t>{*this};
}

} // namespace seqan::hibf
145 changes: 64 additions & 81 deletions test/unit/hibf/hierarchical_interleaved_bloom_filter_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -155,16 +155,16 @@ TEST(hibf_test, three_level_hibf)
EXPECT_EQ(config.number_of_user_bins, hibf.number_of_user_bins);
}

TEST(hibf_test, unevenly_sized_and_unique_user_bins)
// std::pow(number, 2) does conversion to double and back to size_t.
// If used more often, consider porting pow with integer overloads from
// https://github.com/seqan/seqan3/blob/8b7d02cb8695369b7baeb4b3042ae7c864b67b8c/include/seqan3/utility/math.hpp
inline size_t squared(size_t const number)
{
// std::pow(number, 2) does conversion do double and back to size_t.
// If used more often, consider porting pow with integer overloads from
// https://github.com/seqan/seqan3/blob/8b7d02cb8695369b7baeb4b3042ae7c864b67b8c/include/seqan3/utility/math.hpp
auto squared = [](size_t const number)
{
return number * number;
};
return number * number;
}

TEST(hibf_test, unevenly_sized_and_unique_user_bins)
{
// Simulate 500 user bins of exponentially increasing size
// No overlap between the user bins happens.
seqan::hibf::config config{.input_fn =
Expand Down Expand Up @@ -232,83 +232,66 @@ TEST(hibf_test, evenly_sized_and_highly_similar_user_bins)
EXPECT_EQ(config.number_of_user_bins, hibf.number_of_user_bins);
}

// #ifdef HIBF_HAS_SEQAN3

// #include <seqan3/alphabet/nucleotide/dna4.hpp>
// #include <seqan3/core/debug_stream.hpp>
// #include <seqan3/core/detail/all_view.hpp>
// #include <seqan3/io/sequence_file/all.hpp>
// #include <seqan3/test/cereal.hpp>
// #include <seqan3/utility/views/deep.hpp>

// TEST(hibf_seqan3_test, input_sequences_of_sequences)
// {
// using namespace seqan::hibf::literals;

// auto kmer_transformation = seqan::hibf::views::kmer_hash(seqan::hibf::ungapped{2u});

// // range of range of sequences
// std::vector<std::vector<std::vector<seqan::hibf::dna4>>> seqs{{"AAAGGGGGGC"_dna4}, {"TTTTTT"_dna4}};

// seqan::hibf::config config
// {
// .input = seqs | seqan::hibf::views::deep{kmer_transformation},
// .rearrange_user_bins = false
// };

// seqan::hibf::hierarchical_interleaved_bloom_filter hibf{config};

// auto agent = hibf.membership_agent();

// std::vector<seqan::hibf::dna4> query{"AAGG"_dna4};
// auto query_kmers = query | kmer_transformation;

// auto result = agent.membership_for(query_kmers, 1);

// seqan::hibf::debug_stream << result << std::endl;
// }

// TEST(hibf_seqan3_test, input_files)
// {
// seqan::hibf::test::tmp_directory tmp{};
// std::filesystem::path f1{tmp.path() / "f1.fa"};
// std::filesystem::path f2{tmp.path() / "f2.fa"};

// {
// std::ofstream out{f1};
// out << ">seq1\nAAAGGGGGGC\n";

// std::ofstream out2{f2};
// out2 << ">seq1\nTTTTTT\n";
// }

// auto transform = seqan::hibf::views::kmer_hash(seqan::hibf::ungapped{2u});

// // range of range of sequences
// std::vector<std::string> filenames{f1.string(), f2.string()};
// auto file_range = filenames | std::views::transform([&transform](auto const & f)
// {
// auto record_transform = std::views::transform([&transform](auto && rec){ return rec.sequence() | transform; });
// return seqan::hibf::detail::all(seqan::hibf::sequence_file_input{f}) | record_transform;
// });

// seqan::hibf::config config
// {
// .input = file_range,
// .rearrange_user_bins = false
// };
TEST(hibf_test, counting_agent_same_bins)
{
// Simulate 500 bins, with the same 500 elements each
seqan::hibf::config config{.input_fn =
[&](size_t const, seqan::hibf::insert_iterator it)
{
for (size_t i = 0u; i < 500u; ++i)
it = i;
},
.number_of_user_bins = 500,
.maximum_fpr = 0.001,
.threads = 4,
.tmax = 64,
.disable_estimate_union = true,
.disable_rearrangement = true};

// seqan::hibf::hierarchical_interleaved_bloom_filter hibf{config};
seqan::hibf::hierarchical_interleaved_bloom_filter hibf{config};
auto agent = hibf.template counting_agent<uint32_t>();

// auto agent = hibf.membership_agent();
auto query = std::views::iota(0u, 500u);
auto & result = agent.bulk_count(query, 1u);

// using namespace seqan::hibf::literals;
for (size_t ub_id = 0u; ub_id < config.number_of_user_bins; ++ub_id)
{
ASSERT_GE(result[ub_id], 500u) << "ub_id: " << ub_id;
// Split bins might have FP
ASSERT_LE(result[ub_id], 502u) << "ub_id: " << ub_id;
}
}

// std::vector<seqan::hibf::dna4> query{"AAGG"_dna4};
TEST(hibf_test, counting_agent_different_bins)
{
// Simulate 500 user bins of exponentially increasing size
// No overlap between the user bins happens.
seqan::hibf::config config{.input_fn =
[&](size_t const ub_id, seqan::hibf::insert_iterator it)
{
size_t const start = squared(ub_id + 1u);
size_t const end = squared(ub_id + 2u) - 1u;
for (size_t i = start; i < end; ++i)
it = i;
},
.number_of_user_bins = 500,
.maximum_fpr = 0.001,
.threads = 4,
.tmax = 64,
.disable_estimate_union = true,
.disable_rearrangement = true};

// auto result = agent.membership_for(query | transform, 1);
seqan::hibf::hierarchical_interleaved_bloom_filter hibf{config};
auto agent = hibf.template counting_agent<uint32_t>();

// seqan::hibf::debug_stream << result << std::endl; // prints [0] since query is found in user bin 0
// }
auto query = std::views::iota(1u, 251'000u); // 501^2 - 1
auto & result = agent.bulk_count(query, 1u);

// #endif // HIBF_HAS_SEQAN3
for (size_t ub_id = 0u; ub_id < config.number_of_user_bins; ++ub_id)
{
// Hashes are consecutive integer values and are hence a bad input for the HIBF.
// Only checking for minimum count.
size_t const min_count = 2 * ub_id + 2; // size of [ squared(ub_id + 1u), ..., squared(ub_id + 2u) - 1u )
ASSERT_GE(result[ub_id], min_count) << "ub_id: " << ub_id;
}
}

0 comments on commit 6b6dfc8

Please sign in to comment.