From 2826ef57154810c6e6efc73e1e1b461cba9cc054 Mon Sep 17 00:00:00 2001 From: Enrico Seiler Date: Tue, 28 Nov 2023 10:49:27 +0100 Subject: [PATCH] [FEATURE] SIMD add --- include/hibf/interleaved_bloom_filter.hpp | 17 ++ include/hibf/misc/counting_vector.hpp | 217 ++++++++++++++---- include/hibf/platform.hpp | 12 + .../hibf/interleaved_bloom_filter_test.cpp | 21 +- 4 files changed, 206 insertions(+), 61 deletions(-) diff --git a/include/hibf/interleaved_bloom_filter.hpp b/include/hibf/interleaved_bloom_filter.hpp index 1f6a65b3..29a805a9 100644 --- a/include/hibf/interleaved_bloom_filter.hpp +++ b/include/hibf/interleaved_bloom_filter.hpp @@ -535,8 +535,25 @@ class interleaved_bloom_filter::counting_agent_type explicit counting_agent_type(interleaved_bloom_filter const & ibf) : ibf_ptr(std::addressof(ibf)), membership_agent(ibf), +#if !HIBF_HAS_AVX512 result_buffer(ibf.bin_count()) {} +#else + // AVX512 will access result_buffer's memory in chunks, so we need to make sure that we allocate enough memory + // such that the last chunk is not out of bounds. + result_buffer(next_multiple_of_64(ibf.bin_count())) // Ensure large enough capacity. + { + result_buffer.resize(ibf.bin_count()); // Resize to actual requested size. + // Silences llvm's ASAN container-overflow warning. +# if defined(_LIBCPP_VERSION) && !defined(_LIBCPP_HAS_NO_ASAN) + __sanitizer_annotate_contiguous_container(result_buffer.data(), + result_buffer.data() + result_buffer.capacity(), + result_buffer.data() + result_buffer.size(), + result_buffer.data() + result_buffer.capacity()); +# endif + } +#endif + //!\} /*!\name Counting diff --git a/include/hibf/misc/counting_vector.hpp b/include/hibf/misc/counting_vector.hpp index e0eeba31..4cb50e3e 100644 --- a/include/hibf/misc/counting_vector.hpp +++ b/include/hibf/misc/counting_vector.hpp @@ -26,13 +26,103 @@ #include // for config #include // for aligned_allocator #include // for bit_vector +#include +#include #include // for CEREAL_SERIALIZE_FUNCTION_NAME #include // for base_class +#if HIBF_HAS_AVX512 +# include +# include +# include +# include +# include +# include +#endif + namespace seqan::hibf { +#if HIBF_HAS_AVX512 +//!\cond +// Since the counting_vector can have different value types, we need specific SIMD instructions for each value type. +template +struct simd_mapping +{}; + +// CRTP base class for the simd_mapping, containg common functionality. +template + requires std::same_as> +struct simd_mapping_crtp +{ + // Let `B = sizeof(integral_t) * CHAR_BIT`, e.g. 8 for (u)int8_t, and 16 for (u)int16_t. + // We can process `512 / B` bits of the bit_vector at once. + static inline constexpr size_t bits_per_iterations = 512u / (sizeof(integral_t) * CHAR_BIT); + // clang-format off + // The type that is need to represent `bits_per_iterations` bits. + using bits_type = std::conditional_t>>>; + // clang-format on + static_assert(!std::same_as, "Unsupported bits_type."); + + // Takes B bits from the bit_vector and expands them to a bits_type. + // E.g., B = 8 : [1,0,1,1,0,0,1,0] -> [0...01, 0...00, 0...01, 0...01, 0...00, 0...00, 0...01, 0...00], where + // each element is 64 bits wide. + static inline constexpr auto expand_bits(bits_type const * const bits) + { + return derived_t::mm512_maskz_mov_epi(*bits, derived_t::mm512_set1_epi(1)); + } +}; + +// SIMD instructions for int8_t and uint8_t. +template + requires (sizeof(integral_t) == 1) +struct simd_mapping : simd_mapping_crtp, integral_t> +{ + static inline constexpr auto mm512_maskz_mov_epi = simde_mm512_maskz_mov_epi8; + static inline constexpr auto mm512_set1_epi = simde_mm512_set1_epi8; + static inline constexpr auto mm512_add_epi = simde_mm512_add_epi8; + static inline constexpr auto mm512_sub_epi = simde_mm512_sub_epi8; +}; + +// SIMD instructions for int16_t and uint16_t. +template + requires (sizeof(integral_t) == 2) +struct simd_mapping : simd_mapping_crtp, integral_t> +{ + static inline constexpr auto mm512_maskz_mov_epi = simde_mm512_maskz_mov_epi16; + static inline constexpr auto mm512_set1_epi = simde_mm512_set1_epi16; + static inline constexpr auto mm512_add_epi = simde_mm512_add_epi16; + static inline constexpr auto mm512_sub_epi = simde_mm512_sub_epi16; +}; + +// SIMD instructions for int32_t and uint32_t. +template + requires (sizeof(integral_t) == 4) +struct simd_mapping : simd_mapping_crtp, integral_t> +{ + static inline constexpr auto mm512_maskz_mov_epi = simde_mm512_maskz_mov_epi32; + static inline constexpr auto mm512_set1_epi = simde_mm512_set1_epi32; + static inline constexpr auto mm512_add_epi = simde_mm512_add_epi32; + static inline constexpr auto mm512_sub_epi = simde_mm512_sub_epi32; +}; + +// SIMD instructions for int64_t and uint64_t. +template + requires (sizeof(integral_t) == 8) +struct simd_mapping : simd_mapping_crtp, integral_t> +{ + static inline constexpr auto mm512_maskz_mov_epi = simde_mm512_maskz_mov_epi64; + static inline constexpr auto mm512_set1_epi = simde_mm512_set1_epi64; + static inline constexpr auto mm512_add_epi = simde_mm512_add_epi64; + static inline constexpr auto mm512_sub_epi = simde_mm512_sub_epi64; +}; +//!\endcond +#endif + /*!\brief A data structure that behaves like a std::vector and can be used to consolidate the results of multiple calls * to seqan::hibf::interleaved_bloom_filter::membership_agent_type::bulk_contains. * \ingroup ibf @@ -57,11 +147,11 @@ namespace seqan::hibf * \include test/snippet/ibf/counting_vector.cpp */ template -class counting_vector : public std::vector +class counting_vector : public std::vector> { private: //!\brief The base type. - using base_t = std::vector; + using base_t = std::vector>; public: /*!\name Constructors, destructor and assignment @@ -78,46 +168,37 @@ class counting_vector : public std::vector //!\} /*!\brief Bin-wise adds the bits of a seqan::hibf::bit_vector. - * \param bit_vector The seqan::hibf::bit_vector. - * \attention The counting_vector must be at least as big as `bit_vector`. - * + * \copydetails operator-=(bit_vector const &) * \details - * * ### Example * * \include test/snippet/ibf/counting_vector.cpp */ counting_vector & operator+=(bit_vector const & bit_vector) { - for_each_set_bin(bit_vector, - [this](size_t const bin) - { - ++(*this)[bin]; - }); + impl(bit_vector); return *this; } /*!\brief Bin-wise subtracts the bits of a seqan::hibf::bit_vector. * \param bit_vector The seqan::hibf::bit_vector. - * \attention The counting_vector must be at least as big as `bit_vector`. + * \pre `counting_vector.size() >= bit_vector.size()` and either: + * * `bit_vector.size() % 64 == 0` + * \pre or + * * `counting_vector.capacity() >= seqan::hibf::next_multiple_of_64(bit_vector.size())` and + * * ∀ bit ∈ [`bit_vector.size()`, `bit_vector.capacity()`) : `bit_vector[bit] == 0`
+ * In practive, this condition might be violated by setting bits in a bit_vector and then shrinking it via + * \ref bit_vector::resize() "resize()". */ counting_vector & operator-=(bit_vector const & bit_vector) { - for_each_set_bin(bit_vector, - [this](size_t const bin) - { - assert((*this)[bin] > 0); - --(*this)[bin]; - }); + impl(bit_vector); return *this; } /*!\brief Bin-wise addition of two `seqan::hibf::counting_vector`s. - * \param rhs The other seqan::hibf::counting_vector. - * \attention The seqan::hibf::counting_vector must be at least as big as `rhs`. - * + * \copydetails operator-=(counting_vector const &) * \details - * * ### Example * * \include test/snippet/ibf/counting_vector.cpp @@ -131,59 +212,93 @@ class counting_vector : public std::vector return *this; } - /*!\brief Bin-wise substraction of two `seqan::hibf::counting_vector`s. + /*!\brief Bin-wise subtraction of two `seqan::hibf::counting_vector`s. * \param rhs The other seqan::hibf::counting_vector. - * \attention The seqan::hibf::counting_vector must be at least as big as `rhs`. + * \pre `counting_vector.size() >= rhs.size()` */ counting_vector & operator-=(counting_vector const & rhs) { assert(this->size() >= rhs.size()); // The counting vector may be bigger than what we need. - std::transform(this->begin(), - this->end(), - rhs.begin(), - this->begin(), - [](auto a, auto b) - { - assert(a >= b); - return a - b; - }); + std::transform(this->begin(), this->end(), rhs.begin(), this->begin(), std::minus()); return *this; } private: - //!\brief Enumerates all bins of a seqan::hibf::bit_vector. - template - void for_each_set_bin(bit_vector const & bit_vector, on_bin_fn_t && on_bin_fn) + //!\brief The operation to perform. + enum class operation + { + add, + sub + }; + + //!\brief Bin-wise adds or subtracts the bits of a seqan::hibf::bit_vector. + template + inline void impl(bit_vector const & bit_vector) { assert(this->size() >= bit_vector.size()); // The counting vector may be bigger than what we need. - size_t const words = (bit_vector.size() + 63u) >> 6; - uint64_t const * const bitvector_raw = bit_vector.data(); +#if HIBF_HAS_AVX512 + // AVX512BW: mm512_maskz_mov_epi, mm512_add_epi + // AVX512F: mm512_set1_epi, _mm512_load_si512, _mm512_store_si512 + using simd = simd_mapping; + using bits_type = typename simd::bits_type; + + bits_type const * bit_vector_ptr = reinterpret_cast(bit_vector.data()); + value_t * counting_vector_ptr = base_t::data(); + + size_t const bits = next_multiple_of_64(bit_vector.size()); + assert(bits <= this->capacity()); // Not enough memory reserved for AVX512 chunk access. + size_t const iterations = bits / simd::bits_per_iterations; + + for (size_t iteration = 0; iteration < iterations; + ++iteration, ++bit_vector_ptr, counting_vector_ptr += simd::bits_per_iterations) + { + simde__m512i load = simde_mm512_load_si512(counting_vector_ptr); + if constexpr (op == operation::add) + { + load = simd::mm512_add_epi(load, simd::expand_bits(bit_vector_ptr)); + } + else + { + load = simd::mm512_sub_epi(load, simd::expand_bits(bit_vector_ptr)); + } + simde_mm512_store_si512(counting_vector_ptr, load); + } +#else + size_t const words = divide_and_ceil(bit_vector.size(), 64u); + uint64_t const * const bit_vector_ptr = bit_vector.data(); - // Jump to the next 1 and return the number of places jumped in the bit_sequence - auto jump_to_next_1bit = [](uint64_t & x) + // Jump to the next 1 and return the number of jumped bits in value. + auto jump_to_next_1bit = [](uint64_t & value) { - auto const zeros = std::countr_zero(x); - x >>= zeros; // skip number of zeros + auto const zeros = std::countr_zero(value); + value >>= zeros; // skip number of zeros return zeros; }; - // Each iteration can handle 64 bits - for (size_t batch = 0; batch < words; ++batch) + // Each iteration can handle 64 bits, i.e., one word. + for (size_t iteration = 0; iteration < words; ++iteration) { - // get 64 bits starting at position `bit_pos` - uint64_t bit_sequence = bitvector_raw[batch]; + uint64_t current_word = bit_vector_ptr[iteration]; - // process each relative bin inside the bit_sequence - for (size_t bin = batch << 6; bit_sequence != 0u; ++bin, bit_sequence >>= 1) + // For each set bit in the current word, add/subtract 1 to the corresponding bin. + for (size_t bin = iteration * 64u; current_word != 0u; ++bin, current_word >>= 1) { - // Jump to the next 1 and - bin += jump_to_next_1bit(bit_sequence); + // Jump to the next 1 + bin += jump_to_next_1bit(current_word); - on_bin_fn(bin); + if constexpr (op == operation::add) + { + ++(*this)[bin]; + } + else + { + --(*this)[bin]; + } } } +#endif } }; diff --git a/include/hibf/platform.hpp b/include/hibf/platform.hpp index 58b284f4..75dc5c66 100644 --- a/include/hibf/platform.hpp +++ b/include/hibf/platform.hpp @@ -49,6 +49,18 @@ # define HIBF_DISABLE_COMPILER_CHECK #endif // HIBF_DOXYGEN_ONLY(1)0 +/*!\def HIBF_HAS_AVX512 + * \brief Whether AVX512F and AVX512BW are available. + * \ingroup core + */ +#ifndef HIBF_HAS_AVX512 +# if __AVX512F__ && __AVX512BW__ +# define HIBF_HAS_AVX512 1 +# else +# define HIBF_HAS_AVX512 0 +# endif +#endif + // ============================================================================ // Compiler support GCC // ============================================================================ diff --git a/test/unit/hibf/interleaved_bloom_filter_test.cpp b/test/unit/hibf/interleaved_bloom_filter_test.cpp index e87c4b45..c0936f03 100644 --- a/test/unit/hibf/interleaved_bloom_filter_test.cpp +++ b/test/unit/hibf/interleaved_bloom_filter_test.cpp @@ -238,23 +238,23 @@ TEST(ibf_test, counting) counting += agent.bulk_contains(hash); } std::vector expected(128, 128); - EXPECT_EQ(counting, expected); + EXPECT_RANGE_EQ(counting, expected); // Counting vectors can be added together std::vector expected2(128, 256); counting += counting; - EXPECT_EQ(counting, expected2); + EXPECT_RANGE_EQ(counting, expected2); // minus bit_vector for (size_t hash : std::views::iota(0, 128)) // test correct resize for each bin individually { counting -= agent.bulk_contains(hash); } - EXPECT_EQ(counting, std::vector(128, 128)); + EXPECT_RANGE_EQ(counting, std::vector(128, 128)); // minus other counting vector counting -= seqan::hibf::counting_vector(128, 128 - 42); - EXPECT_EQ(counting, std::vector(128, 42)); + EXPECT_RANGE_EQ(counting, std::vector(128, 42)); } TEST(ibf_test, counting_agent) @@ -299,25 +299,26 @@ TEST(ibf_test, counting_no_ub) std::vector expected(128, 0); expected[63] = 128; expected[127] = 128; - EXPECT_EQ(counting, expected); + EXPECT_RANGE_EQ(counting, expected); // Counting vectors can be added together std::vector expected2(128, 0); expected2[63] = 256; expected2[127] = 256; counting += counting; - EXPECT_EQ(counting, expected2); + EXPECT_RANGE_EQ(counting, expected2); } // Check special case where there is only one `1` in the bitvector. +// Also checks that counting with `b` bins and `b % 64 != 0` works. TEST(ibf_test, counting_agent_no_ub) { // 1. Construct and emplace - seqan::hibf::interleaved_bloom_filter ibf{seqan::hibf::bin_count{128u}, + seqan::hibf::interleaved_bloom_filter ibf{seqan::hibf::bin_count{127u}, seqan::hibf::bin_size{1024u}, seqan::hibf::hash_function_count{2u}}; - for (size_t bin_idx : std::array{63, 127}) + for (size_t bin_idx : std::array{63, 126}) for (size_t hash : std::views::iota(0, 128)) ibf.emplace(hash, seqan::hibf::bin_index{bin_idx}); @@ -325,9 +326,9 @@ TEST(ibf_test, counting_agent_no_ub) auto agent = ibf.counting_agent(); auto agent2 = ibf.template counting_agent(); - std::vector expected(128, 0); + std::vector expected(127, 0); expected[63] = 128; - expected[127] = 128; + expected[126] = 128; EXPECT_RANGE_EQ(agent.bulk_count(std::views::iota(0u, 128u)), expected); EXPECT_RANGE_EQ(agent2.bulk_count(std::views::iota(0u, 128u)), expected); }