Skip to content

Commit

Permalink
Merge branch 'dev' into bucket-refactoring
Browse files Browse the repository at this point in the history
  • Loading branch information
PointKernel authored Oct 31, 2024
2 parents 0d45a26 + 69817e2 commit f4b764c
Show file tree
Hide file tree
Showing 20 changed files with 585 additions and 64 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -254,4 +254,4 @@ We plan to add many GPU-accelerated, concurrent data structures to `cuCollection
`cuco::bloom_filter` implements a Blocked Bloom Filter for approximate set membership queries.

#### Examples:
- [Host-bulk APIs](https://github.com/NVIDIA/cuCollections/blob/dev/examples/bloom_filter/host_bulk_example.cu) (see [live example in godbolt](https://godbolt.org/clientstate/eJydVm1v20YM_iuE9mF2I78FCwo4L4CXpJuxwsnstEUxD8HpRFuHynfavdgxgvz38U6SLSduMSwBEptHPnz4kEfpOTJojFDSRMO_niORRsNBHOVMLh1bYjSMuEtZFEdGOc399967uYR3cK2KrRbLzEKLt-G0f_pLDJPP45vxCK7vpvd309HD-G7S9b7B_6PgKA2m4GSKGmyGMCoYp3_VSQyfUXsicNrtQ8s7zKPqbB61zwPKVjlYsS1IZcEZJBhhYCFyBHziWFgQErhaFblgkiNshM1Cqgon0IGvFYhKLCN_RhEFfVs0PYHZHXX_k1lbDHu9zWbTZYF2V-llLy-dTe_j-Pp2MrvtEPVd2CeZk7Kg8R8nNBWebIEVxIyzhPjmbANKA1tqpDOrPPONFlbIZQxGLeyGaQw4qTBWi8TZA_FqnlR_04HkY5KEG81gPJtH8OtoNp7FAefL-OH3u08P8GU0nY4mD-PbGdxNqVmTm7FvFX37AKPJV_hjPLmJAUk6SoVPhfZVEFXhZcW01HCGeEBjoUpapkAuFoJDPUGwVGvUksqCAvVKlLNGJNOAk4uVsMwG25viQqreXM7lT0Ly3KUIF9xx1UtypVaP1HeLustddnXoYzPtjO1x5aTt-sM3RymuKcXjGrlV-rgLPiF3nthjoahp2-NehrqLNGrd1xyEoqYgWwWzkJYmTsjWWom0PZfPVBj0evAbStTMIgz6P_f7ffiG2yANDYNBbYMWC6GNhbNwTjiqMobSPYzH5iSf9a0C6VaPAeWywjw_7mMLGurLvfs76HfPvucrD307FcK5Ly3UMSvYxl-kklZ57U6Jr3FJp7SZwNU3bzhsdu-C0l1Vcc8U8lKBlvIOhweNKp09h1ZNJuyFnXfdjZY_6ia4JMnbcYjookz950G7ysCc17J4DF5UXjPkvOlAgaH-ne_Jvv7aS-5gyoCDoxpgT2Ov3Ph1ozOWL-pdFAK8Y9VulqatmkVcZWr_ULBEqfzKe9IldrltlcRjWLDc4KF2RwNlM_Cg-Q2MqpQ_HeptYzp3O4E2yFooZ_JtNde0p3alUeCDX-ImUy5PoUwHYS1b7bBTKENLcY0Q7gkJ83A_vRw0VaFh9XvcvJEm3hdeN_a_0A1l2oxZoA0cnjR-BaIMrF5dPj_7wj9pwkxXJyYEFlolLKfVTMswZZYB7QPHrSOsuAFToeBTJhJhjb9EXtdXdX-4n171IcWCyvKrVJVMqBUJMSdVQqyoa2tU5bfJoWvGTIbGPz5Tv5J9vcfllDs5ZSWn_I6ci1yRXF5tT_bSmypjqx6vsIxbh8N2pEFNU3Vj_Ri029CrAMvxK2e3zLv4H3lfV9I0_SCvrCs2Ng3YFi4u6HHrh5KetvS5FiGYfd8q-2JvD7GUJK-wNNJUSPCr-oXetPz7C73R6P0LWSTXnA9Oz9yAjlVhy7e1qENAl_zkZPAeOkzz7NKsHt_3odOhzW3pj6UcmHZytkrCK1wukgYm5zwn47p86SID1Su_RS9xfU67-uCctIte_g6__wINDYAL))
- [Host-bulk APIs (Default fingerprinting policy)](https://github.com/NVIDIA/cuCollections/blob/dev/examples/bloom_filter/host_bulk_example.cu) (see [live example in godbolt](https://godbolt.org/clientstate/eJydVmtvGjkU_StXsx8WmuEVbVUJQiSapLtoK5IF2qpaVsjj8TBWBnvqBwRF-e977ZmBgZBqtVRqwL6Pc889vvZzoJnWXAod9P9-Dngc9HthkBGxsmTFgn5AbUyCMNDSKup-d94tBLyDG5nvFF-lBhq0CZfdy99CmHwd345HcHM_fbifjubj-0nb2Xr7z5wyoVkMVsRMgUkZjHJC8U-5E8JXphwQuGx3oeEMFkG5twiaAx9lJy2syQ6ENGA1wzBcQ8IzBuyJstwAF0DlOs84EZTBlpvUpyrjeDjwvQwiI0PQnqBHjr-SuiUQs4fuPqkxeb_T2W63beJht6VadbLCWHc-j2_uJrO7FkLfu30RGTILiv2wXGHh0Q5IjsgoiRBvRrYgFZCVYrhnpEO-VdxwsQpBy8RsiWI-Tsy1UTyy5oi8CifWXzdA-ohA4kYzGM8WAXwczcaz0Mf5Np7_cf9lDt9G0-loMh_fzeB-is2a3I5dq_DXJxhNvsOf48ltCAypw1TsKVeuCoTKHa0sLjicMXYEI5EFLJ0zyhNOoVIQrOSGKYFlQc7UmhdaQ5Cxj5PxNTfE-LVXxflUnYVYiF-4oJmNGVxRS2UnyqRcL7Hvhqk2ten1sY1JldWmQ6UVpu02X23FbIMplhtGjVTnTdgTo9YBW-YSm7Y7b6Wxuwyl1j7FwCU2hZG1X-bCoOK4aGwkj5sL8YyFgVukWLdxHIOw6-Uj22kntiH0ur92u90B7D-dTucKfmeCKWJYuQ3O_nwkkxduw0Pcd9Btvx-UkcbIrjKe64QrbSAlWeLjuWCy3PD0vpFAvErQKjMPXntoG5Xd0uhx6SvzOPCr22xVmygR8M0tszvynKmDBmaXF0I7tgCcBE5eaLL0JkOXfbB3neVk60553auYDDFLiM0MFA12mjwF5Kt3kuv365q7qnJdl_GeaxW-lKm1ift91KCBqys8kR9t9ojAPO-e47fxJFgOU7lyJK6Knru5WMDsY6yFgNoHw_tcTMRZmbtQZ79_pPMaatewRtU5P1v3LpWiG26rHbEVyrYZeo82ZnDfe80yDbFOK_nSWyHvdZdB3QAdvVj2thd1sRRWYh-mcDjaqgIcYBwafF7M5Tz3Ds6wlDOJ40aFIiwzNX_KWiRldu0scRBidxoF8BASkml2zN1ZR1F3PDoptRhlKX9Zpna107efqziFN1xane1KDeGs35eGjnN3EepU2iyGIh34q80oy1q51HixbBj44YHEzB-mw16dFTyq7i7Ur6gJD4VXjf0vcH2ZJiUG8Bbzt7W7RpjwqE6Gizui3N3W9QOhvWOuZEQyvN7wQomJIahzZamxGCushSmjsKeUR9yga8HrSd2fHqbXXTxjOZbl5oUskGArIkSOrHhfXtVWq8oNhmPTlOiUafcEif3MwnrP0yn2dIqSTvEGnUkmkS7HtgM7LI64X2xU8vIXWuNYbGcaVF8qT6yTQbMJnTJgIb9Cu0Xe5H_kPa2kvvSTvKJ5fkA6UeKLBb9XJPhl17dyPTmsnw48xVAVArr48wVfq-4NiK9CdXjUBmJDae_yve3htsxN8eINWhhoSC8ueh-gRRRNh3q9_NCFVgvvLYP_GczB4lZG1pF_Bmc8qsWklGa4uCkerriA9YrH4CWs9vHmONpH7oKXf_y_fwHeCexw))
60 changes: 57 additions & 3 deletions benchmarks/bloom_filter/add_bench.cu
Original file line number Diff line number Diff line change
Expand Up @@ -41,9 +41,9 @@ template <typename Key, typename Hash, typename Word, nvbench::int32_t WordsPerB
void bloom_filter_add(nvbench::state& state,
nvbench::type_list<Key, Hash, Word, nvbench::enum_type<WordsPerBlock>, Dist>)
{
using policy_type = cuco::bloom_filter_policy<rebind_hasher_t<Hash, Key>,
Word,
static_cast<std::uint32_t>(WordsPerBlock)>;
using policy_type = cuco::default_filter_policy<rebind_hasher_t<Hash, Key>,
Word,
static_cast<std::uint32_t>(WordsPerBlock)>;
using filter_type =
cuco::bloom_filter<Key, cuco::extent<size_t>, cuda::thread_scope_device, policy_type>;

Expand Down Expand Up @@ -83,6 +83,51 @@ void bloom_filter_add(nvbench::state& state,
});
}

/**
* @brief A benchmark evaluating `cuco::bloom_filter::add_async` performance with
* `arrow_filter_policy`
*/
template <typename Key, typename Dist>
void arrow_bloom_filter_add(nvbench::state& state, nvbench::type_list<Key, Dist>)
{
using policy_type = cuco::arrow_filter_policy<Key>;
using filter_type =
cuco::bloom_filter<Key, cuco::extent<size_t>, cuda::thread_scope_device, policy_type>;

auto const num_keys = state.get_int64("NumInputs");
auto const filter_size_mb = state.get_int64("FilterSizeMB");

std::size_t const num_sub_filters =
(filter_size_mb * 1024 * 1024) /
(sizeof(typename filter_type::word_type) * filter_type::words_per_block);

if (num_sub_filters > policy_type::max_filter_blocks) {
state.skip("bloom filter with arrow policy should have <= 4194304 blocks"); // skip invalid
// configurations
}

thrust::device_vector<Key> keys(num_keys);

key_generator gen;
gen.generate(dist_from_state<Dist>(state), keys.begin(), keys.end());

state.add_element_count(num_keys);

filter_type filter{num_sub_filters};

state.collect_dram_throughput();
state.collect_l1_hit_rates();
state.collect_l2_hit_rates();
state.collect_loads_efficiency();
state.collect_stores_efficiency();

add_fpr_summary(state, filter);

state.exec([&](nvbench::launch& launch) {
filter.add_async(keys.begin(), keys.end(), {launch.get_stream()});
});
}

NVBENCH_BENCH_TYPES(bloom_filter_add,
NVBENCH_TYPE_AXES(nvbench::type_list<defaults::BF_KEY>,
nvbench::type_list<defaults::BF_HASH>,
Expand Down Expand Up @@ -118,3 +163,12 @@ NVBENCH_BENCH_TYPES(bloom_filter_add,
.set_max_noise(defaults::MAX_NOISE)
.add_int64_axis("NumInputs", {defaults::BF_N})
.add_int64_axis("FilterSizeMB", {defaults::BF_SIZE_MB});

NVBENCH_BENCH_TYPES(arrow_bloom_filter_add,
NVBENCH_TYPE_AXES(nvbench::type_list<defaults::BF_KEY>,
nvbench::type_list<distribution::unique>))
.set_name("arrow_bloom_filter_add_unique_size")
.set_type_axes_names({"Key", "Distribution"})
.set_max_noise(defaults::MAX_NOISE)
.add_int64_axis("NumInputs", {defaults::BF_N})
.add_int64_axis("FilterSizeMB", defaults::BF_SIZE_MB_RANGE_CACHE);
67 changes: 63 additions & 4 deletions benchmarks/bloom_filter/contains_bench.cu
Original file line number Diff line number Diff line change
Expand Up @@ -43,9 +43,9 @@ void bloom_filter_contains(
{
// cudaDeviceSetLimit(cudaLimitMaxL2FetchGranularity, 32); // slightly improves peformance if
// filter block fits into a 32B sector
using policy_type = cuco::bloom_filter_policy<rebind_hasher_t<Hash, Key>,
Word,
static_cast<std::uint32_t>(WordsPerBlock)>;
using policy_type = cuco::default_filter_policy<rebind_hasher_t<Hash, Key>,
Word,
static_cast<std::uint32_t>(WordsPerBlock)>;
using filter_type =
cuco::bloom_filter<Key, cuco::extent<size_t>, cuda::thread_scope_device, policy_type>;

Expand Down Expand Up @@ -88,6 +88,56 @@ void bloom_filter_contains(
});
}

/**
* @brief A benchmark evaluating `cuco::bloom_filter::contains_async` performance with
* `arrow_filter_policy`
*/
template <typename Key, typename Dist>
void arrow_bloom_filter_contains(nvbench::state& state, nvbench::type_list<Key, Dist>)
{
// cudaDeviceSetLimit(cudaLimitMaxL2FetchGranularity, 32); // slightly improves peformance if
// filter block fits into a 32B sector
using policy_type = cuco::arrow_filter_policy<Key>;
using filter_type =
cuco::bloom_filter<Key, cuco::extent<size_t>, cuda::thread_scope_device, policy_type>;

auto const num_keys = state.get_int64("NumInputs");
auto const filter_size_mb = state.get_int64("FilterSizeMB");

std::size_t const num_sub_filters =
(filter_size_mb * 1024 * 1024) /
(sizeof(typename filter_type::word_type) * filter_type::words_per_block);

if (num_sub_filters > policy_type::max_filter_blocks) {
state.skip("bloom filter with arrow policy should have <= 4194304 blocks"); // skip invalid
// configurations
}

thrust::device_vector<Key> keys(num_keys);
thrust::device_vector<bool> result(num_keys, false);

key_generator gen;
gen.generate(dist_from_state<Dist>(state), keys.begin(), keys.end());

state.add_element_count(num_keys);

filter_type filter{num_sub_filters};

state.collect_dram_throughput();
state.collect_l1_hit_rates();
state.collect_l2_hit_rates();
state.collect_loads_efficiency();
state.collect_stores_efficiency();

add_fpr_summary(state, filter);

filter.add(keys.begin(), keys.end());

state.exec([&](nvbench::launch& launch) {
filter.contains_async(keys.begin(), keys.end(), result.begin(), {launch.get_stream()});
});
}

NVBENCH_BENCH_TYPES(bloom_filter_contains,
NVBENCH_TYPE_AXES(nvbench::type_list<defaults::BF_KEY>,
nvbench::type_list<defaults::BF_HASH>,
Expand Down Expand Up @@ -122,4 +172,13 @@ NVBENCH_BENCH_TYPES(bloom_filter_contains,
.set_type_axes_names({"Key", "Hash", "Word", "WordsPerBlock", "Distribution"})
.set_max_noise(defaults::MAX_NOISE)
.add_int64_axis("NumInputs", {defaults::BF_N})
.add_int64_axis("FilterSizeMB", {defaults::BF_SIZE_MB});
.add_int64_axis("FilterSizeMB", {defaults::BF_SIZE_MB});

NVBENCH_BENCH_TYPES(arrow_bloom_filter_contains,
NVBENCH_TYPE_AXES(nvbench::type_list<defaults::BF_KEY>,
nvbench::type_list<distribution::unique>))
.set_name("arrow_bloom_filter_contains_unique_size")
.set_type_axes_names({"Key", "Distribution"})
.set_max_noise(defaults::MAX_NOISE)
.add_int64_axis("NumInputs", {defaults::BF_N})
.add_int64_axis("FilterSizeMB", defaults::BF_SIZE_MB_RANGE_CACHE);
20 changes: 13 additions & 7 deletions examples/bloom_filter/host_bulk_example.cu
Original file line number Diff line number Diff line change
Expand Up @@ -25,15 +25,21 @@

int main(void)
{
// Generate 10'000 keys and insert the first 5'000 into the filter.
int constexpr num_keys = 10'000;
int constexpr num_tp = num_keys * 0.5;
int constexpr num_tn = num_keys - num_tp;
int constexpr num_keys = 10'000; ///< Generate 10'000 keys
int constexpr num_tp = num_keys * 0.5; ///< Insert the first half keys into the filter.
int constexpr num_tn = num_keys - num_tp;
int constexpr sub_filters = 200; ///< 200 sub-filters per bloom filter

// Spawn a filter with 200 sub-filters.
cuco::bloom_filter<int> filter{200};
// key type for bloom filter
using key_type = int;

thrust::device_vector<int> keys(num_keys);
// Spawn a bloom filter with default policy and 200 sub-filters.
cuco::bloom_filter<key_type> filter{sub_filters};

std::cout << "Bulk insert into bloom filter with default fingerprint generation policy: "
<< std::endl;

thrust::device_vector<key_type> keys(num_keys);
thrust::sequence(keys.begin(), keys.end(), 1);

auto tp_begin = keys.begin();
Expand Down
8 changes: 4 additions & 4 deletions include/cuco/bloom_filter.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@

#pragma once

#include <cuco/bloom_filter_policy.cuh>
#include <cuco/bloom_filter_policies.cuh>
#include <cuco/bloom_filter_ref.cuh>
#include <cuco/detail/storage/storage_base.cuh>
#include <cuco/extent.cuh>
Expand Down Expand Up @@ -55,13 +55,13 @@ namespace cuco {
* @tparam Extent Size type that is used to determine the number of blocks in the filter
* @tparam Scope The scope in which operations will be performed by individual threads
* @tparam Policy Type that defines how to generate and store key fingerprints (see
* `cuco/bloom_filter_policy.cuh`)
* `cuco/bloom_filter_policies.cuh`)
* @tparam Allocator Type of allocator used for device-accessible storage
*/
template <class Key,
class Extent = cuco::extent<std::size_t>,
cuda::thread_scope Scope = cuda::thread_scope_device,
class Policy = cuco::bloom_filter_policy<cuco::xxhash_64<Key>, std::uint32_t, 8>,
class Policy = cuco::default_filter_policy<cuco::xxhash_64<Key>, std::uint32_t, 8>,
class Allocator = cuco::cuda_allocator<cuda::std::byte>>
class bloom_filter {
public:
Expand Down Expand Up @@ -109,7 +109,7 @@ class bloom_filter {
*
* @param num_blocks Number of sub-filters or blocks
* @param scope The scope in which operations will be performed
* @param policy Fingerprint generation policy (see `cuco/bloom_filter_policy.cuh`)
* @param policy Fingerprint generation policy (see `cuco/bloom_filter_policies.cuh`)
* @param alloc Allocator used for allocating device-accessible storage
* @param stream CUDA stream used to initialize the filter
*/
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,14 +16,25 @@

#pragma once

#include <cuco/detail/bloom_filter/bloom_filter_policy_impl.cuh>
#include <cuco/detail/bloom_filter/arrow_filter_policy.cuh>
#include <cuco/detail/bloom_filter/default_filter_policy_impl.cuh>

#include <cstdint>

namespace cuco {

/**
* @brief A policy that defines how a Blocked Bloom Filter generates and stores a key's fingerprint.
* @brief A policy that defines how Arrow Block-Split Bloom Filter generates and stores a key's
* fingerprint.
*
* @tparam Key The type of the values to generate a fingerprint for.
*/
template <class Key>
using arrow_filter_policy = detail::arrow_filter_policy<Key>;

/**
* @brief The default policy that defines how a Blocked Bloom Filter generates and stores a key's
* fingerprint.
*
* @note `Word` type must be an atomically updatable integral type. `WordsPerBlock` must
* be a power-of-two.
Expand All @@ -33,8 +44,8 @@ namespace cuco {
* @tparam WordsPerBlock Number of words/segments in each block
*/
template <class Hash, class Word, std::uint32_t WordsPerBlock>
class bloom_filter_policy {
using impl_type = cuco::detail::bloom_filter_policy_impl<Hash, Word, WordsPerBlock>;
class default_filter_policy {
using impl_type = cuco::detail::default_filter_policy_impl<Hash, Word, WordsPerBlock>;

public:
using hasher = typename impl_type::hasher; ///< Type of the hash function
Expand All @@ -48,7 +59,7 @@ class bloom_filter_policy {

public:
/**
* @brief Constructs the `bloom_filter_policy` object.
* @brief Constructs the `default_filter_policy` object.
*
* @throws Compile-time error if the specified number of words in a filter block is not a
* power-of-two or is larger than 32. If called from host: throws exception; If called from
Expand All @@ -64,8 +75,8 @@ class bloom_filter_policy {
* @param pattern_bits Number of bits in a key's fingerprint
* @param hash Hash function used to generate a key's fingerprint
*/
__host__ __device__ constexpr bloom_filter_policy(std::uint32_t pattern_bits = words_per_block,
Hash hash = {});
__host__ __device__ constexpr default_filter_policy(std::uint32_t pattern_bits = words_per_block,
Hash hash = {});

/**
* @brief Generates the hash value for a given key.
Expand Down Expand Up @@ -116,4 +127,4 @@ class bloom_filter_policy {

} // namespace cuco

#include <cuco/detail/bloom_filter/bloom_filter_policy.inl>
#include <cuco/detail/bloom_filter/default_filter_policy.inl>
4 changes: 2 additions & 2 deletions include/cuco/bloom_filter_ref.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ namespace cuco {
* @tparam Extent Size type that is used to determine the number of blocks in the filter
* @tparam Scope The scope in which operations will be performed by individual threads
* @tparam Policy Type that defines how to generate and store key fingerprints (see
* `cuco/bloom_filter_policy.cuh`)
* `cuco/bloom_filter_policies.cuh`)
*/
template <class Key, class Extent, cuda::thread_scope Scope, class Policy>
class bloom_filter_ref {
Expand All @@ -60,7 +60,7 @@ class bloom_filter_ref {
* @param data Pointer to the storage span of the filter
* @param num_blocks Number of sub-filters or blocks
* @param scope The scope in which operations will be performed
* @param policy Fingerprint generation policy (see `cuco/bloom_filter_policy.cuh`)
* @param policy Fingerprint generation policy (see `cuco/bloom_filter_policies.cuh`)
*/
__host__ __device__ explicit constexpr bloom_filter_ref(word_type* data,
Extent num_blocks,
Expand Down
Loading

0 comments on commit f4b764c

Please sign in to comment.