Merge branch 'dev' into bucket-refactoring

NVIDIA · Oct 31, 2024 · f4b764c · f4b764c
2 parents 0d45a26 + 69817e2
commit f4b764c
Show file tree

Hide file tree

Showing 20 changed files with 585 additions and 64 deletions.
diff --git a/README.md b/README.md
@@ -254,4 +254,4 @@ We plan to add many GPU-accelerated, concurrent data structures to `cuCollection
 `cuco::bloom_filter` implements a Blocked Bloom Filter for approximate set membership queries.
 
 #### Examples:
-- [Host-bulk APIs](https://github.com/NVIDIA/cuCollections/blob/dev/examples/bloom_filter/host_bulk_example.cu) (see [live example in godbolt](https://godbolt.org/clientstate/eJydVm1v20YM_iuE9mF2I78FCwo4L4CXpJuxwsnstEUxD8HpRFuHynfavdgxgvz38U6SLSduMSwBEptHPnz4kEfpOTJojFDSRMO_niORRsNBHOVMLh1bYjSMuEtZFEdGOc399967uYR3cK2KrRbLzEKLt-G0f_pLDJPP45vxCK7vpvd309HD-G7S9b7B_6PgKA2m4GSKGmyGMCoYp3_VSQyfUXsicNrtQ8s7zKPqbB61zwPKVjlYsS1IZcEZJBhhYCFyBHziWFgQErhaFblgkiNshM1Cqgon0IGvFYhKLCN_RhEFfVs0PYHZHXX_k1lbDHu9zWbTZYF2V-llLy-dTe_j-Pp2MrvtEPVd2CeZk7Kg8R8nNBWebIEVxIyzhPjmbANKA1tqpDOrPPONFlbIZQxGLeyGaQw4qTBWi8TZA_FqnlR_04HkY5KEG81gPJtH8OtoNp7FAefL-OH3u08P8GU0nY4mD-PbGdxNqVmTm7FvFX37AKPJV_hjPLmJAUk6SoVPhfZVEFXhZcW01HCGeEBjoUpapkAuFoJDPUGwVGvUksqCAvVKlLNGJNOAk4uVsMwG25viQqreXM7lT0Ly3KUIF9xx1UtypVaP1HeLustddnXoYzPtjO1x5aTt-sM3RymuKcXjGrlV-rgLPiF3nthjoahp2-NehrqLNGrd1xyEoqYgWwWzkJYmTsjWWom0PZfPVBj0evAbStTMIgz6P_f7ffiG2yANDYNBbYMWC6GNhbNwTjiqMobSPYzH5iSf9a0C6VaPAeWywjw_7mMLGurLvfs76HfPvucrD307FcK5Ly3UMSvYxl-kklZ57U6Jr3FJp7SZwNU3bzhsdu-C0l1Vcc8U8lKBlvIOhweNKp09h1ZNJuyFnXfdjZY_6ia4JMnbcYjookz950G7ysCc17J4DF5UXjPkvOlAgaH-ne_Jvv7aS-5gyoCDoxpgT2Ov3Ph1ozOWL-pdFAK8Y9VulqatmkVcZWr_ULBEqfzKe9IldrltlcRjWLDc4KF2RwNlM_Cg-Q2MqpQ_HeptYzp3O4E2yFooZ_JtNde0p3alUeCDX-ImUy5PoUwHYS1b7bBTKENLcY0Q7gkJ83A_vRw0VaFh9XvcvJEm3hdeN_a_0A1l2oxZoA0cnjR-BaIMrF5dPj_7wj9pwkxXJyYEFlolLKfVTMswZZYB7QPHrSOsuAFToeBTJhJhjb9EXtdXdX-4n171IcWCyvKrVJVMqBUJMSdVQqyoa2tU5bfJoWvGTIbGPz5Tv5J9vcfllDs5ZSWn_I6ci1yRXF5tT_bSmypjqx6vsIxbh8N2pEFNU3Vj_Ri029CrAMvxK2e3zLv4H3lfV9I0_SCvrCs2Ng3YFi4u6HHrh5KetvS5FiGYfd8q-2JvD7GUJK-wNNJUSPCr-oXetPz7C73R6P0LWSTXnA9Oz9yAjlVhy7e1qENAl_zkZPAeOkzz7NKsHt_3odOhzW3pj6UcmHZytkrCK1wukgYm5zwn47p86SID1Su_RS9xfU67-uCctIte_g6__wINDYAL))
+- [Host-bulk APIs (Default fingerprinting policy)](https://github.com/NVIDIA/cuCollections/blob/dev/examples/bloom_filter/host_bulk_example.cu) (see [live example in godbolt](https://godbolt.org/clientstate/eJydVmtvGjkU_StXsx8WmuEVbVUJQiSapLtoK5IF2qpaVsjj8TBWBnvqBwRF-e977ZmBgZBqtVRqwL6Pc889vvZzoJnWXAod9P9-Dngc9HthkBGxsmTFgn5AbUyCMNDSKup-d94tBLyDG5nvFF-lBhq0CZfdy99CmHwd345HcHM_fbifjubj-0nb2Xr7z5wyoVkMVsRMgUkZjHJC8U-5E8JXphwQuGx3oeEMFkG5twiaAx9lJy2syQ6ENGA1wzBcQ8IzBuyJstwAF0DlOs84EZTBlpvUpyrjeDjwvQwiI0PQnqBHjr-SuiUQs4fuPqkxeb_T2W63beJht6VadbLCWHc-j2_uJrO7FkLfu30RGTILiv2wXGHh0Q5IjsgoiRBvRrYgFZCVYrhnpEO-VdxwsQpBy8RsiWI-Tsy1UTyy5oi8CifWXzdA-ohA4kYzGM8WAXwczcaz0Mf5Np7_cf9lDt9G0-loMh_fzeB-is2a3I5dq_DXJxhNvsOf48ltCAypw1TsKVeuCoTKHa0sLjicMXYEI5EFLJ0zyhNOoVIQrOSGKYFlQc7UmhdaQ5Cxj5PxNTfE-LVXxflUnYVYiF-4oJmNGVxRS2UnyqRcL7Hvhqk2ten1sY1JldWmQ6UVpu02X23FbIMplhtGjVTnTdgTo9YBW-YSm7Y7b6Wxuwyl1j7FwCU2hZG1X-bCoOK4aGwkj5sL8YyFgVukWLdxHIOw6-Uj22kntiH0ur92u90B7D-dTucKfmeCKWJYuQ3O_nwkkxduw0Pcd9Btvx-UkcbIrjKe64QrbSAlWeLjuWCy3PD0vpFAvErQKjMPXntoG5Xd0uhx6SvzOPCr22xVmygR8M0tszvynKmDBmaXF0I7tgCcBE5eaLL0JkOXfbB3neVk60553auYDDFLiM0MFA12mjwF5Kt3kuv365q7qnJdl_GeaxW-lKm1ift91KCBqys8kR9t9ojAPO-e47fxJFgOU7lyJK6Knru5WMDsY6yFgNoHw_tcTMRZmbtQZ79_pPMaatewRtU5P1v3LpWiG26rHbEVyrYZeo82ZnDfe80yDbFOK_nSWyHvdZdB3QAdvVj2thd1sRRWYh-mcDjaqgIcYBwafF7M5Tz3Ds6wlDOJ40aFIiwzNX_KWiRldu0scRBidxoF8BASkml2zN1ZR1F3PDoptRhlKX9Zpna107efqziFN1xane1KDeGs35eGjnN3EepU2iyGIh34q80oy1q51HixbBj44YHEzB-mw16dFTyq7i7Ur6gJD4VXjf0vcH2ZJiUG8Bbzt7W7RpjwqE6Gizui3N3W9QOhvWOuZEQyvN7wQomJIahzZamxGCushSmjsKeUR9yga8HrSd2fHqbXXTxjOZbl5oUskGArIkSOrHhfXtVWq8oNhmPTlOiUafcEif3MwnrP0yn2dIqSTvEGnUkmkS7HtgM7LI64X2xU8vIXWuNYbGcaVF8qT6yTQbMJnTJgIb9Cu0Xe5H_kPa2kvvSTvKJ5fkA6UeKLBb9XJPhl17dyPTmsnw48xVAVArr48wVfq-4NiK9CdXjUBmJDae_yve3htsxN8eINWhhoSC8ueh-gRRRNh3q9_NCFVgvvLYP_GczB4lZG1pF_Bmc8qsWklGa4uCkerriA9YrH4CWs9vHmONpH7oKXf_y_fwHeCexw))
diff --git a/benchmarks/bloom_filter/add_bench.cu b/benchmarks/bloom_filter/add_bench.cu
@@ -41,9 +41,9 @@ template <typename Key, typename Hash, typename Word, nvbench::int32_t WordsPerB
 void bloom_filter_add(nvbench::state& state,
                       nvbench::type_list<Key, Hash, Word, nvbench::enum_type<WordsPerBlock>, Dist>)
 {
-  using policy_type = cuco::bloom_filter_policy<rebind_hasher_t<Hash, Key>,
-                                                Word,
-                                                static_cast<std::uint32_t>(WordsPerBlock)>;
+  using policy_type = cuco::default_filter_policy<rebind_hasher_t<Hash, Key>,
+                                                  Word,
+                                                  static_cast<std::uint32_t>(WordsPerBlock)>;
   using filter_type =
     cuco::bloom_filter<Key, cuco::extent<size_t>, cuda::thread_scope_device, policy_type>;
 
@@ -83,6 +83,51 @@ void bloom_filter_add(nvbench::state& state,
   });
 }
 
+/**
+ * @brief A benchmark evaluating `cuco::bloom_filter::add_async` performance with
+ * `arrow_filter_policy`
+ */
+template <typename Key, typename Dist>
+void arrow_bloom_filter_add(nvbench::state& state, nvbench::type_list<Key, Dist>)
+{
+  using policy_type = cuco::arrow_filter_policy<Key>;
+  using filter_type =
+    cuco::bloom_filter<Key, cuco::extent<size_t>, cuda::thread_scope_device, policy_type>;
+
+  auto const num_keys       = state.get_int64("NumInputs");
+  auto const filter_size_mb = state.get_int64("FilterSizeMB");
+
+  std::size_t const num_sub_filters =
+    (filter_size_mb * 1024 * 1024) /
+    (sizeof(typename filter_type::word_type) * filter_type::words_per_block);
+
+  if (num_sub_filters > policy_type::max_filter_blocks) {
+    state.skip("bloom filter with arrow policy should have <= 4194304 blocks");  // skip invalid
+                                                                                 // configurations
+  }
+
+  thrust::device_vector<Key> keys(num_keys);
+
+  key_generator gen;
+  gen.generate(dist_from_state<Dist>(state), keys.begin(), keys.end());
+
+  state.add_element_count(num_keys);
+
+  filter_type filter{num_sub_filters};
+
+  state.collect_dram_throughput();
+  state.collect_l1_hit_rates();
+  state.collect_l2_hit_rates();
+  state.collect_loads_efficiency();
+  state.collect_stores_efficiency();
+
+  add_fpr_summary(state, filter);
+
+  state.exec([&](nvbench::launch& launch) {
+    filter.add_async(keys.begin(), keys.end(), {launch.get_stream()});
+  });
+}
+
 NVBENCH_BENCH_TYPES(bloom_filter_add,
                     NVBENCH_TYPE_AXES(nvbench::type_list<defaults::BF_KEY>,
                                       nvbench::type_list<defaults::BF_HASH>,
@@ -118,3 +163,12 @@ NVBENCH_BENCH_TYPES(bloom_filter_add,
   .set_max_noise(defaults::MAX_NOISE)
   .add_int64_axis("NumInputs", {defaults::BF_N})
   .add_int64_axis("FilterSizeMB", {defaults::BF_SIZE_MB});
+
+NVBENCH_BENCH_TYPES(arrow_bloom_filter_add,
+                    NVBENCH_TYPE_AXES(nvbench::type_list<defaults::BF_KEY>,
+                                      nvbench::type_list<distribution::unique>))
+  .set_name("arrow_bloom_filter_add_unique_size")
+  .set_type_axes_names({"Key", "Distribution"})
+  .set_max_noise(defaults::MAX_NOISE)
+  .add_int64_axis("NumInputs", {defaults::BF_N})
+  .add_int64_axis("FilterSizeMB", defaults::BF_SIZE_MB_RANGE_CACHE);
diff --git a/benchmarks/bloom_filter/contains_bench.cu b/benchmarks/bloom_filter/contains_bench.cu
@@ -43,9 +43,9 @@ void bloom_filter_contains(
 {
   // cudaDeviceSetLimit(cudaLimitMaxL2FetchGranularity, 32); // slightly improves peformance if
   // filter block fits into a 32B sector
-  using policy_type = cuco::bloom_filter_policy<rebind_hasher_t<Hash, Key>,
-                                                Word,
-                                                static_cast<std::uint32_t>(WordsPerBlock)>;
+  using policy_type = cuco::default_filter_policy<rebind_hasher_t<Hash, Key>,
+                                                  Word,
+                                                  static_cast<std::uint32_t>(WordsPerBlock)>;
   using filter_type =
     cuco::bloom_filter<Key, cuco::extent<size_t>, cuda::thread_scope_device, policy_type>;
 
@@ -88,6 +88,56 @@ void bloom_filter_contains(
   });
 }
 
+/**
+ * @brief A benchmark evaluating `cuco::bloom_filter::contains_async` performance with
+ * `arrow_filter_policy`
+ */
+template <typename Key, typename Dist>
+void arrow_bloom_filter_contains(nvbench::state& state, nvbench::type_list<Key, Dist>)
+{
+  // cudaDeviceSetLimit(cudaLimitMaxL2FetchGranularity, 32); // slightly improves peformance if
+  // filter block fits into a 32B sector
+  using policy_type = cuco::arrow_filter_policy<Key>;
+  using filter_type =
+    cuco::bloom_filter<Key, cuco::extent<size_t>, cuda::thread_scope_device, policy_type>;
+
+  auto const num_keys       = state.get_int64("NumInputs");
+  auto const filter_size_mb = state.get_int64("FilterSizeMB");
+
+  std::size_t const num_sub_filters =
+    (filter_size_mb * 1024 * 1024) /
+    (sizeof(typename filter_type::word_type) * filter_type::words_per_block);
+
+  if (num_sub_filters > policy_type::max_filter_blocks) {
+    state.skip("bloom filter with arrow policy should have <= 4194304 blocks");  // skip invalid
+                                                                                 // configurations
+  }
+
+  thrust::device_vector<Key> keys(num_keys);
+  thrust::device_vector<bool> result(num_keys, false);
+
+  key_generator gen;
+  gen.generate(dist_from_state<Dist>(state), keys.begin(), keys.end());
+
+  state.add_element_count(num_keys);
+
+  filter_type filter{num_sub_filters};
+
+  state.collect_dram_throughput();
+  state.collect_l1_hit_rates();
+  state.collect_l2_hit_rates();
+  state.collect_loads_efficiency();
+  state.collect_stores_efficiency();
+
+  add_fpr_summary(state, filter);
+
+  filter.add(keys.begin(), keys.end());
+
+  state.exec([&](nvbench::launch& launch) {
+    filter.contains_async(keys.begin(), keys.end(), result.begin(), {launch.get_stream()});
+  });
+}
+
 NVBENCH_BENCH_TYPES(bloom_filter_contains,
                     NVBENCH_TYPE_AXES(nvbench::type_list<defaults::BF_KEY>,
                                       nvbench::type_list<defaults::BF_HASH>,
@@ -122,4 +172,13 @@ NVBENCH_BENCH_TYPES(bloom_filter_contains,
   .set_type_axes_names({"Key", "Hash", "Word", "WordsPerBlock", "Distribution"})
   .set_max_noise(defaults::MAX_NOISE)
   .add_int64_axis("NumInputs", {defaults::BF_N})
-  .add_int64_axis("FilterSizeMB", {defaults::BF_SIZE_MB});
+  .add_int64_axis("FilterSizeMB", {defaults::BF_SIZE_MB});
+
+NVBENCH_BENCH_TYPES(arrow_bloom_filter_contains,
+                    NVBENCH_TYPE_AXES(nvbench::type_list<defaults::BF_KEY>,
+                                      nvbench::type_list<distribution::unique>))
+  .set_name("arrow_bloom_filter_contains_unique_size")
+  .set_type_axes_names({"Key", "Distribution"})
+  .set_max_noise(defaults::MAX_NOISE)
+  .add_int64_axis("NumInputs", {defaults::BF_N})
+  .add_int64_axis("FilterSizeMB", defaults::BF_SIZE_MB_RANGE_CACHE);
diff --git a/examples/bloom_filter/host_bulk_example.cu b/examples/bloom_filter/host_bulk_example.cu
@@ -25,15 +25,21 @@
 
 int main(void)
 {
-  // Generate 10'000 keys and insert the first 5'000 into the filter.
-  int constexpr num_keys = 10'000;
-  int constexpr num_tp   = num_keys * 0.5;
-  int constexpr num_tn   = num_keys - num_tp;
+  int constexpr num_keys    = 10'000;          ///< Generate 10'000 keys
+  int constexpr num_tp      = num_keys * 0.5;  ///< Insert the first half keys into the filter.
+  int constexpr num_tn      = num_keys - num_tp;
+  int constexpr sub_filters = 200;  ///< 200 sub-filters per bloom filter
 
-  // Spawn a filter with 200 sub-filters.
-  cuco::bloom_filter<int> filter{200};
+  // key type for bloom filter
+  using key_type = int;
 
-  thrust::device_vector<int> keys(num_keys);
+  // Spawn a bloom filter with default policy and 200 sub-filters.
+  cuco::bloom_filter<key_type> filter{sub_filters};
+
+  std::cout << "Bulk insert into bloom filter with default fingerprint generation policy: "
+            << std::endl;
+
+  thrust::device_vector<key_type> keys(num_keys);
   thrust::sequence(keys.begin(), keys.end(), 1);
 
   auto tp_begin = keys.begin();

diff --git a/include/cuco/bloom_filter.cuh b/include/cuco/bloom_filter.cuh
@@ -16,7 +16,7 @@
 
 #pragma once
 
-#include <cuco/bloom_filter_policy.cuh>
+#include <cuco/bloom_filter_policies.cuh>
 #include <cuco/bloom_filter_ref.cuh>
 #include <cuco/detail/storage/storage_base.cuh>
 #include <cuco/extent.cuh>
@@ -55,13 +55,13 @@ namespace cuco {
  * @tparam Extent Size type that is used to determine the number of blocks in the filter
  * @tparam Scope The scope in which operations will be performed by individual threads
  * @tparam Policy Type that defines how to generate and store key fingerprints (see
- * `cuco/bloom_filter_policy.cuh`)
+ * `cuco/bloom_filter_policies.cuh`)
  * @tparam Allocator Type of allocator used for device-accessible storage
  */
 template <class Key,
           class Extent             = cuco::extent<std::size_t>,
           cuda::thread_scope Scope = cuda::thread_scope_device,
-          class Policy    = cuco::bloom_filter_policy<cuco::xxhash_64<Key>, std::uint32_t, 8>,
+          class Policy    = cuco::default_filter_policy<cuco::xxhash_64<Key>, std::uint32_t, 8>,
           class Allocator = cuco::cuda_allocator<cuda::std::byte>>
 class bloom_filter {
  public:
@@ -109,7 +109,7 @@ class bloom_filter {
    *
    * @param num_blocks Number of sub-filters or blocks
    * @param scope The scope in which operations will be performed
-   * @param policy Fingerprint generation policy (see `cuco/bloom_filter_policy.cuh`)
+   * @param policy Fingerprint generation policy (see `cuco/bloom_filter_policies.cuh`)
    * @param alloc Allocator used for allocating device-accessible storage
    * @param stream CUDA stream used to initialize the filter
    */

diff --git a/include/cuco/bloom_filter_policy.cuh → include/cuco/bloom_filter_policies.cuh b/include/cuco/bloom_filter_policy.cuh → include/cuco/bloom_filter_policies.cuh
@@ -16,14 +16,25 @@
 
 #pragma once
 
-#include <cuco/detail/bloom_filter/bloom_filter_policy_impl.cuh>
+#include <cuco/detail/bloom_filter/arrow_filter_policy.cuh>
+#include <cuco/detail/bloom_filter/default_filter_policy_impl.cuh>
 
 #include <cstdint>
 
 namespace cuco {
 
 /**
- * @brief A policy that defines how a Blocked Bloom Filter generates and stores a key's fingerprint.
+ * @brief A policy that defines how Arrow Block-Split Bloom Filter generates and stores a key's
+ * fingerprint.
+ *
+ * @tparam Key The type of the values to generate a fingerprint for.
+ */
+template <class Key>
+using arrow_filter_policy = detail::arrow_filter_policy<Key>;
+
+/**
+ * @brief The default policy that defines how a Blocked Bloom Filter generates and stores a key's
+ * fingerprint.
  *
  * @note `Word` type must be an atomically updatable integral type. `WordsPerBlock` must
  * be a power-of-two.
@@ -33,8 +44,8 @@ namespace cuco {
  * @tparam WordsPerBlock Number of words/segments in each block
  */
 template <class Hash, class Word, std::uint32_t WordsPerBlock>
-class bloom_filter_policy {
-  using impl_type = cuco::detail::bloom_filter_policy_impl<Hash, Word, WordsPerBlock>;
+class default_filter_policy {
+  using impl_type = cuco::detail::default_filter_policy_impl<Hash, Word, WordsPerBlock>;
 
  public:
   using hasher             = typename impl_type::hasher;              ///< Type of the hash function
@@ -48,7 +59,7 @@ class bloom_filter_policy {
 
  public:
   /**
-   * @brief Constructs the `bloom_filter_policy` object.
+   * @brief Constructs the `default_filter_policy` object.
    *
    * @throws Compile-time error if the specified number of words in a filter block is not a
    * power-of-two or is larger than 32. If called from host: throws exception; If called from
@@ -64,8 +75,8 @@ class bloom_filter_policy {
    * @param pattern_bits Number of bits in a key's fingerprint
    * @param hash Hash function used to generate a key's fingerprint
    */
-  __host__ __device__ constexpr bloom_filter_policy(std::uint32_t pattern_bits = words_per_block,
-                                                    Hash hash                  = {});
+  __host__ __device__ constexpr default_filter_policy(std::uint32_t pattern_bits = words_per_block,
+                                                      Hash hash                  = {});
 
   /**
    * @brief Generates the hash value for a given key.
@@ -116,4 +127,4 @@ class bloom_filter_policy {
 
 }  // namespace cuco
 
-#include <cuco/detail/bloom_filter/bloom_filter_policy.inl>
+#include <cuco/detail/bloom_filter/default_filter_policy.inl>
diff --git a/include/cuco/bloom_filter_ref.cuh b/include/cuco/bloom_filter_ref.cuh
@@ -33,7 +33,7 @@ namespace cuco {
  * @tparam Extent Size type that is used to determine the number of blocks in the filter
  * @tparam Scope The scope in which operations will be performed by individual threads
  * @tparam Policy Type that defines how to generate and store key fingerprints (see
- * `cuco/bloom_filter_policy.cuh`)
+ * `cuco/bloom_filter_policies.cuh`)
  */
 template <class Key, class Extent, cuda::thread_scope Scope, class Policy>
 class bloom_filter_ref {
@@ -60,7 +60,7 @@ class bloom_filter_ref {
    * @param data Pointer to the storage span of the filter
    * @param num_blocks Number of sub-filters or blocks
    * @param scope The scope in which operations will be performed
-   * @param policy Fingerprint generation policy (see `cuco/bloom_filter_policy.cuh`)
+   * @param policy Fingerprint generation policy (see `cuco/bloom_filter_policies.cuh`)
    */
   __host__ __device__ explicit constexpr bloom_filter_ref(word_type* data,
                                                           Extent num_blocks,