Disable --expt-relaxed-constexpr with CCCL enhancements (#595)

With CCCL now more mature, the need to build cuCollections using `--expt-relaxed-constexpr` is no longer necessary. This PR updates the implementations to support disabling `--expt-relaxed-constexpr`.
NVIDIA · Aug 29, 2024 · 4454de4 · 4454de4
1 parent b55e38d
commit 4454de4
Show file tree

Hide file tree

Showing 17 changed files with 96 additions and 87 deletions.
diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt
@@ -35,7 +35,7 @@ function(ConfigureBench BENCH_NAME)
                                         RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/benchmarks")
     target_include_directories(${BENCH_NAME} PRIVATE
                                              "${CMAKE_CURRENT_SOURCE_DIR}")
-    target_compile_options(${BENCH_NAME} PRIVATE --expt-extended-lambda --expt-relaxed-constexpr -lineinfo)
+    target_compile_options(${BENCH_NAME} PRIVATE --expt-extended-lambda -lineinfo)
     target_link_libraries(${BENCH_NAME} PRIVATE
                                         nvbench::main
                                         pthread

diff --git a/benchmarks/hash_function/hash_function_bench.cu b/benchmarks/hash_function/hash_function_bench.cu
@@ -21,9 +21,9 @@
 
 #include <nvbench/nvbench.cuh>
 
+#include <cuda/std/cstddef>
 #include <thrust/device_vector.h>
 
-#include <cstddef>
 #include <cstdint>
 #include <type_traits>
 
@@ -139,8 +139,8 @@ __global__ void string_hash_bench_kernel(
 template <typename Hash>
 void string_hash_eval(nvbench::state& state, nvbench::type_list<Hash>)
 {
-  static_assert(std::is_same_v<typename Hash::argument_type, std::byte>,
-                "Argument type must be std::byte");
+  static_assert(std::is_same_v<typename Hash::argument_type, cuda::std::byte>,
+                "Argument type must be cuda::std::byte");
 
   bool const materialize_result = false;
   constexpr auto block_size     = 128;
@@ -164,7 +164,7 @@ void string_hash_eval(nvbench::state& state, nvbench::type_list<Hash>)
                                                                                      : 1);
 
   state.add_element_count(num_keys);
-  // state.add_global_memory_reads<std::byte>(storage.size() * n_repeats);
+  // state.add_global_memory_reads<cuda::std::byte>(storage.size() * n_repeats);
 
   state.exec([&](nvbench::launch& launch) {
     string_hash_bench_kernel<block_size><<<grid_size, block_size, 0, launch.get_stream()>>>(
@@ -196,12 +196,13 @@ NVBENCH_BENCH_TYPES(
   .set_max_noise(cuco::benchmark::defaults::MAX_NOISE)
   .add_int64_axis("NumInputs", {cuco::benchmark::defaults::N * 10});
 
-NVBENCH_BENCH_TYPES(string_hash_eval,
-                    NVBENCH_TYPE_AXES(nvbench::type_list<cuco::murmurhash3_32<std::byte>,
-                                                         cuco::xxhash_32<std::byte>,
-                                                         cuco::xxhash_64<std::byte>,
-                                                         cuco::murmurhash3_x86_128<std::byte>,
-                                                         cuco::murmurhash3_x64_128<std::byte>>))
+NVBENCH_BENCH_TYPES(
+  string_hash_eval,
+  NVBENCH_TYPE_AXES(nvbench::type_list<cuco::murmurhash3_32<cuda::std::byte>,
+                                       cuco::xxhash_32<cuda::std::byte>,
+                                       cuco::xxhash_64<cuda::std::byte>,
+                                       cuco::murmurhash3_x86_128<cuda::std::byte>,
+                                       cuco::murmurhash3_x64_128<cuda::std::byte>>))
   .set_name("string_hash_function_eval")
   .set_type_axes_names({"Hash"})
   .set_max_noise(cuco::benchmark::defaults::MAX_NOISE)

diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
@@ -1,5 +1,5 @@
 #=============================================================================
-# Copyright (c) 2018-2023, NVIDIA CORPORATION.
+# Copyright (c) 2018-2024, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -25,7 +25,7 @@ function(ConfigureExample EXAMPLE_NAME EXAMPLE_SRC)
     target_include_directories(${EXAMPLE_NAME} PRIVATE
                                              "${CMAKE_CURRENT_SOURCE_DIR}")
     target_compile_options(${EXAMPLE_NAME} PRIVATE --compiler-options=-Wall --compiler-options=-Wextra
-      --expt-extended-lambda --expt-relaxed-constexpr -Xcompiler -Wno-subobject-linkage)
+      --expt-extended-lambda -Xcompiler -Wno-subobject-linkage)
     target_link_libraries(${EXAMPLE_NAME} PRIVATE cuco CUDA::cudart)
 endfunction(ConfigureExample)
 

diff --git a/examples/static_set/device_subsets_example.cu b/examples/static_set/device_subsets_example.cu
@@ -64,7 +64,7 @@ using ref_type         = cuco::static_set_ref<key_type,
                                               storage_ref_type>;  ///< Set ref type
 
 /// Sample data to insert and query
-__device__ constexpr std::array<key_type, N> data = {1, 3, 5, 7, 9, 11, 13, 15, 17, 19};
+__device__ constexpr cuda::std::array<key_type, N> data = {1, 3, 5, 7, 9, 11, 13, 15, 17, 19};
 /// Empty slots are represented by reserved "sentinel" values. These values should be selected such
 /// that they never occur in your input data.
 key_type constexpr empty_key_sentinel = -1;

diff --git a/include/cuco/detail/__config b/include/cuco/detail/__config
@@ -25,10 +25,6 @@
 #error "NVCC version 11.5 or later is required"
 #endif
 
-#if !defined(__CUDACC_RELAXED_CONSTEXPR__)
-#error "Support for relaxed constexpr is required (nvcc flag --expt-relaxed-constexpr)"
-#endif
-
 #if !defined(__CUDACC_EXTENDED_LAMBDA__)
 #error "Support for extended device lambdas is required (nvcc flag --expt-extended-lambda)"
 #endif

diff --git a/include/cuco/detail/hash_functions/murmurhash3.cuh b/include/cuco/detail/hash_functions/murmurhash3.cuh
@@ -20,9 +20,9 @@
 #include <cuco/extent.cuh>
 
 #include <cuda/std/array>
+#include <cuda/std/cstddef>
 #include <cuda/std/type_traits>
 
-#include <cstddef>
 #include <cstdint>
 
 namespace cuco::detail {
@@ -146,7 +146,7 @@ struct MurmurHash3_32 {
    */
   constexpr result_type __host__ __device__ operator()(Key const& key) const noexcept
   {
-    return compute_hash(reinterpret_cast<std::byte const*>(&key),
+    return compute_hash(reinterpret_cast<cuda::std::byte const*>(&key),
                         cuco::extent<std::size_t, sizeof(Key)>{});
   }
 
@@ -160,7 +160,7 @@ struct MurmurHash3_32 {
    * @return The resulting hash value
    */
   template <typename Extent>
-  constexpr result_type __host__ __device__ compute_hash(std::byte const* bytes,
+  constexpr result_type __host__ __device__ compute_hash(cuda::std::byte const* bytes,
                                                          Extent size) const noexcept
   {
     auto const nblocks = size / 4;
@@ -183,10 +183,14 @@ struct MurmurHash3_32 {
     // tail
     std::uint32_t k1 = 0;
     switch (size & 3) {
-      case 3: k1 ^= std::to_integer<std::uint32_t>(bytes[nblocks * 4 + 2]) << 16; [[fallthrough]];
-      case 2: k1 ^= std::to_integer<std::uint32_t>(bytes[nblocks * 4 + 1]) << 8; [[fallthrough]];
+      case 3:
+        k1 ^= cuda::std::to_integer<std::uint32_t>(bytes[nblocks * 4 + 2]) << 16;
+        [[fallthrough]];
+      case 2:
+        k1 ^= cuda::std::to_integer<std::uint32_t>(bytes[nblocks * 4 + 1]) << 8;
+        [[fallthrough]];
       case 1:
-        k1 ^= std::to_integer<std::uint32_t>(bytes[nblocks * 4 + 0]);
+        k1 ^= cuda::std::to_integer<std::uint32_t>(bytes[nblocks * 4 + 0]);
         k1 *= c1;
         k1 = rotl32(k1, 15);
         k1 *= c2;
@@ -247,7 +251,7 @@ struct MurmurHash3_x64_128 {
    */
   constexpr result_type __host__ __device__ operator()(Key const& key) const noexcept
   {
-    return compute_hash(reinterpret_cast<std::byte const*>(&key),
+    return compute_hash(reinterpret_cast<cuda::std::byte const*>(&key),
                         cuco::extent<std::size_t, sizeof(Key)>{});
   }
 
@@ -261,7 +265,7 @@ struct MurmurHash3_x64_128 {
    * @return The resulting hash value
    */
   template <typename Extent>
-  constexpr result_type __host__ __device__ compute_hash(std::byte const* bytes,
+  constexpr result_type __host__ __device__ compute_hash(cuda::std::byte const* bytes,
                                                          Extent size) const noexcept
   {
     constexpr std::uint32_t block_size = 16;
@@ -390,7 +394,7 @@ struct MurmurHash3_x86_128 {
    */
   constexpr result_type __host__ __device__ operator()(Key const& key) const noexcept
   {
-    return compute_hash(reinterpret_cast<std::byte const*>(&key),
+    return compute_hash(reinterpret_cast<cuda::std::byte const*>(&key),
                         cuco::extent<std::size_t, sizeof(Key)>{});
   }
 
@@ -404,7 +408,7 @@ struct MurmurHash3_x86_128 {
    * @return The resulting hash value
    */
   template <typename Extent>
-  constexpr result_type __host__ __device__ compute_hash(std::byte const* bytes,
+  constexpr result_type __host__ __device__ compute_hash(cuda::std::byte const* bytes,
                                                          Extent size) const noexcept
   {
     constexpr std::uint32_t block_size = 16;

diff --git a/include/cuco/detail/hash_functions/utils.cuh b/include/cuco/detail/hash_functions/utils.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -16,12 +16,14 @@
 
 #pragma once
 
+#include <cuda/std/cstddef>
+
 namespace cuco::detail {
 
 template <typename T, typename U, typename Extent>
 constexpr __host__ __device__ T load_chunk(U const* const data, Extent index) noexcept
 {
-  auto const bytes = reinterpret_cast<std::byte const*>(data);
+  auto const bytes = reinterpret_cast<cuda::std::byte const*>(data);
   T chunk;
   memcpy(&chunk, bytes + index * sizeof(T), sizeof(T));
   return chunk;
@@ -37,4 +39,4 @@ constexpr __host__ __device__ std::uint64_t rotl64(std::uint64_t x, std::int8_t
   return (x << r) | (x >> (64 - r));
 }
 
-};  // namespace cuco::detail
+};  // namespace cuco::detail
diff --git a/include/cuco/detail/hash_functions/xxhash.cuh b/include/cuco/detail/hash_functions/xxhash.cuh
@@ -19,7 +19,8 @@
 #include <cuco/detail/hash_functions/utils.cuh>
 #include <cuco/extent.cuh>
 
-#include <cstddef>
+#include <cuda/std/cstddef>
+
 #include <cstdint>
 
 namespace cuco::detail {
@@ -91,10 +92,10 @@ struct XXHash_32 {
   {
     if constexpr (sizeof(Key) <= 16) {
       Key const key_copy = key;
-      return compute_hash(reinterpret_cast<std::byte const*>(&key_copy),
+      return compute_hash(reinterpret_cast<cuda::std::byte const*>(&key_copy),
                           cuco::extent<std::size_t, sizeof(Key)>{});
     } else {
-      return compute_hash(reinterpret_cast<std::byte const*>(&key),
+      return compute_hash(reinterpret_cast<cuda::std::byte const*>(&key),
                           cuco::extent<std::size_t, sizeof(Key)>{});
     }
   }
@@ -109,7 +110,7 @@ struct XXHash_32 {
    * @return The resulting hash value
    */
   template <typename Extent>
-  constexpr result_type __host__ __device__ compute_hash(std::byte const* bytes,
+  constexpr result_type __host__ __device__ compute_hash(cuda::std::byte const* bytes,
                                                          Extent size) const noexcept
   {
     std::size_t offset = 0;
@@ -159,7 +160,7 @@ struct XXHash_32 {
     // the following loop is only needed if the size of the key is not a multiple of the block size
     if (size % 4) {
       while (offset < size) {
-        h32 += (std::to_integer<std::uint32_t>(bytes[offset]) & 255) * prime5;
+        h32 += (cuda::std::to_integer<std::uint32_t>(bytes[offset]) & 255) * prime5;
         h32 = rotl32(h32, 11) * prime1;
         ++offset;
       }
@@ -254,10 +255,10 @@ struct XXHash_64 {
   {
     if constexpr (sizeof(Key) <= 16) {
       Key const key_copy = key;
-      return compute_hash(reinterpret_cast<std::byte const*>(&key_copy),
+      return compute_hash(reinterpret_cast<cuda::std::byte const*>(&key_copy),
                           cuco::extent<std::size_t, sizeof(Key)>{});
     } else {
-      return compute_hash(reinterpret_cast<std::byte const*>(&key),
+      return compute_hash(reinterpret_cast<cuda::std::byte const*>(&key),
                           cuco::extent<std::size_t, sizeof(Key)>{});
     }
   }
@@ -272,7 +273,7 @@ struct XXHash_64 {
    * @return The resulting hash value
    */
   template <typename Extent>
-  constexpr result_type __host__ __device__ compute_hash(std::byte const* bytes,
+  constexpr result_type __host__ __device__ compute_hash(cuda::std::byte const* bytes,
                                                          Extent size) const noexcept
   {
     std::size_t offset = 0;
@@ -357,7 +358,7 @@ struct XXHash_64 {
     // block size
     if (size % 4) {
       while (offset < size) {
-        h64 ^= (std::to_integer<std::uint32_t>(bytes[offset]) & 0xff) * prime5;
+        h64 ^= (cuda::std::to_integer<std::uint32_t>(bytes[offset]) & 0xff) * prime5;
         h64 = rotl64(h64, 11) * prime1;
         ++offset;
       }

diff --git a/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh b/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh
@@ -475,8 +475,8 @@ class hyperloglog_ref {
     cuco::sketch_size_kb sketch_size_kb) noexcept
   {
     // minimum precision is 4 or 64 bytes
-    return std::max(static_cast<std::size_t>(sizeof(register_type) * 1ull << 4),
-                    cuda::std::bit_floor(static_cast<std::size_t>(sketch_size_kb * 1024)));
+    return cuda::std::max(static_cast<std::size_t>(sizeof(register_type) * 1ull << 4),
+                          cuda::std::bit_floor(static_cast<std::size_t>(sketch_size_kb * 1024)));
   }
 
   /**
@@ -493,7 +493,7 @@ class hyperloglog_ref {
     // https://github.com/apache/spark/blob/6a27789ad7d59cd133653a49be0bb49729542abe/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/HyperLogLogPlusPlusHelper.scala#L43
 
     //  minimum precision is 4 or 64 bytes
-    auto const precision = std::max(
+    auto const precision = cuda::std::max(
       static_cast<int32_t>(4),
       static_cast<int32_t>(
         cuda::std::ceil(2.0 * cuda::std::log(1.106 / standard_deviation) / cuda::std::log(2.0))));

diff --git a/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh b/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh
@@ -213,8 +213,8 @@ class open_addressing_ref_impl {
    *
    * @return The key equality predicate
    */
-  [[nodiscard]] __device__ constexpr detail::equal_wrapper<key_type, key_equal> const& predicate()
-    const noexcept
+  [[nodiscard]] __host__ __device__ constexpr detail::equal_wrapper<key_type, key_equal> const&
+  predicate() const noexcept
   {
     return this->predicate_;
   }
@@ -255,7 +255,7 @@ class open_addressing_ref_impl {
    *
    * @return The non-owning storage ref of the container
    */
-  [[nodiscard]] __device__ constexpr storage_ref_type const& storage_ref() const noexcept
+  [[nodiscard]] __host__ __device__ constexpr storage_ref_type const& storage_ref() const noexcept
   {
     return storage_ref_;
   }
@@ -1142,7 +1142,8 @@ class open_addressing_ref_impl {
    * @return The key
    */
   template <typename Value>
-  [[nodiscard]] __device__ constexpr auto const& extract_key(Value const& value) const noexcept
+  [[nodiscard]] __host__ __device__ constexpr auto const& extract_key(
+    Value const& value) const noexcept
   {
     if constexpr (this->has_payload) {
       return thrust::raw_reference_cast(value).first;

diff --git a/include/cuco/detail/probe_sequence_impl.cuh b/include/cuco/detail/probe_sequence_impl.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -45,7 +45,7 @@ class probe_sequence_base {
    *
    * @return The number of elements loaded with each vector load
    */
-  static constexpr uint32_t vector_width() noexcept { return 2u; }
+  static __host__ __device__ constexpr uint32_t vector_width() noexcept { return 2u; }
 };
 
 /**
@@ -210,7 +210,7 @@ class linear_probing_impl
   __device__ __forceinline__ iterator
   initial_slot(cooperative_groups::thread_block_tile<cg_size> const& g, ProbeKey const& k) noexcept
   {
-    return const_cast<iterator>(std::as_const(*this).initial_slot(g, k));
+    return const_cast<iterator>(cuda::std::as_const(*this).initial_slot(g, k));
   }
 
   /**
@@ -257,7 +257,7 @@ class linear_probing_impl
    */
   __device__ __forceinline__ iterator next_slot(iterator s) noexcept
   {
-    return const_cast<iterator>(std::as_const(*this).next_slot(s));
+    return const_cast<iterator>(cuda::std::as_const(*this).next_slot(s));
   }
 
   /**
@@ -364,7 +364,7 @@ class double_hashing_impl
   __device__ __forceinline__ iterator
   initial_slot(cooperative_groups::thread_block_tile<cg_size> const& g, ProbeKey const& k) noexcept
   {
-    return const_cast<iterator>(std::as_const(*this).initial_slot(g, k));
+    return const_cast<iterator>(cuda::std::as_const(*this).initial_slot(g, k));
   }
 
   /**
@@ -409,7 +409,7 @@ class double_hashing_impl
    */
   __device__ __forceinline__ iterator next_slot(iterator s) noexcept
   {
-    return const_cast<iterator>(std::as_const(*this).next_slot(s));
+    return const_cast<iterator>(cuda::std::as_const(*this).next_slot(s));
   }
 
   /**