Add WarpReduce Device-Side Benchmarks (#6431)

fbusato · bernhardmgruber · web-flow · commit 5466563dc2a3 · 2025-11-05T00:52:09.000Z
Co-authored-by: Bernhard Manfred Gruber <bernhardmgruber@gmail.com>
diff --git a/cub/benchmarks/bench/reduce/warp_reduce_base.cuh b/cub/benchmarks/bench/reduce/warp_reduce_base.cuh
@@ -0,0 +1,41 @@
+// SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#pragma once
+
+#include <cub/config.cuh>
+
+#include <cuda_runtime_api.h>
+#include <device_side_benchmark.cuh>
+#include <nvbench_helper.cuh>
+
+// Functor handed to benchmark_kernel: performs one cub::WarpReduce reduction of `thread_data` with the
+// operator type `op_t`, which the including .cu file must define before including this header.
+struct benchmark_op_t
+{
+  // Reduces `thread_data` with op_t{} through cub::WarpReduce<T> and returns the value produced by Reduce().
+  template <typename T>
+  __device__ __forceinline__ T operator()(T thread_data) const
+  {
+    using warp_reduce_t  = cub::WarpReduce<T>;
+    using temp_storage_t = typename warp_reduce_t::TempStorage;
+
+    constexpr unsigned int warp_threads = 32;
+    // One storage slot per warp; 32 slots cover 32 * 32 = 1024 threads.
+    // NOTE(review): indexing uses threadIdx.x only, so a one-dimensional block is assumed.
+    constexpr int max_warps_per_block = 32;
+    __shared__ temp_storage_t warp_storage[max_warps_per_block];
+
+    const unsigned int warp_index = threadIdx.x / warp_threads;
+    warp_reduce_t reducer{warp_storage[warp_index]};
+    return reducer.Reduce(thread_data, op_t{});
+  }
+};
+
+// NVBench benchmark body measuring repeated cub::WarpReduce calls (see benchmark_op_t) for value type T.
+//
+// Launch configuration: 256 threads per block; the grid holds as many blocks as the occupancy query reports
+// can be resident per SM, multiplied by the SM count of the device attached to `state`.
+// NOTE(review): the occupancy query runs against the current CUDA device; this presumably matches
+// state.get_device() because NVBench selects the device before invoking the benchmark - confirm.
+//
+// state: NVBench state of the current run; must have a device attached (value() is called on it).
+// nvbench::type_list<T>: tag selecting the value type under test.
+template <typename T>
+void warp_reduce(nvbench::state& state, nvbench::type_list<T>)
+{
+  constexpr int block_size    = 256;
+  constexpr int unroll_factor = 128; // compromise between compile time and noise
+  const auto& kernel          = benchmark_kernel<block_size, unroll_factor, benchmark_op_t, T>;
+  const int num_SMs           = state.get_device().value().get_number_of_sms();
+  int max_blocks_per_SM       = 0;
+  NVBENCH_CUDA_CALL_NOEXCEPT(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&max_blocks_per_SM, kernel, block_size, 0));
+  const int grid_size = max_blocks_per_SM * num_SMs;
+  state.exec(nvbench::exec_tag::gpu | nvbench::exec_tag::no_batch, [&](nvbench::launch& launch) {
+    // Launch on the stream NVBench provides for this measurement instead of the default stream.
+    kernel<<<grid_size, block_size, 0, launch.get_stream()>>>(benchmark_op_t{});
+  });
+}
+
+// Registers warp_reduce under the name "warp_reduce" with one type axis, named "T{ct}", taken from
+// `value_types`. `value_types` is not defined in this header; the including .cu file must provide it.
+NVBENCH_BENCH_TYPES(warp_reduce, NVBENCH_TYPE_AXES(value_types)).set_name("warp_reduce").set_type_axes_names({"T{ct}"});
diff --git a/cub/benchmarks/bench/reduce/warp_reduce_min.cu b/cub/benchmarks/bench/reduce/warp_reduce_min.cu
@@ -0,0 +1,24 @@
+// SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include <nvbench_helper.cuh>
+
+// Value types benchmarked for the minimum reduction.
+// Complex types are left out because they cannot be compared with operator<.
+// int128_t, __half and __nv_bfloat16 are only listed when the corresponding feature macros are set.
+// NOTE(review): the half/bfloat16 entries additionally require _CCCL_CTK_AT_LEAST(12, 2), presumably a
+// CUDA Toolkit 12.2+ requirement - confirm the reason.
+using value_types =
+  nvbench::type_list<int8_t,
+                     int16_t,
+                     int32_t,
+#if NVBENCH_HELPER_HAS_I128
+                     int128_t,
+#endif
+#if _CCCL_HAS_NVFP16() && _CCCL_CTK_AT_LEAST(12, 2)
+                     __half,
+#endif
+#if _CCCL_HAS_NVBF16() && _CCCL_CTK_AT_LEAST(12, 2)
+                     __nv_bfloat16,
+#endif
+                     float,
+                     double>;
+
+// Reduction operator consumed by warp_reduce_base.cuh; must be defined before that header is included.
+using op_t = ::cuda::minimum<>;
+#include "warp_reduce_base.cuh"
diff --git a/cub/benchmarks/bench/reduce/warp_reduce_sum.cu b/cub/benchmarks/bench/reduce/warp_reduce_sum.cu
@@ -0,0 +1,31 @@
+// SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include <nvbench_helper.cuh>
+
+// Value types benchmarked for the sum reduction: integers, floating-point types and their complex
+// counterparts. int128_t, __half, __nv_bfloat16 and the complex types built on the latter two are only
+// listed when the corresponding feature macros are set.
+// NOTE(review): the half/bfloat16 entries additionally require _CCCL_CTK_AT_LEAST(12, 2), presumably a
+// CUDA Toolkit 12.2+ requirement - confirm the reason.
+using value_types = nvbench::type_list<
+  int8_t,
+  int16_t,
+  int32_t,
+#if NVBENCH_HELPER_HAS_I128
+  int128_t,
+#endif
+#if _CCCL_HAS_NVFP16() && _CCCL_CTK_AT_LEAST(12, 2)
+  __half,
+#endif
+#if _CCCL_HAS_NVBF16() && _CCCL_CTK_AT_LEAST(12, 2)
+  __nv_bfloat16,
+#endif
+  float,
+  double,
+#if _CCCL_HAS_NVFP16() && _CCCL_CTK_AT_LEAST(12, 2)
+  cuda::std::complex<__half>,
+#endif
+#if _CCCL_HAS_NVBF16() && _CCCL_CTK_AT_LEAST(12, 2)
+  cuda::std::complex<__nv_bfloat16>,
+#endif
+  cuda::std::complex<float>,
+  cuda::std::complex<double>>;
+
+// Reduction operator consumed by warp_reduce_base.cuh; must be defined before that header is included.
+using op_t = ::cuda::std::plus<>;
+#include "warp_reduce_base.cuh"
diff --git a/nvbench_helper/nvbench_helper/device_side_benchmark.cuh b/nvbench_helper/nvbench_helper/device_side_benchmark.cuh
@@ -0,0 +1,43 @@
+// SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#pragma once
+
+#include <cuda/cmath>
+#include <cuda/ptx>
+#include <cuda/std/cstddef>
+#include <cuda/std/cstdint>
+#include <cuda/std/cstring>
+#include <cuda/utility>
+
+// Produces a value of type T whose bytes are copied from successive reads of the clock special register.
+//
+// Enough 32-bit words are read to cover sizeof(T) (rounded up); the first sizeof(T) bytes are then
+// memcpy'd into the result, so T must be default-constructible and safe to fill bytewise.
+// NOTE(review): the values are clock samples rather than random numbers; presumably the purpose is to hand
+// the compiler an input it cannot constant-fold - confirm.
+template <typename T>
+__device__ __forceinline__ static T generate_random_data()
+{
+  constexpr ::cuda::std::size_t num_words = cuda::ceil_div(sizeof(T), sizeof(uint32_t));
+  uint32_t words[num_words];
+  // Unsigned index: avoids comparing a signed int against the unsigned word count.
+  for (::cuda::std::size_t i = 0; i < num_words; ++i)
+  {
+    words[i] = cuda::ptx::get_sreg_clock();
+  }
+  T ret;
+  ::cuda::std::memcpy(&ret, words, sizeof(T));
+  return ret;
+}
+
+// Global buffer that sink() writes into (16 ints).
+__device__ static int device_var[16];
+
+// Consumes `value` by conditionally copying its bytes into device_var.
+//
+// The copy only happens when the SM id special register equals 0xFFFFFFFF.
+// NOTE(review): that id is presumably never observed at run time, so the store should never execute while
+// the compiler still has to keep `value` (and the work that produced it) alive - confirm.
+template <typename T>
+__device__ __forceinline__ static void sink(T value)
+{
+  static_assert(sizeof(T) <= sizeof(device_var), "sink(): T does not fit into device_var");
+  if (cuda::ptx::get_sreg_smid() == static_cast<uint32_t>(-1))
+  {
+    // Bytewise copy instead of a store through reinterpret_cast<T*>: device_var is an int array, so a typed
+    // store would break aliasing rules and could be under-aligned for 16-byte types.
+    ::cuda::std::memcpy(device_var, &value, sizeof(T));
+  }
+}
+
+// Device-side benchmark driver.
+//
+// Each thread seeds a value of type T with generate_random_data<T>(), feeds it through `action`
+// UnrollFactor times (every call consumes the previous call's result, expanded via cuda::static_for),
+// and finally passes the result to sink().
+//
+// BlockThreads: value given to __launch_bounds__.
+// UnrollFactor: number of chained `action` invocations.
+// ActionT:      callable invoked as action(T); its result is assigned back to a T.
+template <int BlockThreads, int UnrollFactor, typename ActionT, typename T>
+__launch_bounds__(BlockThreads) __global__ static void benchmark_kernel(_CCCL_GRID_CONSTANT const ActionT action)
+{
+  T chained = generate_random_data<T>();
+  cuda::static_for<UnrollFactor>([&action, &chained](auto /*step*/) {
+    chained = action(chained);
+  });
+  sink(chained);
+}
diff --git a/nvbench_helper/nvbench_helper/nvbench_helper.cuh b/nvbench_helper/nvbench_helper/nvbench_helper.cuh
@@ -33,7 +33,17 @@ NVBENCH_DECLARE_TYPE_STRINGS(uint128_t, "U128", "uint128_t");
 
 using complex = cuda::std::complex<float>;
 
-NVBENCH_DECLARE_TYPE_STRINGS(complex, "C64", "complex");
+// Type strings for the half-precision, bfloat16 and complex value types.
+// The complex tags carry the bit width of one component: C16 (__half), CB16 (__nv_bfloat16),
+// C32 (float, through the `complex` alias) and C64 (double).
+// NOTE(review): the second and third macro arguments are presumably the short tag and the longer
+// description NVBench prints - confirm against the NVBench macro documentation.
+#if _CCCL_HAS_NVFP16()
+NVBENCH_DECLARE_TYPE_STRINGS(__half, "Half", "half");
+NVBENCH_DECLARE_TYPE_STRINGS(cuda::std::complex<__half>, "C16", "complex_half");
+#endif
+#if _CCCL_HAS_NVBF16()
+NVBENCH_DECLARE_TYPE_STRINGS(__nv_bfloat16, "Bfloat16", "bfloat16");
+NVBENCH_DECLARE_TYPE_STRINGS(cuda::std::complex<__nv_bfloat16>, "CB16", "complex_bfloat16");
+#endif
+NVBENCH_DECLARE_TYPE_STRINGS(complex, "C32", "complex32");
+NVBENCH_DECLARE_TYPE_STRINGS(cuda::std::complex<double>, "C64", "complex64");
+
 NVBENCH_DECLARE_TYPE_STRINGS(::cuda::std::false_type, "false", "false_type");
 NVBENCH_DECLARE_TYPE_STRINGS(::cuda::std::true_type, "true", "true_type");
 NVBENCH_DECLARE_TYPE_STRINGS(cub::ArgMin, "ArgMin", "cub::ArgMin");