9 changes: 7 additions & 2 deletions cub/cub/device/device_reduce.cuh
@@ -469,7 +469,12 @@ public:
constexpr auto float_double_plus =
gpu_gpu_determinism && detail::is_one_of_v<accum_t, float, double> && detail::is_cuda_std_plus_v<ReductionOpT>;

constexpr auto supported = integral_fallback || float_double_plus || !gpu_gpu_determinism;
constexpr auto float_double_min_max_fallback =
gpu_gpu_determinism
&& detail::is_one_of_v<accum_t, float, double> && detail::is_cuda_minimum_maximum_v<ReductionOpT>;

constexpr auto supported =
integral_fallback || float_double_plus || float_double_min_max_fallback || !gpu_gpu_determinism;

// gpu_to_gpu determinism is only supported for integral types with cuda operators,
// float and double types with ::cuda::std::plus, or float and double types with
// ::cuda::minimum / ::cuda::maximum operators
@@ -496,7 +501,7 @@ public:
// If the conditions for gpu-to-gpu determinism or non-deterministic
// reduction are not met, we fall back to run-to-run determinism.
using determinism_t = ::cuda::std::conditional_t<
(gpu_gpu_determinism && integral_fallback)
(gpu_gpu_determinism && (integral_fallback || float_double_min_max_fallback))
|| (no_determinism && !(is_contiguous_fallback && is_plus_fallback && is_4b_or_greater)),
::cuda::execution::determinism::run_to_run_t,
default_determinism_t>;
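In short, the header change above accepts a gpu-to-gpu determinism request when the accumulator is float or double and the operator is cuda::minimum or cuda::maximum, and satisfies it through the run-to-run path (min/max results do not depend on how the reduction is grouped, NaNs and signed zeros aside). A minimal usage sketch of the call shape this enables follows; the construction of the execution environment via cuda::execution::require(cuda::execution::determinism::gpu_to_gpu) mirrors the `env` object used by the tests below but is an assumption here, since its definition sits outside this excerpt.

// Minimal usage sketch (not part of this PR): a float minimum reduce under a
// gpu-to-gpu determinism requirement, which the change above now accepts.
#include <cub/device/device_reduce.cuh>

#include <cuda/functional> // cuda::minimum
#include <cuda/std/limits>

#include <thrust/device_vector.h>

#include <vector>

int main()
{
  std::vector<float> h_in{3.5f, -1.25f, 7.0f, 0.5f};
  thrust::device_vector<float> d_in = h_in;
  thrust::device_vector<float> d_out(1);

  // Assumption: the environment is built with cuda::execution::require(
  // cuda::execution::determinism::gpu_to_gpu), matching the `env` used in the tests.
  auto env = cuda::execution::require(cuda::execution::determinism::gpu_to_gpu);

  const float init = cuda::std::numeric_limits<float>::max();
  const auto error = cub::DeviceReduce::Reduce(
    d_in.begin(), d_out.begin(), static_cast<int>(d_in.size()), cuda::minimum<float>{}, init, env);

  return error == cudaSuccess ? 0 : 1;
}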
95 changes: 59 additions & 36 deletions cub/test/catch2_test_device_reduce_deterministic.cu
@@ -329,7 +329,9 @@ C2H_TEST("Deterministic Device reduce works with float and double on gpu with di
}

using test_types =
c2h::type_list<int8_t,
c2h::type_list<float,
double,
int8_t,
int32_t,
int64_t
#if TEST_INT128()
@@ -356,75 +358,96 @@ C2H_TEST("Deterministic Device reduce works with integral types on gpu with diff
c2h::device_vector<type> d_input(num_items);
c2h::gen(C2H_SEED(2), d_input, min_value, max_value);

SECTION("plus")
if constexpr (::cuda::std::is_integral_v<type>)
{
c2h::device_vector<type> d_output(1);
SECTION("plus")
{
c2h::device_vector<type> d_output(1);

auto error =
cub::DeviceReduce::Reduce(d_input.begin(), d_output.begin(), num_items, cuda::std::plus<type>{}, init_t{}, env);
REQUIRE(error == cudaSuccess);
auto error =
cub::DeviceReduce::Reduce(d_input.begin(), d_output.begin(), num_items, cuda::std::plus<type>{}, init_t{}, env);
REQUIRE(error == cudaSuccess);

c2h::host_vector<type> h_input = d_input;
c2h::host_vector<type> h_input = d_input;

c2h::host_vector<type> h_expected(1);
// Requires `std::accumulate` to produce deterministic result which is required for comparison
// with the device RFA result.
// NOTE: `std::reduce` is not equivalent
h_expected[0] = std::accumulate(h_input.begin(), h_input.end(), init_t{}, cuda::std::plus<type>{});
c2h::host_vector<type> h_expected(1);
// Requires `std::accumulate` to produce deterministic result which is required for comparison
// with the device RFA result.
// NOTE: `std::reduce` is not equivalent
h_expected[0] = std::accumulate(h_input.begin(), h_input.end(), init_t{}, cuda::std::plus<type>{});

c2h::host_vector<type> h_output = d_output;
REQUIRE(h_expected == h_output);
}
c2h::host_vector<type> h_output = d_output;
REQUIRE(h_expected == h_output);
}

SECTION("minimum")
{
c2h::device_vector<type> d_output(1);
SECTION("bitwise xor")
{
c2h::device_vector<type> d_output(1);

init_t init_value{cuda::std::numeric_limits<init_t>::max()};
init_t init_value{};

auto error =
cub::DeviceReduce::Reduce(d_input.begin(), d_output.begin(), num_items, cuda::minimum<init_t>{}, init_value, env);
REQUIRE(error == cudaSuccess);
auto error = cub::DeviceReduce::Reduce(
d_input.begin(), d_output.begin(), num_items, cuda::std::bit_xor<>{}, init_value, env);
REQUIRE(error == cudaSuccess);

c2h::host_vector<type> h_input = d_input;
c2h::host_vector<type> h_expected(1);
h_expected[0] = std::accumulate(h_input.begin(), h_input.end(), type{init_value}, cuda::minimum<>{});
c2h::host_vector<type> h_input = d_input;
c2h::host_vector<type> h_expected(1);
h_expected[0] = std::accumulate(h_input.begin(), h_input.end(), type{init_value}, cuda::std::bit_xor<type>{});

c2h::host_vector<type> h_output = d_output;
REQUIRE(h_expected == h_output);
c2h::host_vector<type> h_output = d_output;
REQUIRE(h_expected == h_output);
}

SECTION("logical or")
{
c2h::device_vector<type> d_output(1);

init_t init_value{};

auto error = cub::DeviceReduce::Reduce(
d_input.begin(), d_output.begin(), num_items, cuda::std::logical_or<>{}, init_value, env);
REQUIRE(error == cudaSuccess);

c2h::host_vector<type> h_input = d_input;
c2h::host_vector<type> h_expected(1);
h_expected[0] = std::accumulate(h_input.begin(), h_input.end(), type{init_value}, cuda::std::logical_or<>{});

c2h::host_vector<type> h_output = d_output;
REQUIRE(h_expected == h_output);
}
}

SECTION("bitwise xor")
SECTION("minimum")
{
c2h::device_vector<type> d_output(1);

init_t init_value{};
init_t init_value{cuda::std::numeric_limits<init_t>::max()};

auto error =
cub::DeviceReduce::Reduce(d_input.begin(), d_output.begin(), num_items, cuda::std::bit_xor<>{}, init_value, env);
cub::DeviceReduce::Reduce(d_input.begin(), d_output.begin(), num_items, cuda::minimum<init_t>{}, init_value, env);
REQUIRE(error == cudaSuccess);

c2h::host_vector<type> h_input = d_input;
c2h::host_vector<type> h_expected(1);
h_expected[0] = std::accumulate(h_input.begin(), h_input.end(), type{init_value}, cuda::std::bit_xor<type>{});
h_expected[0] = std::accumulate(h_input.begin(), h_input.end(), type{init_value}, cuda::minimum<>{});

c2h::host_vector<type> h_output = d_output;
REQUIRE(h_expected == h_output);
}

SECTION("logical or")
SECTION("maximum")
{
c2h::device_vector<type> d_output(1);

init_t init_value{};
init_t init_value{cuda::std::numeric_limits<init_t>::min()};

auto error = cub::DeviceReduce::Reduce(
d_input.begin(), d_output.begin(), num_items, cuda::std::logical_or<>{}, init_value, env);
auto error =
cub::DeviceReduce::Reduce(d_input.begin(), d_output.begin(), num_items, cuda::maximum<init_t>{}, init_value, env);
REQUIRE(error == cudaSuccess);

c2h::host_vector<type> h_input = d_input;
c2h::host_vector<type> h_expected(1);
h_expected[0] = std::accumulate(h_input.begin(), h_input.end(), type{init_value}, cuda::std::logical_or<>{});
h_expected[0] = std::accumulate(h_input.begin(), h_input.end(), type{init_value}, cuda::maximum<>{});

c2h::host_vector<type> h_output = d_output;
REQUIRE(h_expected == h_output);
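The new minimum and maximum sections compare the float/double device result directly against a host std::accumulate reference, which works because min/max, unlike addition, yields the same value for every grouping of the inputs; that is also the property the header change relies on when routing these cases to the run-to-run path. A small host-side illustration, not part of the PR and assuming no NaNs and no fast-math reassociation:

#include <algorithm>
#include <cstdio>

int main()
{
  const float a = 1e8f, b = -1e8f, c = 1.0f;

  // Reassociating a float sum changes the rounded result...
  const float sum_lr = (a + b) + c; // 1.0f
  const float sum_rl = a + (b + c); // 0.0f, since b + c rounds to -1e8f
  std::printf("sum: %g vs %g\n", sum_lr, sum_rl);

  // ...but reassociating a min (or max) does not.
  const float min_lr = std::min(std::min(a, b), c);
  const float min_rl = std::min(a, std::min(b, c));
  std::printf("min: %g vs %g\n", min_lr, min_rl); // identical
  return 0;
}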