9 changes: 7 additions & 2 deletions cub/cub/device/device_reduce.cuh
@@ -469,7 +469,12 @@ public:
constexpr auto float_double_plus =
gpu_gpu_determinism && detail::is_one_of_v<accum_t, float, double> && detail::is_cuda_std_plus_v<ReductionOpT>;

constexpr auto supported = integral_fallback || float_double_plus || !gpu_gpu_determinism;
constexpr auto float_double_min_max_fallback =
gpu_gpu_determinism
&& detail::is_one_of_v<accum_t, float, double> && detail::is_cuda_minimum_maximum_v<ReductionOpT>;

constexpr auto supported =
integral_fallback || float_double_plus || float_double_min_max_fallback || !gpu_gpu_determinism;

// gpu_to_gpu determinism is only supported for integral types with cuda operators,
// float and double types with ::cuda::std::plus, or float and double types with
// ::cuda::minimum / ::cuda::maximum operators
@@ -496,7 +501,7 @@ public:
// If the conditions for gpu-to-gpu determinism or non-deterministic
// reduction are not met, we fall back to run-to-run determinism.
using determinism_t = ::cuda::std::conditional_t<
(gpu_gpu_determinism && integral_fallback)
(gpu_gpu_determinism && (integral_fallback || float_double_min_max_fallback))
|| (no_determinism && !(is_contiguous_fallback && is_plus_fallback && is_4b_or_greater)),
::cuda::execution::determinism::run_to_run_t,
default_determinism_t>;
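In short, the header change above accepts a gpu-to-gpu determinism request when the accumulator is float or double and the operator is cuda::minimum or cuda::maximum, and satisfies it through the run-to-run path (min/max results do not depend on how the reduction is grouped, NaNs and signed zeros aside). A minimal usage sketch of the call shape this enables follows; the construction of the execution environment via cuda::execution::require(cuda::execution::determinism::gpu_to_gpu) mirrors the `env` object used by the tests below but is an assumption here, since its definition sits outside this excerpt.

// Minimal usage sketch (not part of this PR): a float minimum reduce under a
// gpu-to-gpu determinism requirement, which the change above now accepts.
#include <cub/device/device_reduce.cuh>

#include <cuda/functional> // cuda::minimum
#include <cuda/std/limits>

#include <thrust/device_vector.h>

#include <vector>

int main()
{
  std::vector<float> h_in{3.5f, -1.25f, 7.0f, 0.5f};
  thrust::device_vector<float> d_in = h_in;
  thrust::device_vector<float> d_out(1);

  // Assumption: the environment is built with cuda::execution::require(
  // cuda::execution::determinism::gpu_to_gpu), matching the `env` used in the tests.
  auto env = cuda::execution::require(cuda::execution::determinism::gpu_to_gpu);

  const float init = cuda::std::numeric_limits<float>::max();
  const auto error = cub::DeviceReduce::Reduce(
    d_in.begin(), d_out.begin(), static_cast<int>(d_in.size()), cuda::minimum<float>{}, init, env);

  return error == cudaSuccess ? 0 : 1;
}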
95 changes: 59 additions & 36 deletions cub/test/catch2_test_device_reduce_deterministic.cu
@@ -329,7 +329,9 @@ C2H_TEST("Deterministic Device reduce works with float and double on gpu with di
}

using test_types =
c2h::type_list<int8_t,
c2h::type_list<float,
double,
int8_t,
int32_t,
int64_t
#if TEST_INT128()
@@ -356,75 +358,96 @@ C2H_TEST("Deterministic Device reduce works with integral types on gpu with diff
c2h::device_vector<type> d_input(num_items);
c2h::gen(C2H_SEED(2), d_input, min_value, max_value);

SECTION("plus")
if constexpr (::cuda::std::is_integral_v<type>)
{
c2h::device_vector<type> d_output(1);
SECTION("plus")
{
c2h::device_vector<type> d_output(1);

auto error =
cub::DeviceReduce::Reduce(d_input.begin(), d_output.begin(), num_items, cuda::std::plus<type>{}, init_t{}, env);
REQUIRE(error == cudaSuccess);
auto error =
cub::DeviceReduce::Reduce(d_input.begin(), d_output.begin(), num_items, cuda::std::plus<type>{}, init_t{}, env);
REQUIRE(error == cudaSuccess);

c2h::host_vector<type> h_input = d_input;
c2h::host_vector<type> h_input = d_input;

c2h::host_vector<type> h_expected(1);
// Requires `std::accumulate` to produce deterministic result which is required for comparison
// with the device RFA result.
// NOTE: `std::reduce` is not equivalent
h_expected[0] = std::accumulate(h_input.begin(), h_input.end(), init_t{}, cuda::std::plus<type>{});
c2h::host_vector<type> h_expected(1);
// Requires `std::accumulate` to produce deterministic result which is required for comparison
// with the device RFA result.
// NOTE: `std::reduce` is not equivalent
h_expected[0] = std::accumulate(h_input.begin(), h_input.end(), init_t{}, cuda::std::plus<type>{});

c2h::host_vector<type> h_output = d_output;
REQUIRE(h_expected == h_output);
}
c2h::host_vector<type> h_output = d_output;
REQUIRE(h_expected == h_output);
}

SECTION("minimum")
{
c2h::device_vector<type> d_output(1);
SECTION("bitwise xor")
{
c2h::device_vector<type> d_output(1);

init_t init_value{cuda::std::numeric_limits<init_t>::max()};
init_t init_value{};

auto error =
cub::DeviceReduce::Reduce(d_input.begin(), d_output.begin(), num_items, cuda::minimum<init_t>{}, init_value, env);
REQUIRE(error == cudaSuccess);
auto error = cub::DeviceReduce::Reduce(
d_input.begin(), d_output.begin(), num_items, cuda::std::bit_xor<>{}, init_value, env);
REQUIRE(error == cudaSuccess);

c2h::host_vector<type> h_input = d_input;
c2h::host_vector<type> h_expected(1);
h_expected[0] = std::accumulate(h_input.begin(), h_input.end(), type{init_value}, cuda::minimum<>{});
c2h::host_vector<type> h_input = d_input;
c2h::host_vector<type> h_expected(1);
h_expected[0] = std::accumulate(h_input.begin(), h_input.end(), type{init_value}, cuda::std::bit_xor<type>{});

c2h::host_vector<type> h_output = d_output;
REQUIRE(h_expected == h_output);
c2h::host_vector<type> h_output = d_output;
REQUIRE(h_expected == h_output);
}

SECTION("logical or")
{
c2h::device_vector<type> d_output(1);

init_t init_value{};

auto error = cub::DeviceReduce::Reduce(
d_input.begin(), d_output.begin(), num_items, cuda::std::logical_or<>{}, init_value, env);
REQUIRE(error == cudaSuccess);

c2h::host_vector<type> h_input = d_input;
c2h::host_vector<type> h_expected(1);
h_expected[0] = std::accumulate(h_input.begin(), h_input.end(), type{init_value}, cuda::std::logical_or<>{});

c2h::host_vector<type> h_output = d_output;
REQUIRE(h_expected == h_output);
}
}

SECTION("bitwise xor")
SECTION("minimum")
{
c2h::device_vector<type> d_output(1);

init_t init_value{};
init_t init_value{cuda::std::numeric_limits<init_t>::max()};

auto error =
cub::DeviceReduce::Reduce(d_input.begin(), d_output.begin(), num_items, cuda::std::bit_xor<>{}, init_value, env);
cub::DeviceReduce::Reduce(d_input.begin(), d_output.begin(), num_items, cuda::minimum<init_t>{}, init_value, env);
REQUIRE(error == cudaSuccess);

c2h::host_vector<type> h_input = d_input;
c2h::host_vector<type> h_expected(1);
h_expected[0] = std::accumulate(h_input.begin(), h_input.end(), type{init_value}, cuda::std::bit_xor<type>{});
h_expected[0] = std::accumulate(h_input.begin(), h_input.end(), type{init_value}, cuda::minimum<>{});

c2h::host_vector<type> h_output = d_output;
REQUIRE(h_expected == h_output);
}

SECTION("logical or")
SECTION("maximum")
{
c2h::device_vector<type> d_output(1);

init_t init_value{};
init_t init_value{cuda::std::numeric_limits<init_t>::min()};

auto error = cub::DeviceReduce::Reduce(
d_input.begin(), d_output.begin(), num_items, cuda::std::logical_or<>{}, init_value, env);
auto error =
cub::DeviceReduce::Reduce(d_input.begin(), d_output.begin(), num_items, cuda::maximum<init_t>{}, init_value, env);
REQUIRE(error == cudaSuccess);

c2h::host_vector<type> h_input = d_input;
c2h::host_vector<type> h_expected(1);
h_expected[0] = std::accumulate(h_input.begin(), h_input.end(), type{init_value}, cuda::std::logical_or<>{});
h_expected[0] = std::accumulate(h_input.begin(), h_input.end(), type{init_value}, cuda::maximum<>{});

c2h::host_vector<type> h_output = d_output;
REQUIRE(h_expected == h_output);
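The new minimum and maximum sections compare the float/double device result directly against a host std::accumulate reference, which works because min/max, unlike addition, yields the same value for every grouping of the inputs; that is also the property the header change relies on when routing these cases to the run-to-run path. A small host-side illustration, not part of the PR and assuming no NaNs and no fast-math reassociation:

#include <algorithm>
#include <cstdio>

int main()
{
  const float a = 1e8f, b = -1e8f, c = 1.0f;

  // Reassociating a float sum changes the rounded result...
  const float sum_lr = (a + b) + c; // 1.0f
  const float sum_rl = a + (b + c); // 0.0f, since b + c rounds to -1e8f
  std::printf("sum: %g vs %g\n", sum_lr, sum_rl);

  // ...but reassociating a min (or max) does not.
  const float min_lr = std::min(std::min(a, b), c);
  const float min_rl = std::min(a, std::min(b, c));
  std::printf("min: %g vs %g\n", min_lr, min_rl); // identical
  return 0;
}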