Skip to content

Commit

Permalink
Clean up detail utility functions (#359)
Browse files Browse the repository at this point in the history
This PR

- adds `grid_stride` and `global_thread_id` utilities in
`detail/utility/cuda.cuh` (inspired by similar work in libcudf)
- adds `compute_grid_size` helper function in `detail/utility/cuda.hpp`
- renames `SDIV` as `ceiling_div` and moves it to
`detail/utility/math.hpp`

---------

Co-authored-by: Daniel Jünger <[email protected]>
  • Loading branch information
PointKernel and sleeepyjack authored Sep 5, 2023
1 parent 9b4ebaf commit 7c76a12
Show file tree
Hide file tree
Showing 16 changed files with 197 additions and 161 deletions.
5 changes: 2 additions & 3 deletions benchmarks/hash_bench.cu
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@

#include <defaults.hpp>

#include <cuco/detail/utils.hpp>
#include <cuco/hash_functions.cuh>

#include <nvbench/nvbench.cuh>
Expand Down Expand Up @@ -70,7 +69,7 @@ void hash_eval(nvbench::state& state, nvbench::type_list<Hash>)
bool const materialize_result = false;
constexpr auto block_size = 128;
auto const num_keys = state.get_int64_or_default("NumInputs", cuco::benchmark::defaults::N * 10);
auto const grid_size = SDIV(num_keys, block_size * 16);
auto const grid_size = (num_keys + block_size * 16 - 1) / block_size * 16;

thrust::device_vector<typename Hash::result_type> hash_values((materialize_result) ? num_keys
: 1);
Expand Down Expand Up @@ -98,4 +97,4 @@ NVBENCH_BENCH_TYPES(
cuco::murmurhash3_fmix_64<nvbench::int64_t>>))
.set_name("hash_function_eval")
.set_type_axes_names({"Hash"})
.set_max_noise(cuco::benchmark::defaults::MAX_NOISE);
.set_max_noise(cuco::benchmark::defaults::MAX_NOISE);
23 changes: 11 additions & 12 deletions include/cuco/detail/common_kernels.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
*/
#pragma once

#include <cuco/detail/utils.hpp>
#include <cuco/detail/utility/cuda.cuh>

#include <cub/block/block_reduce.cuh>

Expand Down Expand Up @@ -71,8 +71,8 @@ __global__ void insert_if_n(InputIterator first,
__shared__ typename BlockReduce::TempStorage temp_storage;
typename Ref::size_type thread_num_successes = 0;

cuco::detail::index_type const loop_stride = gridDim.x * BlockSize / CGSize;
cuco::detail::index_type idx = (BlockSize * blockIdx.x + threadIdx.x) / CGSize;
auto const loop_stride = cuco::detail::grid_stride() / CGSize;
auto idx = cuco::detail::global_thread_id() / CGSize;

while (idx < n) {
if (pred(*(stencil + idx))) {
Expand Down Expand Up @@ -129,8 +129,8 @@ template <int32_t CGSize,
__global__ void insert_if_n(
InputIterator first, cuco::detail::index_type n, StencilIt stencil, Predicate pred, Ref ref)
{
cuco::detail::index_type const loop_stride = gridDim.x * BlockSize / CGSize;
cuco::detail::index_type idx = (BlockSize * blockIdx.x + threadIdx.x) / CGSize;
auto const loop_stride = cuco::detail::grid_stride() / CGSize;
auto idx = cuco::detail::global_thread_id() / CGSize;

while (idx < n) {
if (pred(*(stencil + idx))) {
Expand Down Expand Up @@ -188,11 +188,10 @@ __global__ void contains_if_n(InputIt first,
{
namespace cg = cooperative_groups;

auto const block = cg::this_thread_block();
auto const thread_idx = block.thread_rank();

cuco::detail::index_type const loop_stride = gridDim.x * BlockSize / CGSize;
cuco::detail::index_type idx = (BlockSize * blockIdx.x + threadIdx.x) / CGSize;
auto const block = cg::this_thread_block();
auto const thread_idx = block.thread_rank();
auto const loop_stride = cuco::detail::grid_stride() / CGSize;
auto idx = cuco::detail::global_thread_id() / CGSize;

__shared__ bool output_buffer[BlockSize / CGSize];

Expand Down Expand Up @@ -239,8 +238,8 @@ __global__ void size(StorageRef storage, Predicate is_filled, AtomicT* count)
{
using size_type = typename StorageRef::size_type;

cuco::detail::index_type const loop_stride = gridDim.x * BlockSize;
cuco::detail::index_type idx = BlockSize * blockIdx.x + threadIdx.x;
auto const loop_stride = cuco::detail::grid_stride();
auto idx = cuco::detail::global_thread_id();

size_type thread_count = 0;
auto const n = storage.num_windows();
Expand Down
5 changes: 3 additions & 2 deletions include/cuco/detail/extent/extent.inl
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@

#include <cuco/detail/error.hpp>
#include <cuco/detail/prime.hpp> // TODO move to detail/extent/
#include <cuco/detail/utility/math.hpp>
#include <cuco/detail/utils.hpp>
#include <cuco/utility/fast_int.cuh>

Expand Down Expand Up @@ -80,8 +81,8 @@ template <int32_t CGSize, int32_t WindowSize, typename SizeType, std::size_t N>
(static_cast<uint64_t>(std::numeric_limits<SizeType>::max()) < max_prime)
? std::numeric_limits<SizeType>::max()
: static_cast<SizeType>(max_prime);
auto const size =
SDIV(std::max(static_cast<SizeType>(ext), static_cast<SizeType>(1)), CGSize * WindowSize);
auto const size = cuco::detail::int_div_ceil(
std::max(static_cast<SizeType>(ext), static_cast<SizeType>(1)), CGSize * WindowSize);
if (size > max_value) { CUCO_FAIL("Invalid input extent"); }

if constexpr (N == dynamic_extent) {
Expand Down
58 changes: 22 additions & 36 deletions include/cuco/detail/open_addressing_impl.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
#include <cuco/detail/common_functors.cuh>
#include <cuco/detail/common_kernels.cuh>
#include <cuco/detail/storage/counter_storage.cuh>
#include <cuco/detail/tuning.cuh>
#include <cuco/detail/utility/cuda.hpp>
#include <cuco/extent.cuh>
#include <cuco/probing_scheme.cuh>
#include <cuco/storage.cuh>
Expand Down Expand Up @@ -187,13 +187,11 @@ class open_addressing_impl {
detail::counter_storage<size_type, thread_scope, allocator_type>{this->allocator()};
counter.reset(stream);

auto const grid_size =
(cg_size * num_keys + detail::CUCO_DEFAULT_STRIDE * detail::CUCO_DEFAULT_BLOCK_SIZE - 1) /
(detail::CUCO_DEFAULT_STRIDE * detail::CUCO_DEFAULT_BLOCK_SIZE);
auto const grid_size = cuco::detail::grid_size(num_keys, cg_size);

auto const always_true = thrust::constant_iterator<bool>{true};
detail::insert_if_n<cg_size, detail::CUCO_DEFAULT_BLOCK_SIZE>
<<<grid_size, detail::CUCO_DEFAULT_BLOCK_SIZE, 0, stream>>>(
detail::insert_if_n<cg_size, cuco::detail::default_block_size()>
<<<grid_size, cuco::detail::default_block_size(), 0, stream>>>(
first, num_keys, always_true, thrust::identity{}, counter.data(), container_ref);

return counter.load_to_host(stream);
Expand All @@ -218,13 +216,11 @@ class open_addressing_impl {
auto const num_keys = cuco::detail::distance(first, last);
if (num_keys == 0) { return; }

auto const grid_size =
(cg_size * num_keys + detail::CUCO_DEFAULT_STRIDE * detail::CUCO_DEFAULT_BLOCK_SIZE - 1) /
(detail::CUCO_DEFAULT_STRIDE * detail::CUCO_DEFAULT_BLOCK_SIZE);
auto const grid_size = cuco::detail::grid_size(num_keys, cg_size);

auto const always_true = thrust::constant_iterator<bool>{true};
detail::insert_if_n<cg_size, detail::CUCO_DEFAULT_BLOCK_SIZE>
<<<grid_size, detail::CUCO_DEFAULT_BLOCK_SIZE, 0, stream>>>(
detail::insert_if_n<cg_size, cuco::detail::default_block_size()>
<<<grid_size, cuco::detail::default_block_size(), 0, stream>>>(
first, num_keys, always_true, thrust::identity{}, container_ref);
}

Expand Down Expand Up @@ -269,12 +265,10 @@ class open_addressing_impl {
detail::counter_storage<size_type, thread_scope, allocator_type>{this->allocator()};
counter.reset(stream);

auto const grid_size =
(cg_size * num_keys + detail::CUCO_DEFAULT_STRIDE * detail::CUCO_DEFAULT_BLOCK_SIZE - 1) /
(detail::CUCO_DEFAULT_STRIDE * detail::CUCO_DEFAULT_BLOCK_SIZE);
auto const grid_size = cuco::detail::grid_size(num_keys, cg_size);

detail::insert_if_n<cg_size, detail::CUCO_DEFAULT_BLOCK_SIZE>
<<<grid_size, detail::CUCO_DEFAULT_BLOCK_SIZE, 0, stream>>>(
detail::insert_if_n<cg_size, cuco::detail::default_block_size()>
<<<grid_size, cuco::detail::default_block_size(), 0, stream>>>(
first, num_keys, stencil, pred, counter.data(), container_ref);

return counter.load_to_host(stream);
Expand Down Expand Up @@ -313,12 +307,10 @@ class open_addressing_impl {
auto const num_keys = cuco::detail::distance(first, last);
if (num_keys == 0) { return; }

auto const grid_size =
(cg_size * num_keys + detail::CUCO_DEFAULT_STRIDE * detail::CUCO_DEFAULT_BLOCK_SIZE - 1) /
(detail::CUCO_DEFAULT_STRIDE * detail::CUCO_DEFAULT_BLOCK_SIZE);
auto const grid_size = cuco::detail::grid_size(num_keys, cg_size);

detail::insert_if_n<cg_size, detail::CUCO_DEFAULT_BLOCK_SIZE>
<<<grid_size, detail::CUCO_DEFAULT_BLOCK_SIZE, 0, stream>>>(
detail::insert_if_n<cg_size, cuco::detail::default_block_size()>
<<<grid_size, cuco::detail::default_block_size(), 0, stream>>>(
first, num_keys, stencil, pred, container_ref);
}

Expand Down Expand Up @@ -346,13 +338,11 @@ class open_addressing_impl {
auto const num_keys = cuco::detail::distance(first, last);
if (num_keys == 0) { return; }

auto const grid_size =
(cg_size * num_keys + detail::CUCO_DEFAULT_STRIDE * detail::CUCO_DEFAULT_BLOCK_SIZE - 1) /
(detail::CUCO_DEFAULT_STRIDE * detail::CUCO_DEFAULT_BLOCK_SIZE);
auto const grid_size = cuco::detail::grid_size(num_keys, cg_size);

auto const always_true = thrust::constant_iterator<bool>{true};
detail::contains_if_n<cg_size, detail::CUCO_DEFAULT_BLOCK_SIZE>
<<<grid_size, detail::CUCO_DEFAULT_BLOCK_SIZE, 0, stream>>>(
detail::contains_if_n<cg_size, cuco::detail::default_block_size()>
<<<grid_size, cuco::detail::default_block_size(), 0, stream>>>(
first, num_keys, always_true, thrust::identity{}, output_begin, container_ref);
}

Expand Down Expand Up @@ -397,12 +387,10 @@ class open_addressing_impl {
auto const num_keys = cuco::detail::distance(first, last);
if (num_keys == 0) { return; }

auto const grid_size =
(cg_size * num_keys + detail::CUCO_DEFAULT_STRIDE * detail::CUCO_DEFAULT_BLOCK_SIZE - 1) /
(detail::CUCO_DEFAULT_STRIDE * detail::CUCO_DEFAULT_BLOCK_SIZE);
auto const grid_size = cuco::detail::grid_size(num_keys, cg_size);

detail::contains_if_n<cg_size, detail::CUCO_DEFAULT_BLOCK_SIZE>
<<<grid_size, detail::CUCO_DEFAULT_BLOCK_SIZE, 0, stream>>>(
detail::contains_if_n<cg_size, cuco::detail::default_block_size()>
<<<grid_size, cuco::detail::default_block_size(), 0, stream>>>(
first, num_keys, stencil, pred, output_begin, container_ref);
}

Expand Down Expand Up @@ -489,14 +477,12 @@ class open_addressing_impl {
detail::counter_storage<size_type, thread_scope, allocator_type>{this->allocator()};
counter.reset(stream);

auto const grid_size =
(storage_.num_windows() + detail::CUCO_DEFAULT_STRIDE * detail::CUCO_DEFAULT_BLOCK_SIZE - 1) /
(detail::CUCO_DEFAULT_STRIDE * detail::CUCO_DEFAULT_BLOCK_SIZE);
auto const grid_size = cuco::detail::grid_size(storage_.num_windows());

// TODO: custom kernel to be replaced by cub::DeviceReduce::Sum when cub version is bumped to
// v2.1.0
detail::size<detail::CUCO_DEFAULT_BLOCK_SIZE>
<<<grid_size, detail::CUCO_DEFAULT_BLOCK_SIZE, 0, stream>>>(
detail::size<cuco::detail::default_block_size()>
<<<grid_size, cuco::detail::default_block_size(), 0, stream>>>(
storage_.ref(), is_filled, counter.data());

return counter.load_to_host(stream);
Expand Down
4 changes: 2 additions & 2 deletions include/cuco/detail/prime.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@

#pragma once

#include <cuco/detail/utils.hpp>
#include <cuco/detail/utility/math.hpp>

#include <algorithm>
#include <array>
Expand Down Expand Up @@ -20154,7 +20154,7 @@ constexpr T get_valid_capacity(T capacity) noexcept
if constexpr (not uses_vector_load) { return cg_size; }
}();

auto const c = SDIV(capacity, stride);
auto const c = int_div_ceil(capacity, stride);
auto const min_prime = std::lower_bound(primes.begin(), primes.end(), c);
return *min_prime * stride;
}
Expand Down
14 changes: 7 additions & 7 deletions include/cuco/detail/static_map/kernels.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
#pragma once

#include <cuco/detail/bitwise_compare.cuh>
#include <cuco/detail/utils.hpp>
#include <cuco/detail/utility/cuda.cuh>

#include <cub/block/block_reduce.cuh>

Expand Down Expand Up @@ -50,8 +50,8 @@ namespace detail {
template <int32_t CGSize, int32_t BlockSize, typename InputIterator, typename Ref>
__global__ void insert_or_assign(InputIterator first, cuco::detail::index_type n, Ref ref)
{
cuco::detail::index_type const loop_stride = gridDim.x * BlockSize / CGSize;
cuco::detail::index_type idx = (BlockSize * blockIdx.x + threadIdx.x) / CGSize;
auto const loop_stride = cuco::detail::grid_stride() / CGSize;
auto idx = cuco::detail::global_thread_id() / CGSize;

while (idx < n) {
typename Ref::value_type const insert_pair{*(first + idx)};
Expand Down Expand Up @@ -91,11 +91,11 @@ __global__ void find(InputIt first, cuco::detail::index_type n, OutputIt output_
{
namespace cg = cooperative_groups;

auto const block = cg::this_thread_block();
auto const thread_idx = block.thread_rank();
auto const block = cg::this_thread_block();
auto const thread_idx = block.thread_rank();
auto const loop_stride = cuco::detail::grid_stride() / CGSize;
auto idx = cuco::detail::global_thread_id() / CGSize;

cuco::detail::index_type const loop_stride = gridDim.x * BlockSize / CGSize;
cuco::detail::index_type idx = (BlockSize * blockIdx.x + threadIdx.x) / CGSize;
__shared__ typename Ref::mapped_type output_buffer[BlockSize / CGSize];

while (idx - thread_idx < n) { // the whole thread block falls into the same iteration
Expand Down
18 changes: 7 additions & 11 deletions include/cuco/detail/static_map/static_map.inl
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
#include <cuco/cuda_stream_ref.hpp>
#include <cuco/detail/static_map/functors.cuh>
#include <cuco/detail/static_map/kernels.cuh>
#include <cuco/detail/tuning.cuh>
#include <cuco/detail/utility/cuda.hpp>
#include <cuco/detail/utils.hpp>
#include <cuco/operator.hpp>
#include <cuco/static_map_ref.cuh>
Expand Down Expand Up @@ -176,12 +176,10 @@ void static_map<Key, T, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Stora
auto const num = cuco::detail::distance(first, last);
if (num == 0) { return; }

auto const grid_size =
(cg_size * num + detail::CUCO_DEFAULT_STRIDE * detail::CUCO_DEFAULT_BLOCK_SIZE - 1) /
(detail::CUCO_DEFAULT_STRIDE * detail::CUCO_DEFAULT_BLOCK_SIZE);
auto const grid_size = cuco::detail::grid_size(num, cg_size);

static_map_ns::detail::insert_or_assign<cg_size, detail::CUCO_DEFAULT_BLOCK_SIZE>
<<<grid_size, detail::CUCO_DEFAULT_BLOCK_SIZE, 0, stream>>>(
static_map_ns::detail::insert_or_assign<cg_size, cuco::detail::default_block_size()>
<<<grid_size, cuco::detail::default_block_size(), 0, stream>>>(
first, num, ref(op::insert_or_assign));
}

Expand Down Expand Up @@ -288,12 +286,10 @@ void static_map<Key, T, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Stora
auto const num_keys = cuco::detail::distance(first, last);
if (num_keys == 0) { return; }

auto const grid_size =
(cg_size * num_keys + detail::CUCO_DEFAULT_STRIDE * detail::CUCO_DEFAULT_BLOCK_SIZE - 1) /
(detail::CUCO_DEFAULT_STRIDE * detail::CUCO_DEFAULT_BLOCK_SIZE);
auto const grid_size = cuco::detail::grid_size(num_keys, cg_size);

static_map_ns::detail::find<cg_size, detail::CUCO_DEFAULT_BLOCK_SIZE>
<<<grid_size, detail::CUCO_DEFAULT_BLOCK_SIZE, 0, stream>>>(
static_map_ns::detail::find<cg_size, cuco::detail::default_block_size()>
<<<grid_size, cuco::detail::default_block_size(), 0, stream>>>(
first, num_keys, output_begin, ref(op::find));
}

Expand Down
Loading

0 comments on commit 7c76a12

Please sign in to comment.