Skip to content

Commit

Permalink
Merge branch 'trie-bitvector' of github.com:amukkara/cuCollections in…
Browse files Browse the repository at this point in the history
…to trie-bitvector
  • Loading branch information
amukkara committed Sep 6, 2023
2 parents a629730 + 37ebd0c commit cedc5d4
Show file tree
Hide file tree
Showing 27 changed files with 698 additions and 244 deletions.
5 changes: 2 additions & 3 deletions benchmarks/hash_bench.cu
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@

#include <defaults.hpp>

#include <cuco/detail/utils.hpp>
#include <cuco/hash_functions.cuh>

#include <nvbench/nvbench.cuh>
Expand Down Expand Up @@ -70,7 +69,7 @@ void hash_eval(nvbench::state& state, nvbench::type_list<Hash>)
bool const materialize_result = false;
constexpr auto block_size = 128;
auto const num_keys = state.get_int64_or_default("NumInputs", cuco::benchmark::defaults::N * 10);
auto const grid_size = SDIV(num_keys, block_size * 16);
auto const grid_size = (num_keys + block_size * 16 - 1) / block_size * 16;

thrust::device_vector<typename Hash::result_type> hash_values((materialize_result) ? num_keys
: 1);
Expand Down Expand Up @@ -98,4 +97,4 @@ NVBENCH_BENCH_TYPES(
cuco::murmurhash3_fmix_64<nvbench::int64_t>>))
.set_name("hash_function_eval")
.set_type_axes_names({"Hash"})
.set_max_noise(cuco::benchmark::defaults::MAX_NOISE);
.set_max_noise(cuco::benchmark::defaults::MAX_NOISE);
23 changes: 11 additions & 12 deletions include/cuco/detail/common_kernels.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
*/
#pragma once

#include <cuco/detail/utils.hpp>
#include <cuco/detail/utility/cuda.cuh>

#include <cub/block/block_reduce.cuh>

Expand Down Expand Up @@ -71,8 +71,8 @@ __global__ void insert_if_n(InputIterator first,
__shared__ typename BlockReduce::TempStorage temp_storage;
typename Ref::size_type thread_num_successes = 0;

cuco::detail::index_type const loop_stride = gridDim.x * BlockSize / CGSize;
cuco::detail::index_type idx = (BlockSize * blockIdx.x + threadIdx.x) / CGSize;
auto const loop_stride = cuco::detail::grid_stride() / CGSize;
auto idx = cuco::detail::global_thread_id() / CGSize;

while (idx < n) {
if (pred(*(stencil + idx))) {
Expand Down Expand Up @@ -129,8 +129,8 @@ template <int32_t CGSize,
__global__ void insert_if_n(
InputIterator first, cuco::detail::index_type n, StencilIt stencil, Predicate pred, Ref ref)
{
cuco::detail::index_type const loop_stride = gridDim.x * BlockSize / CGSize;
cuco::detail::index_type idx = (BlockSize * blockIdx.x + threadIdx.x) / CGSize;
auto const loop_stride = cuco::detail::grid_stride() / CGSize;
auto idx = cuco::detail::global_thread_id() / CGSize;

while (idx < n) {
if (pred(*(stencil + idx))) {
Expand Down Expand Up @@ -188,11 +188,10 @@ __global__ void contains_if_n(InputIt first,
{
namespace cg = cooperative_groups;

auto const block = cg::this_thread_block();
auto const thread_idx = block.thread_rank();

cuco::detail::index_type const loop_stride = gridDim.x * BlockSize / CGSize;
cuco::detail::index_type idx = (BlockSize * blockIdx.x + threadIdx.x) / CGSize;
auto const block = cg::this_thread_block();
auto const thread_idx = block.thread_rank();
auto const loop_stride = cuco::detail::grid_stride() / CGSize;
auto idx = cuco::detail::global_thread_id() / CGSize;

__shared__ bool output_buffer[BlockSize / CGSize];

Expand Down Expand Up @@ -239,8 +238,8 @@ __global__ void size(StorageRef storage, Predicate is_filled, AtomicT* count)
{
using size_type = typename StorageRef::size_type;

cuco::detail::index_type const loop_stride = gridDim.x * BlockSize;
cuco::detail::index_type idx = BlockSize * blockIdx.x + threadIdx.x;
auto const loop_stride = cuco::detail::grid_stride();
auto idx = cuco::detail::global_thread_id();

size_type thread_count = 0;
auto const n = storage.num_windows();
Expand Down
5 changes: 3 additions & 2 deletions include/cuco/detail/extent/extent.inl
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@

#include <cuco/detail/error.hpp>
#include <cuco/detail/prime.hpp> // TODO move to detail/extent/
#include <cuco/detail/utility/math.hpp>
#include <cuco/detail/utils.hpp>
#include <cuco/utility/fast_int.cuh>

Expand Down Expand Up @@ -80,8 +81,8 @@ template <int32_t CGSize, int32_t WindowSize, typename SizeType, std::size_t N>
(static_cast<uint64_t>(std::numeric_limits<SizeType>::max()) < max_prime)
? std::numeric_limits<SizeType>::max()
: static_cast<SizeType>(max_prime);
auto const size =
SDIV(std::max(static_cast<SizeType>(ext), static_cast<SizeType>(1)), CGSize * WindowSize);
auto const size = cuco::detail::int_div_ceil(
std::max(static_cast<SizeType>(ext), static_cast<SizeType>(1)), CGSize * WindowSize);
if (size > max_value) { CUCO_FAIL("Invalid input extent"); }

if constexpr (N == dynamic_extent) {
Expand Down
66 changes: 26 additions & 40 deletions include/cuco/detail/open_addressing_impl.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
#include <cuco/detail/common_functors.cuh>
#include <cuco/detail/common_kernels.cuh>
#include <cuco/detail/storage/counter_storage.cuh>
#include <cuco/detail/tuning.cuh>
#include <cuco/detail/utility/cuda.hpp>
#include <cuco/extent.cuh>
#include <cuco/probing_scheme.cuh>
#include <cuco/storage.cuh>
Expand Down Expand Up @@ -187,20 +187,18 @@ class open_addressing_impl {
detail::counter_storage<size_type, thread_scope, allocator_type>{this->allocator()};
counter.reset(stream);

auto const grid_size =
(cg_size * num_keys + detail::CUCO_DEFAULT_STRIDE * detail::CUCO_DEFAULT_BLOCK_SIZE - 1) /
(detail::CUCO_DEFAULT_STRIDE * detail::CUCO_DEFAULT_BLOCK_SIZE);
auto const grid_size = cuco::detail::grid_size(num_keys, cg_size);

auto const always_true = thrust::constant_iterator<bool>{true};
detail::insert_if_n<cg_size, detail::CUCO_DEFAULT_BLOCK_SIZE>
<<<grid_size, detail::CUCO_DEFAULT_BLOCK_SIZE, 0, stream>>>(
detail::insert_if_n<cg_size, cuco::detail::default_block_size()>
<<<grid_size, cuco::detail::default_block_size(), 0, stream>>>(
first, num_keys, always_true, thrust::identity{}, counter.data(), container_ref);

return counter.load_to_host(stream);
}

/**
* @brief Asynchonously inserts all keys in the range `[first, last)`.
* @brief Asynchronously inserts all keys in the range `[first, last)`.
*
* @tparam InputIt Device accessible random access input iterator where
* <tt>std::is_convertible<std::iterator_traits<InputIt>::value_type,
Expand All @@ -218,13 +216,11 @@ class open_addressing_impl {
auto const num_keys = cuco::detail::distance(first, last);
if (num_keys == 0) { return; }

auto const grid_size =
(cg_size * num_keys + detail::CUCO_DEFAULT_STRIDE * detail::CUCO_DEFAULT_BLOCK_SIZE - 1) /
(detail::CUCO_DEFAULT_STRIDE * detail::CUCO_DEFAULT_BLOCK_SIZE);
auto const grid_size = cuco::detail::grid_size(num_keys, cg_size);

auto const always_true = thrust::constant_iterator<bool>{true};
detail::insert_if_n<cg_size, detail::CUCO_DEFAULT_BLOCK_SIZE>
<<<grid_size, detail::CUCO_DEFAULT_BLOCK_SIZE, 0, stream>>>(
detail::insert_if_n<cg_size, cuco::detail::default_block_size()>
<<<grid_size, cuco::detail::default_block_size(), 0, stream>>>(
first, num_keys, always_true, thrust::identity{}, container_ref);
}

Expand Down Expand Up @@ -269,19 +265,17 @@ class open_addressing_impl {
detail::counter_storage<size_type, thread_scope, allocator_type>{this->allocator()};
counter.reset(stream);

auto const grid_size =
(cg_size * num_keys + detail::CUCO_DEFAULT_STRIDE * detail::CUCO_DEFAULT_BLOCK_SIZE - 1) /
(detail::CUCO_DEFAULT_STRIDE * detail::CUCO_DEFAULT_BLOCK_SIZE);
auto const grid_size = cuco::detail::grid_size(num_keys, cg_size);

detail::insert_if_n<cg_size, detail::CUCO_DEFAULT_BLOCK_SIZE>
<<<grid_size, detail::CUCO_DEFAULT_BLOCK_SIZE, 0, stream>>>(
detail::insert_if_n<cg_size, cuco::detail::default_block_size()>
<<<grid_size, cuco::detail::default_block_size(), 0, stream>>>(
first, num_keys, stencil, pred, counter.data(), container_ref);

return counter.load_to_host(stream);
}

/**
* @brief Asynchonously inserts keys in the range `[first, last)` if `pred` of the corresponding
* @brief Asynchronously inserts keys in the range `[first, last)` if `pred` of the corresponding
* stencil returns true.
*
* @note The key `*(first + i)` is inserted if `pred( *(stencil + i) )` returns true.
Expand Down Expand Up @@ -313,17 +307,15 @@ class open_addressing_impl {
auto const num_keys = cuco::detail::distance(first, last);
if (num_keys == 0) { return; }

auto const grid_size =
(cg_size * num_keys + detail::CUCO_DEFAULT_STRIDE * detail::CUCO_DEFAULT_BLOCK_SIZE - 1) /
(detail::CUCO_DEFAULT_STRIDE * detail::CUCO_DEFAULT_BLOCK_SIZE);
auto const grid_size = cuco::detail::grid_size(num_keys, cg_size);

detail::insert_if_n<cg_size, detail::CUCO_DEFAULT_BLOCK_SIZE>
<<<grid_size, detail::CUCO_DEFAULT_BLOCK_SIZE, 0, stream>>>(
detail::insert_if_n<cg_size, cuco::detail::default_block_size()>
<<<grid_size, cuco::detail::default_block_size(), 0, stream>>>(
first, num_keys, stencil, pred, container_ref);
}

/**
* @brief Asynchonously indicates whether the keys in the range `[first, last)` are contained in
* @brief Asynchronously indicates whether the keys in the range `[first, last)` are contained in
* the container.
*
* @tparam InputIt Device accessible input iterator
Expand All @@ -346,18 +338,16 @@ class open_addressing_impl {
auto const num_keys = cuco::detail::distance(first, last);
if (num_keys == 0) { return; }

auto const grid_size =
(cg_size * num_keys + detail::CUCO_DEFAULT_STRIDE * detail::CUCO_DEFAULT_BLOCK_SIZE - 1) /
(detail::CUCO_DEFAULT_STRIDE * detail::CUCO_DEFAULT_BLOCK_SIZE);
auto const grid_size = cuco::detail::grid_size(num_keys, cg_size);

auto const always_true = thrust::constant_iterator<bool>{true};
detail::contains_if_n<cg_size, detail::CUCO_DEFAULT_BLOCK_SIZE>
<<<grid_size, detail::CUCO_DEFAULT_BLOCK_SIZE, 0, stream>>>(
detail::contains_if_n<cg_size, cuco::detail::default_block_size()>
<<<grid_size, cuco::detail::default_block_size(), 0, stream>>>(
first, num_keys, always_true, thrust::identity{}, output_begin, container_ref);
}

/**
* @brief Asynchonously indicates whether the keys in the range `[first, last)` are contained in
* @brief Asynchronously indicates whether the keys in the range `[first, last)` are contained in
* the container if `pred` of the corresponding stencil returns true.
*
* @note If `pred( *(stencil + i) )` is true, stores `true` or `false` to `(output_begin + i)`
Expand Down Expand Up @@ -397,12 +387,10 @@ class open_addressing_impl {
auto const num_keys = cuco::detail::distance(first, last);
if (num_keys == 0) { return; }

auto const grid_size =
(cg_size * num_keys + detail::CUCO_DEFAULT_STRIDE * detail::CUCO_DEFAULT_BLOCK_SIZE - 1) /
(detail::CUCO_DEFAULT_STRIDE * detail::CUCO_DEFAULT_BLOCK_SIZE);
auto const grid_size = cuco::detail::grid_size(num_keys, cg_size);

detail::contains_if_n<cg_size, detail::CUCO_DEFAULT_BLOCK_SIZE>
<<<grid_size, detail::CUCO_DEFAULT_BLOCK_SIZE, 0, stream>>>(
detail::contains_if_n<cg_size, cuco::detail::default_block_size()>
<<<grid_size, cuco::detail::default_block_size(), 0, stream>>>(
first, num_keys, stencil, pred, output_begin, container_ref);
}

Expand Down Expand Up @@ -489,14 +477,12 @@ class open_addressing_impl {
detail::counter_storage<size_type, thread_scope, allocator_type>{this->allocator()};
counter.reset(stream);

auto const grid_size =
(storage_.num_windows() + detail::CUCO_DEFAULT_STRIDE * detail::CUCO_DEFAULT_BLOCK_SIZE - 1) /
(detail::CUCO_DEFAULT_STRIDE * detail::CUCO_DEFAULT_BLOCK_SIZE);
auto const grid_size = cuco::detail::grid_size(storage_.num_windows());

// TODO: custom kernel to be replaced by cub::DeviceReduce::Sum when cub version is bumped to
// v2.1.0
detail::size<detail::CUCO_DEFAULT_BLOCK_SIZE>
<<<grid_size, detail::CUCO_DEFAULT_BLOCK_SIZE, 0, stream>>>(
detail::size<cuco::detail::default_block_size()>
<<<grid_size, cuco::detail::default_block_size(), 0, stream>>>(
storage_.ref(), is_filled, counter.data());

return counter.load_to_host(stream);
Expand Down
Loading

0 comments on commit cedc5d4

Please sign in to comment.