diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 00d3f64..dcb3241 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -10,6 +10,7 @@ set(example_targets bcht_example custom_types_example iht_example + hash_join ) foreach(target ${example_targets}) diff --git a/examples/hash_join.cu b/examples/hash_join.cu new file mode 100644 index 0000000..7626d6e --- /dev/null +++ b/examples/hash_join.cu @@ -0,0 +1,130 @@ + + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +template +__global__ void join(HashMap read_from, HashMap query_into, HashMap result_table) { + using pair_type = typename HashMap::value_type; + using key_type = typename HashMap::key_type; + using value_type = typename HashMap::mapped_type; + + auto block = cooperative_groups::this_thread_block(); + auto tile = cooperative_groups::tiled_partition(block); + + using queue_type = bght::tile_wide_queue; + + auto thread_id = threadIdx.x + blockIdx.x * blockDim.x; + + const auto capacity = read_from.max_size(); + const auto begin = read_from.begin(); + + const auto sentinel_pair = read_from.get_sentinel_pair(); + + const auto pair = thread_id < capacity + ? (begin + thread_id)->load(cuda::memory_order_relaxed) + : sentinel_pair; + + queue_type work_queue(pair, sentinel_pair, tile); + + while (!work_queue.empty()) { + auto cur_pair = work_queue.front(); + work_queue.pop(); + + auto value = query_into.find(cur_pair.first, tile); + // if the key exist + if (value != sentinel_pair.second) { + // store into the result table the sum of the two values + pair_type new_pair{cur_pair.first, cur_pair.second + value}; + result_table.insert(new_pair, tile); + } + } +} + +template +__global__ void print(HashMap result_table) { + using pair_type = typename HashMap::value_type; + using key_type = typename HashMap::key_type; + using value_type = typename HashMap::mapped_type; + + const auto capacity = result_table.max_size(); + const auto sentinel_key = result_table.get_sentinel_key(); + const auto sentinel_pair = result_table.get_sentinel_pair(); + + auto begin = result_table.begin(); + + auto thread_id = threadIdx.x + blockIdx.x * blockDim.x; + + const auto pair = thread_id < capacity + ? (begin + thread_id)->load(cuda::memory_order_relaxed) + : sentinel_pair; + if (pair.first != sentinel_key) { + printf("result_map[%u] = %u, exepcted %u\n", + pair.first, + pair.second, + pair.first * 2 * 10); + }; +} +int main(int, char**) { + using key_type = uint32_t; + using value_type = uint32_t; + using pair_type = bght::pair; + + std::vector read_from_input_h = { + {1, 10}, {2, 20}, {3, 30}, {4, 40}, {5, 50}}; + std::vector query_into_input_h = { + {3, 30}, {4, 40}, {5, 50}, {6, 60}, {7, 70}}; + + auto invalid_key = std::numeric_limits::max(); + auto invalid_value = std::numeric_limits::max(); + + const float load_factor = 0.7; + + const auto num_read_from_keys = read_from_input_h.size(); + const auto num_query_into_keys = query_into_input_h.size(); + + std::size_t read_from_capacity = double(num_read_from_keys) / load_factor; + std::size_t query_into_capacity = double(num_query_into_keys) / load_factor; + + using hash_map = bght::iht; + + hash_map read_from_map(read_from_capacity, invalid_key, invalid_value); + hash_map query_into_map(query_into_capacity, invalid_key, invalid_value); + + thrust::device_vector read_from_input(read_from_input_h); + thrust::device_vector query_into_input(query_into_input_h); + + // do the insertions which can execute concurrently, but here we use the default stream + bool success = read_from_map.insert(read_from_input.data().get(), + read_from_input.data().get() + num_read_from_keys); + if (!success) { + std::cerr << "Failed to build first map" << std::endl; + } + success = query_into_map.insert(query_into_input.data().get(), + query_into_input.data().get() + query_into_capacity); + if (!success) { + std::cerr << "Failed to build second map" << std::endl; + } + + // build the result + const auto worst_case_capacity = std::min(query_into_capacity, read_from_capacity); + hash_map result_map(worst_case_capacity, invalid_key, invalid_value); + + const uint32_t block_size = 128; + uint32_t num_blocks = (num_read_from_keys + block_size - 1) / block_size; + join<<>>(read_from_map, query_into_map, result_map); + + cuda_try(cudaDeviceSynchronize()); + + std::cout << "Join complete" << std::endl; + + num_blocks = (worst_case_capacity + block_size - 1) / block_size; + print<<>>(result_map); +} \ No newline at end of file diff --git a/include/detail/iht_impl.cuh b/include/detail/iht_impl.cuh index 0fa0be4..64cd353 100644 --- a/include/detail/iht_impl.cuh +++ b/include/detail/iht_impl.cuh @@ -347,7 +347,8 @@ template typename iht::const_iterator -iht::begin() const { + __device__ __host__ + iht::begin() const { return d_table_; } @@ -360,7 +361,8 @@ template typename iht::const_iterator -iht::end() const { + __device__ __host__ + iht::end() const { return d_table_ + capacity_; } @@ -372,8 +374,9 @@ template -typename iht::size_type -iht::max_size() const { +__device__ __host__ + typename iht::size_type + iht::max_size() const { return capacity_; } } // namespace bght diff --git a/include/detail/pair.cuh b/include/detail/pair.cuh index 57cfe07..4434b47 100644 --- a/include/detail/pair.cuh +++ b/include/detail/pair.cuh @@ -32,10 +32,10 @@ struct alignas(detail::pair_alignment()) padded_pair { padded_pair& operator=(padded_pair const&) = default; padded_pair& operator=(padded_pair&&) = default; - __host__ __device__ inline bool operator==(const padded_pair& rhs) { + __host__ __device__ inline bool operator==(const padded_pair& rhs) const { return (this->first == rhs.first) && (this->second == rhs.second); } - __host__ __device__ inline bool operator!=(const padded_pair& rhs) { + __host__ __device__ inline bool operator!=(const padded_pair& rhs) const { return !(*this == rhs); } @@ -57,10 +57,10 @@ struct alignas(detail::pair_alignment()) padded_pair { padded_pair& operator=(padded_pair const&) = default; padded_pair& operator=(padded_pair&&) = default; - __host__ __device__ inline bool operator==(const padded_pair& rhs) { + __host__ __device__ inline bool operator==(const padded_pair& rhs) const { return (this->first == rhs.first) && (this->second == rhs.second); } - __host__ __device__ inline bool operator!=(const padded_pair& rhs) { + __host__ __device__ inline bool operator!=(const padded_pair& rhs) const { return !(*this == rhs); } diff --git a/include/iht.hpp b/include/iht.hpp index e2c196d..ec0d948 100644 --- a/include/iht.hpp +++ b/include/iht.hpp @@ -71,7 +71,7 @@ struct iht { static constexpr auto bucket_size = B; using key_equal = KeyEqual; - using iterator = atomic_pair_type&; + using iterator = atomic_pair_type*; using const_iterator = iterator; /** @@ -194,21 +194,44 @@ struct iht { * * @return const_iterator constant iterator to the first element of the table */ - const_iterator begin() const; + __device__ __host__ const_iterator begin() const; /** * @brief Returns an iterator to the last element of the tables including all invalid * entries. * * @return const_iterator constant iterator to the last element of the table */ - const_iterator end() const; + __device__ __host__ const_iterator end() const; /** * @brief Returns the maximum number of elements the container is able to hold * * @return size_type maximum number of elements including all invalid entries. */ - size_type max_size() const; + __device__ __host__ size_type max_size() const; + + /** + * @brief Get the sentinel key object + * + * @return key_type Sentinel key + */ + __device__ __host__ key_type get_sentinel_key() const { return sentinel_key_; } + + /** + * @brief Get the sentinel value object + * + * @return mapped_type Sentinel value + */ + __device__ __host__ mapped_type get_sentinel_value() const { return sentinel_value_; } + + /** + * @brief Get the sentinel pair object + * + * @return value_type Sentinel pair + */ + __device__ __host__ value_type get_sentinel_pair() const { + return {get_sentinel_key(), get_sentinel_value()}; + } private: template diff --git a/include/tile_wide_queue.hpp b/include/tile_wide_queue.hpp new file mode 100644 index 0000000..1c27d64 --- /dev/null +++ b/include/tile_wide_queue.hpp @@ -0,0 +1,65 @@ +/* + * Copyright 2024 The Regents of the University of California, Davis + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +namespace bght { +template +struct tile_wide_queue { + using value_type = T; + using reference = value_type&; + using const_reference = const value_type&; + using size_type = uint32_t; + + __device__ tile_wide_queue(const T& element, const T& sentinel, const CG& group) + : element_{element}, cg_{group} { + active_lane_ = (element_ != sentinel); + build(); + } + __device__ tile_wide_queue(const T& element, const bool& valid, const CG& group) + : element_{element}, cg_{group} { + active_lane_ = valid; + build(); + } + + __device__ value_type front() { + auto item = cg_.shfl(element_, cur_); + return item; + } + __device__ void pop() { + if (!empty()) { + if (cg_.thread_rank() == cur_) { + active_lane_ = false; + } + build(); + } + } + __device__ [[nodiscard]] bool empty() const { return mask_ == 0; } + __device__ size_type size() const { return __popc(mask_); } + + private: + __device__ void build() { + mask_ = cg_.ballot(active_lane_); + cur_ = __ffs(mask_) - 1; + } + const T& element_; + const CG cg_; + bool active_lane_; + uint32_t mask_; + uint32_t cur_; +}; +} // namespace bght