From b2e55881f40b1fb1e422b095e68f28b5d148ae1a Mon Sep 17 00:00:00 2001
From: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com>
Date: Thu, 30 May 2024 08:58:10 +0530
Subject: [PATCH] [SYCLomatic][NFC] refactor code by creating a separate util
 file for groups (#1784)

Signed-off-by: Wang, Yihan
Co-authored-by: Yihan Wang
Co-authored-by: Dan Hoeflinger <109972525+danhoeflinger@users.noreply.github.com>
---
 clang/lib/DPCT/CMakeLists.txt                 |   2 +
 clang/lib/DPCT/GenHelperFunction.cpp          |   4 +
 clang/lib/DPCT/GenHelperFunction.h            |   1 +
 clang/lib/DPCT/HeaderTypes.inc                |   1 +
 clang/runtime/dpct-rt/CMakeLists.txt          |   1 +
 .../dpct/dpl_extras/dpcpp_extensions.h        | 514 -----------------
 .../dpct-rt/include/dpct/dpl_utils.hpp        |   1 +
 .../dpct-rt/include/dpct/group_utils.hpp      | 533 ++++++++++++++++++
 clang/test/dpct/check_header_files.cpp        |   5 +
 9 files changed, 548 insertions(+), 514 deletions(-)
 create mode 100644 clang/runtime/dpct-rt/include/dpct/group_utils.hpp

diff --git a/clang/lib/DPCT/CMakeLists.txt b/clang/lib/DPCT/CMakeLists.txt
index 8a827847b90c..f168bf409e53 100644
--- a/clang/lib/DPCT/CMakeLists.txt
+++ b/clang/lib/DPCT/CMakeLists.txt
@@ -22,6 +22,7 @@ set(RUNTIME_HEADERS
   ${CMAKE_SOURCE_DIR}/../clang/runtime/dpct-rt/include/dpct/sparse_utils.hpp
   ${CMAKE_SOURCE_DIR}/../clang/runtime/dpct-rt/include/dpct/fft_utils.hpp
   ${CMAKE_SOURCE_DIR}/../clang/runtime/dpct-rt/include/dpct/lapack_utils.hpp
+  ${CMAKE_SOURCE_DIR}/../clang/runtime/dpct-rt/include/dpct/group_utils.hpp
   ${CMAKE_SOURCE_DIR}/../clang/runtime/dpct-rt/include/dpct/dpl_extras/algorithm.h
   ${CMAKE_SOURCE_DIR}/../clang/runtime/dpct-rt/include/dpct/dpl_extras/functional.h
   ${CMAKE_SOURCE_DIR}/../clang/runtime/dpct-rt/include/dpct/dpl_extras/iterators.h
@@ -58,6 +59,7 @@ set(PROCESS_FILES_OUTPUT
   ${CMAKE_BINARY_DIR}/tools/clang/include/clang/DPCT/sparse_utils.hpp.inc
   ${CMAKE_BINARY_DIR}/tools/clang/include/clang/DPCT/fft_utils.hpp.inc
   ${CMAKE_BINARY_DIR}/tools/clang/include/clang/DPCT/lapack_utils.hpp.inc
+  ${CMAKE_BINARY_DIR}/tools/clang/include/clang/DPCT/group_utils.hpp.inc
 )
 
 add_custom_command(
diff --git a/clang/lib/DPCT/GenHelperFunction.cpp b/clang/lib/DPCT/GenHelperFunction.cpp
index 32ca482a1d12..688252a56164 100644
--- a/clang/lib/DPCT/GenHelperFunction.cpp
+++ b/clang/lib/DPCT/GenHelperFunction.cpp
@@ -73,6 +73,9 @@ const std::string FftUtilsAllContentStr =
 const std::string LapackUtilsAllContentStr =
 #include "clang/DPCT/lapack_utils.hpp.inc"
     ;
+const std::string GroupUtilsAllContentStr =
+#include "clang/DPCT/group_utils.hpp.inc"
+    ;
 const std::string DplExtrasAlgorithmAllContentStr =
 #include "clang/DPCT/dpl_extras/algorithm.h.inc"
     ;
@@ -162,6 +165,7 @@ void genHelperFunction(const clang::tooling::UnifiedPath &OutRoot) {
   GENERATE_ALL_FILE_CONTENT(SparseUtils, ".", sparse_utils.hpp)
   GENERATE_ALL_FILE_CONTENT(FftUtils, ".", fft_utils.hpp)
   GENERATE_ALL_FILE_CONTENT(LapackUtils, ".", lapack_utils.hpp)
+  GENERATE_ALL_FILE_CONTENT(GroupUtils, ".", group_utils.hpp)
   GENERATE_ALL_FILE_CONTENT(CodePin, "codepin", codepin.hpp)
   GENERATE_ALL_FILE_CONTENT(CodePinSerializationBasic, "codepin/serialization",
                             basic.hpp)
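Note for reviewers: the *.hpp.inc files consumed above are generated from the
runtime headers by the PROCESS_FILES_OUTPUT custom command in
clang/lib/DPCT/CMakeLists.txt; each one wraps the header text in a string
literal, so the bare #include completes a string definition. A minimal sketch
of the pattern, assuming a raw-string delimiter and layout chosen purely for
illustration (the real generator's output format is not shown in this patch):

#include <string>

// Hypothetical contents of the generated group_utils.hpp.inc (illustrative):
//
//   R"DPCT_DELIM(
//   ...full text of group_utils.hpp...
//   )DPCT_DELIM"
//
// After preprocessing, the definition in GenHelperFunction.cpp reduces to an
// ordinary string constant that embeds the whole helper header in the tool:
const std::string GroupUtilsAllContentStr = R"DPCT_DELIM(
...full text of group_utils.hpp...
)DPCT_DELIM";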
diff --git a/clang/lib/DPCT/GenHelperFunction.h b/clang/lib/DPCT/GenHelperFunction.h
index 6f3718695cfb..d0262848172d 100644
--- a/clang/lib/DPCT/GenHelperFunction.h
+++ b/clang/lib/DPCT/GenHelperFunction.h
@@ -32,6 +32,7 @@ extern const std::string CclUtilsAllContentStr;
 extern const std::string SparseUtilsAllContentStr;
 extern const std::string FftUtilsAllContentStr;
 extern const std::string LapackUtilsAllContentStr;
+extern const std::string GroupUtilsAllContentStr;
 extern const std::string DplExtrasAlgorithmAllContentStr;
 extern const std::string DplExtrasFunctionalAllContentStr;
 extern const std::string DplExtrasIteratorsAllContentStr;
diff --git a/clang/lib/DPCT/HeaderTypes.inc b/clang/lib/DPCT/HeaderTypes.inc
index 00050957ff81..e62256cc320d 100644
--- a/clang/lib/DPCT/HeaderTypes.inc
+++ b/clang/lib/DPCT/HeaderTypes.inc
@@ -65,6 +65,7 @@ DPCT_HEADER(CCL_Utils, "<dpct/ccl_utils.hpp>")
 DPCT_HEADER(BLAS_Utils, "<dpct/blas_utils.hpp>")
 DPCT_HEADER(DNNL_Utils, "<dpct/dnnl_utils.hpp>")
 DPCT_HEADER(LAPACK_Utils, "<dpct/lapack_utils.hpp>")
+DPCT_HEADER(GROUP_Utils, "<dpct/group_utils.hpp>")
 DPCT_HEADER(COMMON_Utils, "<dpct/dpct.hpp>")
 DPCT_HEADER(Atomic, "<dpct/atomic.hpp>")
 DPCT_HEADER(SPBLAS_Utils, "<dpct/sparse_utils.hpp>")
diff --git a/clang/runtime/dpct-rt/CMakeLists.txt b/clang/runtime/dpct-rt/CMakeLists.txt
index 63e9b630d030..50bfff5f055a 100644
--- a/clang/runtime/dpct-rt/CMakeLists.txt
+++ b/clang/runtime/dpct-rt/CMakeLists.txt
@@ -17,6 +17,7 @@ set(dpct_rt_files
   include/dpct/sparse_utils.hpp
   include/dpct/fft_utils.hpp
   include/dpct/lapack_utils.hpp
+  include/dpct/group_utils.hpp
 )
 
 set(dpct_rt_dpstd_files
diff --git a/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h b/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h
index 15b0b1102290..d8472c5c275e 100644
--- a/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h
+++ b/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h
@@ -149,520 +149,6 @@ exclusive_scan(const Item &item, T input, BinaryOperation binary_op,
   return output;
 }
 
-namespace detail {
-
-typedef uint16_t digit_counter_type;
-typedef uint32_t packed_counter_type;
-
-template <int N, int CURRENT_VAL = N, int COUNT = 0> struct log2 {
-  enum { VALUE = log2<N, (CURRENT_VAL >> 1), COUNT + 1>::VALUE };
-};
-
-template <int N, int COUNT> struct log2<N, 0, COUNT> {
-  enum { VALUE = (1 << (COUNT - 1) < N) ? COUNT : COUNT - 1 };
-};
-
-template <int RADIX_BITS, bool DESCENDING = false> class radix_rank {
-public:
-  static size_t get_local_memory_size(size_t group_threads) {
-    return group_threads * PADDED_COUNTER_LANES * sizeof(packed_counter_type);
-  }
-
-  radix_rank(uint8_t *local_memory) : _local_memory(local_memory) {}
-
-  template <typename Item, int VALUES_PER_THREAD>
-  __dpct_inline__ void
-  rank_keys(const Item &item, uint32_t (&keys)[VALUES_PER_THREAD],
-            int (&ranks)[VALUES_PER_THREAD], int current_bit, int num_bits) {
-
-    digit_counter_type thread_prefixes[VALUES_PER_THREAD];
-    digit_counter_type *digit_counters[VALUES_PER_THREAD];
-    digit_counter_type *buffer =
-        reinterpret_cast<digit_counter_type *>(_local_memory);
-
-    reset_local_memory(item);
-
-    item.barrier(sycl::access::fence_space::local_space);
-
-#pragma unroll
-    for (int i = 0; i < VALUES_PER_THREAD; ++i) {
-      uint32_t digit = ::dpct::bfe(keys[i], current_bit, num_bits);
-      uint32_t sub_counter = digit >> LOG_COUNTER_LANES;
-      uint32_t counter_lane = digit & (COUNTER_LANES - 1);
-
-      if (DESCENDING) {
-        sub_counter = PACKING_RATIO - 1 - sub_counter;
-        counter_lane = COUNTER_LANES - 1 - counter_lane;
-      }
-
-      digit_counters[i] =
-          &buffer[counter_lane * item.get_local_range().size() * PACKING_RATIO +
-                  item.get_local_linear_id() * PACKING_RATIO + sub_counter];
-      thread_prefixes[i] = *digit_counters[i];
-      *digit_counters[i] = thread_prefixes[i] + 1;
-    }
-
-    item.barrier(sycl::access::fence_space::local_space);
-
-    scan_counters(item);
-
-    item.barrier(sycl::access::fence_space::local_space);
-
-    for (int i = 0; i < VALUES_PER_THREAD; ++i) {
-      ranks[i] = thread_prefixes[i] + *digit_counters[i];
-    }
-  }
-
-private:
-  template <typename Item>
-  __dpct_inline__ void reset_local_memory(const Item &item) {
-    packed_counter_type *ptr =
-        reinterpret_cast<packed_counter_type *>(_local_memory);
-
-#pragma unroll
-    for (int i = 0; i < PADDED_COUNTER_LANES; ++i) {
-      ptr[i * item.get_local_range().size() + item.get_local_linear_id()] = 0;
-    }
-  }
-
-  template <typename Item>
-  __dpct_inline__ packed_counter_type upsweep(const Item &item) {
-    packed_counter_type sum = 0;
-    packed_counter_type *ptr =
-        reinterpret_cast<packed_counter_type *>(_local_memory);
-
-#pragma unroll
-    for (int i = 0; i < PADDED_COUNTER_LANES; i++) {
-      cached_segment[i] =
-          ptr[item.get_local_linear_id() * PADDED_COUNTER_LANES + i];
-    }
-
-#pragma unroll
-    for (int i = 0; i < PADDED_COUNTER_LANES; ++i) {
-      sum += cached_segment[i];
-    }
-
-    return sum;
-  }
-
-  template <typename Item>
-  __dpct_inline__ void
-  exclusive_downsweep(const Item &item, packed_counter_type raking_partial) {
-    packed_counter_type *ptr =
-        reinterpret_cast<packed_counter_type *>(_local_memory);
-    packed_counter_type sum = raking_partial;
-
-#pragma unroll
-    for (int i = 0; i < PADDED_COUNTER_LANES; ++i) {
-      packed_counter_type value = cached_segment[i];
-      cached_segment[i] = sum;
-      sum += value;
-    }
-
-#pragma unroll
-    for (int i = 0; i < PADDED_COUNTER_LANES; ++i) {
-      ptr[item.get_local_linear_id() * PADDED_COUNTER_LANES + i] =
-          cached_segment[i];
-    }
-  }
-
-  struct prefix_callback {
-    __dpct_inline__ packed_counter_type
-    operator()(packed_counter_type block_aggregate) {
-      packed_counter_type block_prefix = 0;
-
-#pragma unroll
-      for (int packed = 1; packed < PACKING_RATIO; packed++) {
-        block_prefix += block_aggregate
-                        << (sizeof(digit_counter_type) * 8 * packed);
-      }
-
-      return block_prefix;
-    }
-  };
-
-  template <typename Item>
-  __dpct_inline__ void scan_counters(const Item &item) {
-    packed_counter_type raking_partial = upsweep(item);
-
-    prefix_callback callback;
-    packed_counter_type exclusive_partial = exclusive_scan(
-        item, raking_partial, sycl::ext::oneapi::plus<packed_counter_type>(),
-        callback);
-
-    exclusive_downsweep(item, exclusive_partial);
-  }
-
-private:
-  static constexpr int PACKING_RATIO =
-      sizeof(packed_counter_type) / sizeof(digit_counter_type);
-  static constexpr int LOG_PACKING_RATIO = log2<PACKING_RATIO>::VALUE;
-  static constexpr int LOG_COUNTER_LANES = RADIX_BITS - LOG_PACKING_RATIO;
-  static constexpr int COUNTER_LANES = 1 << LOG_COUNTER_LANES;
-  static constexpr int PADDED_COUNTER_LANES = COUNTER_LANES + 1;
-
-  packed_counter_type cached_segment[PADDED_COUNTER_LANES];
-  uint8_t *_local_memory;
-};
-
-template <typename T, typename U> struct base_traits {
-
-  static __dpct_inline__ U twiddle_in(U key) {
-    throw std::runtime_error("Not implemented");
-  }
-  static __dpct_inline__ U twiddle_out(U key) {
-    throw std::runtime_error("Not implemented");
-  }
-};
-
-template <typename U> struct base_traits<uint32_t, U> {
-  static __dpct_inline__ U twiddle_in(U key) { return key; }
-  static __dpct_inline__ U twiddle_out(U key) { return key; }
-};
-
-template <typename U> struct base_traits<int, U> {
-  static constexpr U HIGH_BIT = U(1) << ((sizeof(U) * 8) - 1);
-  static __dpct_inline__ U twiddle_in(U key) { return key ^ HIGH_BIT; }
-  static __dpct_inline__ U twiddle_out(U key) { return key ^ HIGH_BIT; }
-};
-
-template <typename U> struct base_traits<float, U> {
-  static constexpr U HIGH_BIT = U(1) << ((sizeof(U) * 8) - 1);
-  static __dpct_inline__ U twiddle_in(U key) {
-    U mask = (key & HIGH_BIT) ? U(-1) : HIGH_BIT;
-    return key ^ mask;
-  }
-  static __dpct_inline__ U twiddle_out(U key) {
-    U mask = (key & HIGH_BIT) ? HIGH_BIT : U(-1);
-    return key ^ mask;
-  }
-};
-
-template <typename T> struct traits : base_traits<T, T> {};
-template <> struct traits<uint32_t> : base_traits<uint32_t, uint32_t> {};
-template <> struct traits<int> : base_traits<int, uint32_t> {};
-template <> struct traits<float> : base_traits<float, uint32_t> {};
-
-} // namespace detail
-
-namespace detail {
-
-template <int N> struct power_of_two {
-  enum { VALUE = ((N & (N - 1)) == 0) };
-};
-
-__dpct_inline__ uint32_t shr_add(uint32_t x, uint32_t shift, uint32_t addend) {
-  return (x >> shift) + addend;
-}
-
-} // namespace detail
-
-/// Implements the scatter-to-blocked, blocked-to-striped, and
-/// striped-to-blocked exchange patterns used in the radix sort algorithm.
-///
-/// \tparam T type of the data elements exchanged
-/// \tparam VALUES_PER_THREAD number of data elements assigned to a thread
-template <typename T, int VALUES_PER_THREAD> class exchange {
-public:
-  static size_t get_local_memory_size(size_t group_threads) {
-    size_t padding_values =
-        (INSERT_PADDING)
-            ? ((group_threads * VALUES_PER_THREAD) >> LOG_LOCAL_MEMORY_BANKS)
-            : 0;
-    return (group_threads * VALUES_PER_THREAD + padding_values) * sizeof(T);
-  }
-
-  exchange(uint8_t *local_memory) : _local_memory(local_memory) {}
-
-  // TODO: Investigate if padding is required for performance,
-  // and if specializations are required for specific target hardware.
-  static size_t adjust_by_padding(size_t offset) {
-
-    if constexpr (INSERT_PADDING) {
-      offset = detail::shr_add(offset, LOG_LOCAL_MEMORY_BANKS, offset);
-    }
-    return offset;
-  }
-
-  struct blocked_offset {
-    template <typename Item> size_t operator()(Item item, size_t i) {
-      size_t offset = item.get_local_linear_id() * VALUES_PER_THREAD + i;
-      return adjust_by_padding(offset);
-    }
-  };
-
-  struct striped_offset {
-    template <typename Item> size_t operator()(Item item, size_t i) {
-      size_t offset = i * item.get_local_range(2) * item.get_local_range(1) *
-                          item.get_local_range(0) +
-                      item.get_local_linear_id();
-      return adjust_by_padding(offset);
-    }
-  };
-
-  template <typename Iterator> struct scatter_offset {
-    Iterator begin;
-    scatter_offset(const int (&ranks)[VALUES_PER_THREAD]) {
-      begin = std::begin(ranks);
-    }
-    template <typename Item> size_t operator()(Item item, size_t i) const {
-      // index i is expected to be within bounds [0, VALUES_PER_THREAD)
-      return adjust_by_padding(begin[i]);
-    }
-  };
-
-  template <typename Item, typename offsetFunctorTypeFW,
-            typename offsetFunctorTypeRV>
-  __dpct_inline__ void helper_exchange(Item item, T (&keys)[VALUES_PER_THREAD],
-                                       offsetFunctorTypeFW &offset_functor_fw,
-                                       offsetFunctorTypeRV &offset_functor_rv) {
-
-    T *buffer = reinterpret_cast<T *>(_local_memory);
-
-#pragma unroll
-    for (size_t i = 0; i < VALUES_PER_THREAD; i++) {
-      size_t offset = offset_functor_fw(item, i);
-      buffer[offset] = keys[i];
-    }
-
-    item.barrier(sycl::access::fence_space::local_space);
-
-#pragma unroll
-    for (size_t i = 0; i < VALUES_PER_THREAD; i++) {
-      size_t offset = offset_functor_rv(item, i);
-      keys[i] = buffer[offset];
-    }
-  }
-
-  /// Rearrange elements from blocked order to striped order
-  template <typename Item>
-  __dpct_inline__ void blocked_to_striped(Item item,
-                                          T (&keys)[VALUES_PER_THREAD]) {
-
-    striped_offset get_striped_offset;
-    blocked_offset get_blocked_offset;
-    helper_exchange(item, keys, get_blocked_offset, get_striped_offset);
-  }
-
-  /// Rearrange elements from striped order to blocked order
-  template <typename Item>
-  __dpct_inline__ void striped_to_blocked(Item item,
-                                          T (&keys)[VALUES_PER_THREAD]) {
-
-    blocked_offset get_blocked_offset;
-    striped_offset get_striped_offset;
-    helper_exchange(item, keys, get_striped_offset, get_blocked_offset);
-  }
-
-  /// Rearrange elements from rank order to blocked order
-  template <typename Item>
-  __dpct_inline__ void scatter_to_blocked(Item item,
-                                          T (&keys)[VALUES_PER_THREAD],
-                                          int (&ranks)[VALUES_PER_THREAD]) {
-
-    scatter_offset<const int *> get_scatter_offset(ranks);
-    blocked_offset get_blocked_offset;
-    helper_exchange(item, keys, get_scatter_offset, get_blocked_offset);
-  }
-
-  /// Rearrange elements from scatter order to striped order
-  template <typename Item>
-  __dpct_inline__ void scatter_to_striped(Item item,
-                                          T (&keys)[VALUES_PER_THREAD],
-                                          int (&ranks)[VALUES_PER_THREAD]) {
-
-    scatter_offset<const int *> get_scatter_offset(ranks);
-    striped_offset get_striped_offset;
-    helper_exchange(item, keys, get_scatter_offset, get_striped_offset);
-  }
-
-private:
-  static constexpr int LOG_LOCAL_MEMORY_BANKS = 4;
-  static constexpr bool INSERT_PADDING =
-      (VALUES_PER_THREAD > 4) &&
-      (detail::power_of_two<VALUES_PER_THREAD>::VALUE);
-
-  uint8_t *_local_memory;
-};
-
-/// Implements radix sort to sort integer data elements assigned to all threads
-/// in the group.
-///
-/// \tparam T type of the data elements exchanged
-/// \tparam VALUES_PER_THREAD number of data elements assigned to a thread
-/// \tparam DESCENDING boolean value indicating if data elements are sorted in
-/// descending order.
-template <typename T, int VALUES_PER_THREAD, bool DESCENDING = false>
-class radix_sort {
-public:
-  static size_t get_local_memory_size(size_t group_threads) {
-    size_t ranks_size =
-        detail::radix_rank<RADIX_BITS>::get_local_memory_size(group_threads);
-    size_t exchange_size =
-        exchange<T, VALUES_PER_THREAD>::get_local_memory_size(group_threads);
-    return sycl::max(ranks_size, exchange_size);
-  }
-
-  radix_sort(uint8_t *local_memory) : _local_memory(local_memory) {}
-
-  template <typename Item>
-  __dpct_inline__ void
-  helper_sort(const Item &item, T (&keys)[VALUES_PER_THREAD], int begin_bit = 0,
-              int end_bit = 8 * sizeof(T), bool is_striped = false) {
-
-    uint32_t(&unsigned_keys)[VALUES_PER_THREAD] =
-        reinterpret_cast<uint32_t(&)[VALUES_PER_THREAD]>(keys);
-
-#pragma unroll
-    for (int i = 0; i < VALUES_PER_THREAD; ++i) {
-      unsigned_keys[i] = detail::traits<T>::twiddle_in(unsigned_keys[i]);
-    }
-
-    for (int i = begin_bit; i < end_bit; i += RADIX_BITS) {
-      int pass_bits = sycl::min(RADIX_BITS, end_bit - begin_bit);
-
-      int ranks[VALUES_PER_THREAD];
-      detail::radix_rank<RADIX_BITS, DESCENDING>(_local_memory)
-          .template rank_keys<Item, VALUES_PER_THREAD>(item, unsigned_keys,
-                                                       ranks, i, pass_bits);
-
-      item.barrier(sycl::access::fence_space::local_space);
-
-      bool last_iter = i + RADIX_BITS > end_bit;
-      if (last_iter && is_striped) {
-        exchange<T, VALUES_PER_THREAD>(_local_memory)
-            .scatter_to_striped(item, keys, ranks);
-
-      } else {
-        exchange<T, VALUES_PER_THREAD>(_local_memory)
-            .scatter_to_blocked(item, keys, ranks);
-      }
-
-      item.barrier(sycl::access::fence_space::local_space);
-    }
-
-#pragma unroll
-    for (int i = 0; i < VALUES_PER_THREAD; ++i) {
-      unsigned_keys[i] = detail::traits<T>::twiddle_out(unsigned_keys[i]);
-    }
-  }
-
-  template <typename Item>
-  __dpct_inline__ void
-  sort_blocked(const Item &item, T (&keys)[VALUES_PER_THREAD],
-               int begin_bit = 0, int end_bit = 8 * sizeof(T)) {
-    helper_sort(item, keys, begin_bit, end_bit, false);
-  }
-
-  template <typename Item>
-  __dpct_inline__ void
-  sort_blocked_to_striped(const Item &item, T (&keys)[VALUES_PER_THREAD],
-                          int begin_bit = 0, int end_bit = 8 * sizeof(T)) {
-    helper_sort(item, keys, begin_bit, end_bit, true);
-  }
-
-private:
-  static constexpr int RADIX_BITS = 4;
-
-  uint8_t *_local_memory;
-};
-
-/// Loads a linear segment of items into a blocked arrangement across threads.
-/// Helper for Block Load
-enum load_algorithm {
-
-  BLOCK_LOAD_DIRECT,
-  BLOCK_LOAD_STRIPED,
-  // To-do: BLOCK_LOAD_WARP_TRANSPOSE
-
-};
-
-// Loads a linear segment of workgroup items into a blocked arrangement.
-template <size_t ITEMS_PER_WORK_ITEM, typename InputT, typename InputIteratorT,
-          typename Item>
-__dpct_inline__ void load_blocked(const Item &item, InputIteratorT block_itr,
-                                  InputT (&items)[ITEMS_PER_WORK_ITEM]) {
-
-  // This implementation does not take into account range loading across
-  // workgroup items. To-do: Decide whether range loading is required for
-  // group loading.
-  size_t linear_tid = item.get_local_linear_id();
-  uint32_t workgroup_offset = linear_tid * ITEMS_PER_WORK_ITEM;
-#pragma unroll
-  for (size_t idx = 0; idx < ITEMS_PER_WORK_ITEM; idx++) {
-    items[idx] = block_itr[workgroup_offset + idx];
-  }
-}
-
-// Loads a linear segment of workgroup items into a striped arrangement.
-template <size_t ITEMS_PER_WORK_ITEM, typename InputT, typename InputIteratorT,
-          typename Item>
-__dpct_inline__ void load_striped(const Item &item, InputIteratorT block_itr,
-                                  InputT (&items)[ITEMS_PER_WORK_ITEM]) {
-
-  // This implementation does not take into account range loading across
-  // workgroup items. To-do: Decide whether range loading is required for
-  // group loading.
-  size_t linear_tid = item.get_local_linear_id();
-  size_t group_work_items = item.get_local_range().size();
-#pragma unroll
-  for (size_t idx = 0; idx < ITEMS_PER_WORK_ITEM; idx++) {
-    items[idx] = block_itr[linear_tid + (idx * group_work_items)];
-  }
-}
-
-// Loads a linear segment of workgroup items into a subgroup striped
-// arrangement. Created as free function until exchange mechanism is
-// implemented.
-// To-do: inline this function with BLOCK_LOAD_WARP_TRANSPOSE mechanism
-template <size_t ITEMS_PER_WORK_ITEM, typename InputT, typename InputIteratorT,
-          typename Item>
-__dpct_inline__ void
-uninitialized_load_subgroup_striped(const Item &item, InputIteratorT block_itr,
-                                    InputT (&items)[ITEMS_PER_WORK_ITEM]) {
-
-  // This implementation does not take into account range loading across
-  // workgroup items. To-do: Decide whether range loading is required for
-  // group loading.
-  // This implementation uses uninitialized memory for loading linear segments
-  // into warp striped arrangement.
-  uint32_t subgroup_offset = item.get_sub_group().get_local_linear_id();
-  uint32_t subgroup_size = item.get_sub_group().get_local_linear_range();
-  uint32_t subgroup_idx = item.get_sub_group().get_group_linear_id();
-  uint32_t initial_offset =
-      (subgroup_idx * ITEMS_PER_WORK_ITEM * subgroup_size) + subgroup_offset;
-#pragma unroll
-  for (size_t idx = 0; idx < ITEMS_PER_WORK_ITEM; idx++) {
-    new (&items[idx]) InputT(block_itr[initial_offset + (idx * subgroup_size)]);
-  }
-}
-
-// Template parameters:
-// ITEMS_PER_WORK_ITEM: size_t variable controlling the number of items per
-// thread/work_item
-// ALGORITHM: load_algorithm variable controlling the type of load operation.
-// InputT: type for input sequence.
-// InputIteratorT: input iterator type
-// Item: typename parameter resembling sycl::nd_item<3>.
-template <size_t ITEMS_PER_WORK_ITEM, load_algorithm ALGORITHM, typename InputT,
-          typename InputIteratorT, typename Item>
-class workgroup_load {
-public:
-  static size_t get_local_memory_size(size_t group_work_items) { return 0; }
-  workgroup_load(uint8_t *local_memory) : _local_memory(local_memory) {}
-
-  __dpct_inline__ void load(const Item &item, InputIteratorT block_itr,
-                            InputT (&items)[ITEMS_PER_WORK_ITEM]) {
-
-    if constexpr (ALGORITHM == BLOCK_LOAD_DIRECT) {
-      load_blocked(item, block_itr, items);
-    } else if constexpr (ALGORITHM == BLOCK_LOAD_STRIPED) {
-      load_striped(item, block_itr, items);
-    }
-  }
-
-private:
-  uint8_t *_local_memory;
-};
-
 /// Perform a reduction of the data elements assigned to all threads in the
 /// group.
 ///
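The move is NFC: everything removed above reappears under the same dpct::group
namespace in the new header, so existing call sites compile unchanged. For
reviewers who want to sanity-check the relocated API, a minimal usage sketch
follows; the queue, the USM pointer `data`, the helper name `sort_tile`, and
the kernel scaffolding are illustrative assumptions, not part of this patch:

#include <sycl/sycl.hpp>
#include <dpct/dpct.hpp>
#include <dpct/dpl_utils.hpp> // now pulls in group_utils.hpp, see next hunk

constexpr int VALUES_PER_THREAD = 4;
using tile_sorter = dpct::group::radix_sort<int, VALUES_PER_THREAD>;

// One work-group cooperatively sorts group_size * VALUES_PER_THREAD keys.
void sort_tile(sycl::queue &q, int *data, size_t group_size) {
  size_t slm_bytes = tile_sorter::get_local_memory_size(group_size);
  q.submit([&](sycl::handler &cgh) {
    sycl::local_accessor<uint8_t, 1> slm(sycl::range<1>(slm_bytes), cgh);
    cgh.parallel_for(
        sycl::nd_range<3>(sycl::range<3>(1, 1, group_size),
                          sycl::range<3>(1, 1, group_size)),
        [=](sycl::nd_item<3> item) {
          int keys[VALUES_PER_THREAD];
          // Each work-item loads a contiguous run of keys (blocked order).
          dpct::group::load_blocked(item, data, keys);
          // Cooperative radix sort over the whole tile; keys stay blocked.
          tile_sorter(&slm[0]).sort_blocked(item, keys);
          // Store back in blocked order.
          size_t base = item.get_local_linear_id() * VALUES_PER_THREAD;
          for (int i = 0; i < VALUES_PER_THREAD; ++i)
            data[base + i] = keys[i];
        });
  });
}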
diff --git a/clang/runtime/dpct-rt/include/dpct/dpl_utils.hpp b/clang/runtime/dpct-rt/include/dpct/dpl_utils.hpp
index 1440b4250ece..efc5940c6665 100644
--- a/clang/runtime/dpct-rt/include/dpct/dpl_utils.hpp
+++ b/clang/runtime/dpct-rt/include/dpct/dpl_utils.hpp
@@ -23,6 +23,7 @@
 #include "dpl_extras/vector.h"
 
 #include "dpl_extras/dpcpp_extensions.h"
+#include "group_utils.hpp"
 // Only include iterator adaptor (and therefore boost) if necessary
 #ifdef ITERATOR_ADAPTOR_REQUIRED
 #include "dpl_extras/iterator_adaptor.h"
diff --git a/clang/runtime/dpct-rt/include/dpct/group_utils.hpp b/clang/runtime/dpct-rt/include/dpct/group_utils.hpp
new file mode 100644
index 000000000000..9f0be34ab87f
--- /dev/null
+++ b/clang/runtime/dpct-rt/include/dpct/group_utils.hpp
@@ -0,0 +1,533 @@
+//==---- group_utils.hpp ------------------*- C++ -*--------------------==//
+//
+// Copyright (C) Intel Corporation
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// See https://llvm.org/LICENSE.txt for license information.
+//
+//===------------------------------------------------------------------===//
+
+#ifndef __DPCT_GROUP_UTILS_HPP__
+#define __DPCT_GROUP_UTILS_HPP__
+
+#include <stdexcept>
+#include <sycl/sycl.hpp>
+
+#include "dpct.hpp"
+#include "dpl_extras/functional.h"
+
+namespace dpct {
+namespace group {
+
+namespace detail {
+
+typedef uint16_t digit_counter_type;
+typedef uint32_t packed_counter_type;
+
+template <int N, int CURRENT_VAL = N, int COUNT = 0> struct log2 {
+  enum { VALUE = log2<N, (CURRENT_VAL >> 1), COUNT + 1>::VALUE };
+};
+
+template <int N, int COUNT> struct log2<N, 0, COUNT> {
+  enum { VALUE = (1 << (COUNT - 1) < N) ? COUNT : COUNT - 1 };
+};
+
+template <int RADIX_BITS, bool DESCENDING = false> class radix_rank {
+public:
+  static size_t get_local_memory_size(size_t group_threads) {
+    return group_threads * PADDED_COUNTER_LANES * sizeof(packed_counter_type);
+  }
+
+  radix_rank(uint8_t *local_memory) : _local_memory(local_memory) {}
+
+  template <typename Item, int VALUES_PER_THREAD>
+  __dpct_inline__ void
+  rank_keys(const Item &item, uint32_t (&keys)[VALUES_PER_THREAD],
+            int (&ranks)[VALUES_PER_THREAD], int current_bit, int num_bits) {
+
+    digit_counter_type thread_prefixes[VALUES_PER_THREAD];
+    digit_counter_type *digit_counters[VALUES_PER_THREAD];
+    digit_counter_type *buffer =
+        reinterpret_cast<digit_counter_type *>(_local_memory);
+
+    reset_local_memory(item);
+
+    item.barrier(sycl::access::fence_space::local_space);
+
+#pragma unroll
+    for (int i = 0; i < VALUES_PER_THREAD; ++i) {
+      uint32_t digit = ::dpct::bfe(keys[i], current_bit, num_bits);
+      uint32_t sub_counter = digit >> LOG_COUNTER_LANES;
+      uint32_t counter_lane = digit & (COUNTER_LANES - 1);
+
+      if (DESCENDING) {
+        sub_counter = PACKING_RATIO - 1 - sub_counter;
+        counter_lane = COUNTER_LANES - 1 - counter_lane;
+      }
+
+      digit_counters[i] =
+          &buffer[counter_lane * item.get_local_range().size() * PACKING_RATIO +
+                  item.get_local_linear_id() * PACKING_RATIO + sub_counter];
+      thread_prefixes[i] = *digit_counters[i];
+      *digit_counters[i] = thread_prefixes[i] + 1;
+    }
+
+    item.barrier(sycl::access::fence_space::local_space);
+
+    scan_counters(item);
+
+    item.barrier(sycl::access::fence_space::local_space);
+
+    for (int i = 0; i < VALUES_PER_THREAD; ++i) {
+      ranks[i] = thread_prefixes[i] + *digit_counters[i];
+    }
+  }
+
+private:
+  template <typename Item>
+  __dpct_inline__ void reset_local_memory(const Item &item) {
+    packed_counter_type *ptr =
+        reinterpret_cast<packed_counter_type *>(_local_memory);
+
+#pragma unroll
+    for (int i = 0; i < PADDED_COUNTER_LANES; ++i) {
+      ptr[i * item.get_local_range().size() + item.get_local_linear_id()] = 0;
+    }
+  }
+
+  template <typename Item>
+  __dpct_inline__ packed_counter_type upsweep(const Item &item) {
+    packed_counter_type sum = 0;
+    packed_counter_type *ptr =
+        reinterpret_cast<packed_counter_type *>(_local_memory);
+
+#pragma unroll
+    for (int i = 0; i < PADDED_COUNTER_LANES; i++) {
+      cached_segment[i] =
+          ptr[item.get_local_linear_id() * PADDED_COUNTER_LANES + i];
+    }
+
+#pragma unroll
+    for (int i = 0; i < PADDED_COUNTER_LANES; ++i) {
+      sum += cached_segment[i];
+    }
+
+    return sum;
+  }
+
+  template <typename Item>
+  __dpct_inline__ void exclusive_downsweep(const Item &item,
+                                           packed_counter_type raking_partial) {
+    packed_counter_type *ptr =
+        reinterpret_cast<packed_counter_type *>(_local_memory);
+    packed_counter_type sum = raking_partial;
+
+#pragma unroll
+    for (int i = 0; i < PADDED_COUNTER_LANES; ++i) {
+      packed_counter_type value = cached_segment[i];
+      cached_segment[i] = sum;
+      sum += value;
+    }
+
+#pragma unroll
+    for (int i = 0; i < PADDED_COUNTER_LANES; ++i) {
+      ptr[item.get_local_linear_id() * PADDED_COUNTER_LANES + i] =
+          cached_segment[i];
+    }
+  }
+
+  struct prefix_callback {
+    __dpct_inline__ packed_counter_type
+    operator()(packed_counter_type block_aggregate) {
+      packed_counter_type block_prefix = 0;
+
+#pragma unroll
+      for (int packed = 1; packed < PACKING_RATIO; packed++) {
+        block_prefix += block_aggregate
+                        << (sizeof(digit_counter_type) * 8 * packed);
+      }
+
+      return block_prefix;
+    }
+  };
+
+  template <typename Item>
+  __dpct_inline__ void scan_counters(const Item &item) {
+    packed_counter_type raking_partial = upsweep(item);
+
+    prefix_callback callback;
+    packed_counter_type exclusive_partial = exclusive_scan(
+        item, raking_partial, sycl::ext::oneapi::plus<packed_counter_type>(),
+        callback);
+
+    exclusive_downsweep(item, exclusive_partial);
+  }
+
+private:
+  static constexpr int PACKING_RATIO =
+      sizeof(packed_counter_type) / sizeof(digit_counter_type);
+  static constexpr int LOG_PACKING_RATIO = log2<PACKING_RATIO>::VALUE;
+  static constexpr int LOG_COUNTER_LANES = RADIX_BITS - LOG_PACKING_RATIO;
+  static constexpr int COUNTER_LANES = 1 << LOG_COUNTER_LANES;
+  static constexpr int PADDED_COUNTER_LANES = COUNTER_LANES + 1;
+
+  packed_counter_type cached_segment[PADDED_COUNTER_LANES];
+  uint8_t *_local_memory;
+};
+
+template <typename T, typename U> struct base_traits {
+
+  static __dpct_inline__ U twiddle_in(U key) {
+    throw std::runtime_error("Not implemented");
+  }
+  static __dpct_inline__ U twiddle_out(U key) {
+    throw std::runtime_error("Not implemented");
+  }
+};
+
+template <typename U> struct base_traits<uint32_t, U> {
+  static __dpct_inline__ U twiddle_in(U key) { return key; }
+  static __dpct_inline__ U twiddle_out(U key) { return key; }
+};
+
+template <typename U> struct base_traits<int, U> {
+  static constexpr U HIGH_BIT = U(1) << ((sizeof(U) * 8) - 1);
+  static __dpct_inline__ U twiddle_in(U key) { return key ^ HIGH_BIT; }
+  static __dpct_inline__ U twiddle_out(U key) { return key ^ HIGH_BIT; }
+};
+
+template <typename U> struct base_traits<float, U> {
+  static constexpr U HIGH_BIT = U(1) << ((sizeof(U) * 8) - 1);
+  static __dpct_inline__ U twiddle_in(U key) {
+    U mask = (key & HIGH_BIT) ? U(-1) : HIGH_BIT;
+    return key ^ mask;
+  }
+  static __dpct_inline__ U twiddle_out(U key) {
+    U mask = (key & HIGH_BIT) ? HIGH_BIT : U(-1);
+    return key ^ mask;
+  }
+};
+
+template <typename T> struct traits : base_traits<T, T> {};
+template <> struct traits<uint32_t> : base_traits<uint32_t, uint32_t> {};
+template <> struct traits<int> : base_traits<int, uint32_t> {};
+template <> struct traits<float> : base_traits<float, uint32_t> {};
+
+template <int N> struct power_of_two {
+  enum { VALUE = ((N & (N - 1)) == 0) };
+};
+
+__dpct_inline__ uint32_t shr_add(uint32_t x, uint32_t shift, uint32_t addend) {
+  return (x >> shift) + addend;
+}
+
+} // namespace detail
+
+/// Implements the scatter-to-blocked, blocked-to-striped, and
+/// striped-to-blocked exchange patterns used in the radix sort algorithm.
+///
+/// \tparam T type of the data elements exchanged
+/// \tparam VALUES_PER_THREAD number of data elements assigned to a thread
+template <typename T, int VALUES_PER_THREAD> class exchange {
+public:
+  static size_t get_local_memory_size(size_t group_threads) {
+    size_t padding_values =
+        (INSERT_PADDING)
+            ? ((group_threads * VALUES_PER_THREAD) >> LOG_LOCAL_MEMORY_BANKS)
+            : 0;
+    return (group_threads * VALUES_PER_THREAD + padding_values) * sizeof(T);
+  }
+
+  exchange(uint8_t *local_memory) : _local_memory(local_memory) {}
+
+  // TODO: Investigate if padding is required for performance,
+  // and if specializations are required for specific target hardware.
+  static size_t adjust_by_padding(size_t offset) {
+
+    if constexpr (INSERT_PADDING) {
+      offset = detail::shr_add(offset, LOG_LOCAL_MEMORY_BANKS, offset);
+    }
+    return offset;
+  }
+
+  struct blocked_offset {
+    template <typename Item> size_t operator()(Item item, size_t i) {
+      size_t offset = item.get_local_linear_id() * VALUES_PER_THREAD + i;
+      return adjust_by_padding(offset);
+    }
+  };
+
+  struct striped_offset {
+    template <typename Item> size_t operator()(Item item, size_t i) {
+      size_t offset = i * item.get_local_range(2) * item.get_local_range(1) *
+                          item.get_local_range(0) +
+                      item.get_local_linear_id();
+      return adjust_by_padding(offset);
+    }
+  };
+
+  template <typename Iterator> struct scatter_offset {
+    Iterator begin;
+    scatter_offset(const int (&ranks)[VALUES_PER_THREAD]) {
+      begin = std::begin(ranks);
+    }
+    template <typename Item> size_t operator()(Item item, size_t i) const {
+      // index i is expected to be within bounds [0, VALUES_PER_THREAD)
+      return adjust_by_padding(begin[i]);
+    }
+  };
+
+  template <typename Item, typename offsetFunctorTypeFW,
+            typename offsetFunctorTypeRV>
+  __dpct_inline__ void helper_exchange(Item item, T (&keys)[VALUES_PER_THREAD],
+                                       offsetFunctorTypeFW &offset_functor_fw,
+                                       offsetFunctorTypeRV &offset_functor_rv) {
+
+    T *buffer = reinterpret_cast<T *>(_local_memory);
+
+#pragma unroll
+    for (size_t i = 0; i < VALUES_PER_THREAD; i++) {
+      size_t offset = offset_functor_fw(item, i);
+      buffer[offset] = keys[i];
+    }
+
+    item.barrier(sycl::access::fence_space::local_space);
+
+#pragma unroll
+    for (size_t i = 0; i < VALUES_PER_THREAD; i++) {
+      size_t offset = offset_functor_rv(item, i);
+      keys[i] = buffer[offset];
+    }
+  }
+
+  /// Rearrange elements from blocked order to striped order
+  template <typename Item>
+  __dpct_inline__ void blocked_to_striped(Item item,
+                                          T (&keys)[VALUES_PER_THREAD]) {
+
+    striped_offset get_striped_offset;
+    blocked_offset get_blocked_offset;
+    helper_exchange(item, keys, get_blocked_offset, get_striped_offset);
+  }
+
+  /// Rearrange elements from striped order to blocked order
+  template <typename Item>
+  __dpct_inline__ void striped_to_blocked(Item item,
+                                          T (&keys)[VALUES_PER_THREAD]) {
+
+    blocked_offset get_blocked_offset;
+    striped_offset get_striped_offset;
+    helper_exchange(item, keys, get_striped_offset, get_blocked_offset);
+  }
+
+  /// Rearrange elements from rank order to blocked order
+  template <typename Item>
+  __dpct_inline__ void scatter_to_blocked(Item item,
+                                          T (&keys)[VALUES_PER_THREAD],
+                                          int (&ranks)[VALUES_PER_THREAD]) {
+
+    scatter_offset<const int *> get_scatter_offset(ranks);
+    blocked_offset get_blocked_offset;
+    helper_exchange(item, keys, get_scatter_offset, get_blocked_offset);
+  }
+
+  /// Rearrange elements from scatter order to striped order
+  template <typename Item>
+  __dpct_inline__ void scatter_to_striped(Item item,
+                                          T (&keys)[VALUES_PER_THREAD],
+                                          int (&ranks)[VALUES_PER_THREAD]) {
+
+    scatter_offset<const int *> get_scatter_offset(ranks);
+    striped_offset get_striped_offset;
+    helper_exchange(item, keys, get_scatter_offset, get_striped_offset);
+  }
+
+private:
+  static constexpr int LOG_LOCAL_MEMORY_BANKS = 4;
+  static constexpr bool INSERT_PADDING =
+      (VALUES_PER_THREAD > 4) &&
+      (detail::power_of_two<VALUES_PER_THREAD>::VALUE);
+
+  uint8_t *_local_memory;
+};
+
+/// Implements radix sort to sort integer data elements assigned to all threads
+/// in the group.
+///
+/// \tparam T type of the data elements exchanged
+/// \tparam VALUES_PER_THREAD number of data elements assigned to a thread
+/// \tparam DESCENDING boolean value indicating if data elements are sorted in
+/// descending order.
+template <typename T, int VALUES_PER_THREAD, bool DESCENDING = false>
+class radix_sort {
+public:
+  static size_t get_local_memory_size(size_t group_threads) {
+    size_t ranks_size =
+        detail::radix_rank<RADIX_BITS>::get_local_memory_size(group_threads);
+    size_t exchange_size =
+        exchange<T, VALUES_PER_THREAD>::get_local_memory_size(group_threads);
+    return sycl::max(ranks_size, exchange_size);
+  }
+
+  radix_sort(uint8_t *local_memory) : _local_memory(local_memory) {}
+
+  template <typename Item>
+  __dpct_inline__ void
+  helper_sort(const Item &item, T (&keys)[VALUES_PER_THREAD], int begin_bit = 0,
+              int end_bit = 8 * sizeof(T), bool is_striped = false) {
+
+    uint32_t(&unsigned_keys)[VALUES_PER_THREAD] =
+        reinterpret_cast<uint32_t(&)[VALUES_PER_THREAD]>(keys);
+
+#pragma unroll
+    for (int i = 0; i < VALUES_PER_THREAD; ++i) {
+      unsigned_keys[i] = detail::traits<T>::twiddle_in(unsigned_keys[i]);
+    }
+
+    for (int i = begin_bit; i < end_bit; i += RADIX_BITS) {
+      int pass_bits = sycl::min(RADIX_BITS, end_bit - begin_bit);
+
+      int ranks[VALUES_PER_THREAD];
+      detail::radix_rank<RADIX_BITS, DESCENDING>(_local_memory)
+          .template rank_keys<Item, VALUES_PER_THREAD>(item, unsigned_keys,
+                                                       ranks, i, pass_bits);
+
+      item.barrier(sycl::access::fence_space::local_space);
+
+      bool last_iter = i + RADIX_BITS > end_bit;
+      if (last_iter && is_striped) {
+        exchange<T, VALUES_PER_THREAD>(_local_memory)
+            .scatter_to_striped(item, keys, ranks);
+
+      } else {
+        exchange<T, VALUES_PER_THREAD>(_local_memory)
+            .scatter_to_blocked(item, keys, ranks);
+      }
+
+      item.barrier(sycl::access::fence_space::local_space);
+    }
+
+#pragma unroll
+    for (int i = 0; i < VALUES_PER_THREAD; ++i) {
+      unsigned_keys[i] = detail::traits<T>::twiddle_out(unsigned_keys[i]);
+    }
+  }
+
+  template <typename Item>
+  __dpct_inline__ void
+  sort_blocked(const Item &item, T (&keys)[VALUES_PER_THREAD],
+               int begin_bit = 0, int end_bit = 8 * sizeof(T)) {
+    helper_sort(item, keys, begin_bit, end_bit, false);
+  }
+
+  template <typename Item>
+  __dpct_inline__ void
+  sort_blocked_to_striped(const Item &item, T (&keys)[VALUES_PER_THREAD],
+                          int begin_bit = 0, int end_bit = 8 * sizeof(T)) {
+    helper_sort(item, keys, begin_bit, end_bit, true);
+  }
+
+private:
+  static constexpr int RADIX_BITS = 4;
+
+  uint8_t *_local_memory;
+};
+
+/// Loads a linear segment of items into a blocked arrangement across threads.
+/// Helper for Block Load
+enum load_algorithm {
+
+  BLOCK_LOAD_DIRECT,
+  BLOCK_LOAD_STRIPED,
+  // To-do: BLOCK_LOAD_WARP_TRANSPOSE
+
+};
+
+// Loads a linear segment of workgroup items into a blocked arrangement.
+template <size_t ITEMS_PER_WORK_ITEM, typename InputT, typename InputIteratorT,
+          typename Item>
+__dpct_inline__ void load_blocked(const Item &item, InputIteratorT block_itr,
+                                  InputT (&items)[ITEMS_PER_WORK_ITEM]) {
+
+  // This implementation does not take into account range loading across
+  // workgroup items. To-do: Decide whether range loading is required for
+  // group loading.
+  size_t linear_tid = item.get_local_linear_id();
+  uint32_t workgroup_offset = linear_tid * ITEMS_PER_WORK_ITEM;
+#pragma unroll
+  for (size_t idx = 0; idx < ITEMS_PER_WORK_ITEM; idx++) {
+    items[idx] = block_itr[workgroup_offset + idx];
+  }
+}
+
+// Loads a linear segment of workgroup items into a striped arrangement.
+template <size_t ITEMS_PER_WORK_ITEM, typename InputT, typename InputIteratorT,
+          typename Item>
+__dpct_inline__ void load_striped(const Item &item, InputIteratorT block_itr,
+                                  InputT (&items)[ITEMS_PER_WORK_ITEM]) {
+
+  // This implementation does not take into account range loading across
+  // workgroup items. To-do: Decide whether range loading is required for
+  // group loading.
+  size_t linear_tid = item.get_local_linear_id();
+  size_t group_work_items = item.get_local_range().size();
+#pragma unroll
+  for (size_t idx = 0; idx < ITEMS_PER_WORK_ITEM; idx++) {
+    items[idx] = block_itr[linear_tid + (idx * group_work_items)];
+  }
+}
+
+// Loads a linear segment of workgroup items into a subgroup striped
+// arrangement. Created as free function until exchange mechanism is
+// implemented.
+// To-do: inline this function with BLOCK_LOAD_WARP_TRANSPOSE mechanism
+template <size_t ITEMS_PER_WORK_ITEM, typename InputT, typename InputIteratorT,
+          typename Item>
+__dpct_inline__ void
+uninitialized_load_subgroup_striped(const Item &item, InputIteratorT block_itr,
+                                    InputT (&items)[ITEMS_PER_WORK_ITEM]) {
+
+  // This implementation does not take into account range loading across
+  // workgroup items. To-do: Decide whether range loading is required for
+  // group loading.
+  // This implementation uses uninitialized memory for loading linear segments
+  // into warp striped arrangement.
+  uint32_t subgroup_offset = item.get_sub_group().get_local_linear_id();
+  uint32_t subgroup_size = item.get_sub_group().get_local_linear_range();
+  uint32_t subgroup_idx = item.get_sub_group().get_group_linear_id();
+  uint32_t initial_offset =
+      (subgroup_idx * ITEMS_PER_WORK_ITEM * subgroup_size) + subgroup_offset;
+#pragma unroll
+  for (size_t idx = 0; idx < ITEMS_PER_WORK_ITEM; idx++) {
+    new (&items[idx]) InputT(block_itr[initial_offset + (idx * subgroup_size)]);
+  }
+}
+
+// Template parameters:
+// ITEMS_PER_WORK_ITEM: size_t variable controlling the number of items per
+// thread/work_item
+// ALGORITHM: load_algorithm variable controlling the type of load operation.
+// InputT: type for input sequence.
+// InputIteratorT: input iterator type
+// Item: typename parameter resembling sycl::nd_item<3>.
+template <size_t ITEMS_PER_WORK_ITEM, load_algorithm ALGORITHM, typename InputT,
+          typename InputIteratorT, typename Item>
+class workgroup_load {
+public:
+  static size_t get_local_memory_size(size_t group_work_items) { return 0; }
+  workgroup_load(uint8_t *local_memory) : _local_memory(local_memory) {}
+
+  __dpct_inline__ void load(const Item &item, InputIteratorT block_itr,
+                            InputT (&items)[ITEMS_PER_WORK_ITEM]) {
+
+    if constexpr (ALGORITHM == BLOCK_LOAD_DIRECT) {
+      load_blocked(item, block_itr, items);
+    } else if constexpr (ALGORITHM == BLOCK_LOAD_STRIPED) {
+      load_striped(item, block_itr, items);
+    }
+  }
+
+private:
+  uint8_t *_local_memory;
+};
+
+} // namespace group
+} // namespace dpct
+
+#endif // __DPCT_GROUP_UTILS_HPP__
diff --git a/clang/test/dpct/check_header_files.cpp b/clang/test/dpct/check_header_files.cpp
index b2574d75228e..f0f154dbcfd0 100644
--- a/clang/test/dpct/check_header_files.cpp
+++ b/clang/test/dpct/check_header_files.cpp
@@ -81,6 +81,11 @@
 // RUN: echo "end" >> %T/check_header_files/diff_res.txt
 // RUN: FileCheck %s --match-full-lines --input-file %T/check_header_files/diff_res.txt
 
+// RUN: echo "begin" > %T/check_header_files/diff_res.txt
+// RUN: diff %T/check_header_files/out/include/dpct/group_utils.hpp %S/../../runtime/dpct-rt/include/dpct/group_utils.hpp >> %T/check_header_files/diff_res.txt
+// RUN: echo "end" >> %T/check_header_files/diff_res.txt
+// RUN: FileCheck %s --match-full-lines --input-file %T/check_header_files/diff_res.txt
+
 // RUN: echo "begin" > %T/check_header_files/diff_res.txt
 // RUN: diff %T/check_header_files/out/include/dpct/dpl_extras/algorithm.h %S/../../runtime/dpct-rt/include/dpct/dpl_extras/algorithm.h >> %T/check_header_files/diff_res.txt
 // RUN: echo "end" >> %T/check_header_files/diff_res.txt
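Finally, a small sketch of the load helpers that move with the file. It
assumes a kernel context with an nd_item and a USM pointer `in`; the function
name `load_example` is illustrative, not part of the patch. BLOCK_LOAD_STRIPED
reads coalesced (consecutive work-items touch consecutive addresses), which is
usually the better global-memory pattern; BLOCK_LOAD_DIRECT instead keeps each
work-item's ITEMS contiguous in memory:

#include <sycl/sycl.hpp>
#include <dpct/group_utils.hpp>

template <typename Item>
__dpct_inline__ void load_example(const Item &item, const float *in) {
  constexpr size_t ITEMS = 4;
  float items[ITEMS];
  // workgroup_load needs no local memory for either algorithm today
  // (get_local_memory_size returns 0), so a null scratch pointer suffices.
  dpct::group::workgroup_load<ITEMS, dpct::group::BLOCK_LOAD_STRIPED, float,
                              const float *, Item>(nullptr)
      .load(item, in, items);
  // items[] now holds in[tid], in[tid + N], in[tid + 2N], in[tid + 3N],
  // where N is the work-group size and tid the local linear id.
}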