From b2e55881f40b1fb1e422b095e68f28b5d148ae1a Mon Sep 17 00:00:00 2001
From: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com>
Date: Thu, 30 May 2024 08:58:10 +0530
Subject: [PATCH] [SYCLomatic][NFC] refactor code by creating a separate util
 file for groups (#1784)

Signed-off-by: Wang, Yihan
Co-authored-by: Yihan Wang
Co-authored-by: Dan Hoeflinger <109972525+danhoeflinger@users.noreply.github.com>
---
 clang/lib/DPCT/CMakeLists.txt                 |   2 +
 clang/lib/DPCT/GenHelperFunction.cpp          |   4 +
 clang/lib/DPCT/GenHelperFunction.h            |   1 +
 clang/lib/DPCT/HeaderTypes.inc                |   1 +
 clang/runtime/dpct-rt/CMakeLists.txt          |   1 +
 .../dpct/dpl_extras/dpcpp_extensions.h        | 514 -----------------
 .../dpct-rt/include/dpct/dpl_utils.hpp        |   1 +
 .../dpct-rt/include/dpct/group_utils.hpp      | 533 ++++++++++++++++++
 clang/test/dpct/check_header_files.cpp        |   5 +
 9 files changed, 548 insertions(+), 514 deletions(-)
 create mode 100644 clang/runtime/dpct-rt/include/dpct/group_utils.hpp

diff --git a/clang/lib/DPCT/CMakeLists.txt b/clang/lib/DPCT/CMakeLists.txt
index 8a827847b90c..f168bf409e53 100644
--- a/clang/lib/DPCT/CMakeLists.txt
+++ b/clang/lib/DPCT/CMakeLists.txt
@@ -22,6 +22,7 @@ set(RUNTIME_HEADERS
   ${CMAKE_SOURCE_DIR}/../clang/runtime/dpct-rt/include/dpct/sparse_utils.hpp
   ${CMAKE_SOURCE_DIR}/../clang/runtime/dpct-rt/include/dpct/fft_utils.hpp
   ${CMAKE_SOURCE_DIR}/../clang/runtime/dpct-rt/include/dpct/lapack_utils.hpp
+  ${CMAKE_SOURCE_DIR}/../clang/runtime/dpct-rt/include/dpct/group_utils.hpp
   ${CMAKE_SOURCE_DIR}/../clang/runtime/dpct-rt/include/dpct/dpl_extras/algorithm.h
   ${CMAKE_SOURCE_DIR}/../clang/runtime/dpct-rt/include/dpct/dpl_extras/functional.h
   ${CMAKE_SOURCE_DIR}/../clang/runtime/dpct-rt/include/dpct/dpl_extras/iterators.h
@@ -58,6 +59,7 @@ set(PROCESS_FILES_OUTPUT
   ${CMAKE_BINARY_DIR}/tools/clang/include/clang/DPCT/sparse_utils.hpp.inc
   ${CMAKE_BINARY_DIR}/tools/clang/include/clang/DPCT/fft_utils.hpp.inc
   ${CMAKE_BINARY_DIR}/tools/clang/include/clang/DPCT/lapack_utils.hpp.inc
+  ${CMAKE_BINARY_DIR}/tools/clang/include/clang/DPCT/group_utils.hpp.inc
 )
 
 add_custom_command(
diff --git a/clang/lib/DPCT/GenHelperFunction.cpp b/clang/lib/DPCT/GenHelperFunction.cpp
index 32ca482a1d12..688252a56164 100644
--- a/clang/lib/DPCT/GenHelperFunction.cpp
+++ b/clang/lib/DPCT/GenHelperFunction.cpp
@@ -73,6 +73,9 @@ const std::string FftUtilsAllContentStr =
 const std::string LapackUtilsAllContentStr =
 #include "clang/DPCT/lapack_utils.hpp.inc"
     ;
+const std::string GroupUtilsAllContentStr =
+#include "clang/DPCT/group_utils.hpp.inc"
+    ;
 const std::string DplExtrasAlgorithmAllContentStr =
 #include "clang/DPCT/dpl_extras/algorithm.h.inc"
     ;
@@ -162,6 +165,7 @@ void genHelperFunction(const clang::tooling::UnifiedPath &OutRoot) {
   GENERATE_ALL_FILE_CONTENT(SparseUtils, ".", sparse_utils.hpp)
   GENERATE_ALL_FILE_CONTENT(FftUtils, ".", fft_utils.hpp)
   GENERATE_ALL_FILE_CONTENT(LapackUtils, ".", lapack_utils.hpp)
+  GENERATE_ALL_FILE_CONTENT(GroupUtils, ".", group_utils.hpp)
   GENERATE_ALL_FILE_CONTENT(CodePin, "codepin", codepin.hpp)
   GENERATE_ALL_FILE_CONTENT(CodePinSerializationBasic, "codepin/serialization",
                             basic.hpp)
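Note for reviewers: the *.hpp.inc files consumed above are generated from the
runtime headers by the PROCESS_FILES_OUTPUT custom command in
clang/lib/DPCT/CMakeLists.txt; each one wraps the header text in a string
literal, so the bare #include completes a string definition. A minimal sketch
of the pattern, assuming a raw-string delimiter and layout chosen purely for
illustration (the real generator's output format is not shown in this patch):

#include <string>

// Hypothetical contents of the generated group_utils.hpp.inc (illustrative):
//
//   R"DPCT_DELIM(
//   ...full text of group_utils.hpp...
//   )DPCT_DELIM"
//
// After preprocessing, the definition in GenHelperFunction.cpp reduces to an
// ordinary string constant that embeds the whole helper header in the tool:
const std::string GroupUtilsAllContentStr = R"DPCT_DELIM(
...full text of group_utils.hpp...
)DPCT_DELIM";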
diff --git a/clang/lib/DPCT/GenHelperFunction.h b/clang/lib/DPCT/GenHelperFunction.h
index 6f3718695cfb..d0262848172d 100644
--- a/clang/lib/DPCT/GenHelperFunction.h
+++ b/clang/lib/DPCT/GenHelperFunction.h
@@ -32,6 +32,7 @@ extern const std::string CclUtilsAllContentStr;
 extern const std::string SparseUtilsAllContentStr;
 extern const std::string FftUtilsAllContentStr;
 extern const std::string LapackUtilsAllContentStr;
+extern const std::string GroupUtilsAllContentStr;
 extern const std::string DplExtrasAlgorithmAllContentStr;
 extern const std::string DplExtrasFunctionalAllContentStr;
 extern const std::string DplExtrasIteratorsAllContentStr;
diff --git a/clang/lib/DPCT/HeaderTypes.inc b/clang/lib/DPCT/HeaderTypes.inc
index 00050957ff81..e62256cc320d 100644
--- a/clang/lib/DPCT/HeaderTypes.inc
+++ b/clang/lib/DPCT/HeaderTypes.inc
@@ -65,6 +65,7 @@ DPCT_HEADER(CCL_Utils, "<dpct/ccl_utils.hpp>")
 DPCT_HEADER(BLAS_Utils, "<dpct/blas_utils.hpp>")
 DPCT_HEADER(DNNL_Utils, "<dpct/dnnl_utils.hpp>")
 DPCT_HEADER(LAPACK_Utils, "<dpct/lapack_utils.hpp>")
+DPCT_HEADER(GROUP_Utils, "<dpct/group_utils.hpp>")
 DPCT_HEADER(COMMON_Utils, "<dpct/dpct.hpp>")
 DPCT_HEADER(Atomic, "<dpct/atomic.hpp>")
 DPCT_HEADER(SPBLAS_Utils, "<dpct/sparse_utils.hpp>")
diff --git a/clang/runtime/dpct-rt/CMakeLists.txt b/clang/runtime/dpct-rt/CMakeLists.txt
index 63e9b630d030..50bfff5f055a 100644
--- a/clang/runtime/dpct-rt/CMakeLists.txt
+++ b/clang/runtime/dpct-rt/CMakeLists.txt
@@ -17,6 +17,7 @@ set(dpct_rt_files
   include/dpct/sparse_utils.hpp
   include/dpct/fft_utils.hpp
   include/dpct/lapack_utils.hpp
+  include/dpct/group_utils.hpp
 )
 
 set(dpct_rt_dpstd_files
diff --git a/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h b/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h
index 15b0b1102290..d8472c5c275e 100644
--- a/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h
+++ b/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h
@@ -149,520 +149,6 @@ exclusive_scan(const Item &item, T input, BinaryOperation binary_op,
   return output;
 }
 
-namespace detail {
-
-typedef uint16_t digit_counter_type;
-typedef uint32_t packed_counter_type;
-
-template <int N, int CURRENT_VAL = N, int COUNT = 0> struct log2 {
-  enum { VALUE = log2<N, (CURRENT_VAL >> 1), COUNT + 1>::VALUE };
-};
-
-template <int N, int COUNT> struct log2<N, 0, COUNT> {
-  enum { VALUE = (1 << (COUNT - 1) < N) ? COUNT : COUNT - 1 };
-};
-
-template <int RADIX_BITS, bool DESCENDING = false> class radix_rank {
-public:
-  static size_t get_local_memory_size(size_t group_threads) {
-    return group_threads * PADDED_COUNTER_LANES * sizeof(packed_counter_type);
-  }
-
-  radix_rank(uint8_t *local_memory) : _local_memory(local_memory) {}
-
-  template <typename Item, int VALUES_PER_THREAD>
-  __dpct_inline__ void
-  rank_keys(const Item &item, uint32_t (&keys)[VALUES_PER_THREAD],
-            int (&ranks)[VALUES_PER_THREAD], int current_bit, int num_bits) {
-
-    digit_counter_type thread_prefixes[VALUES_PER_THREAD];
-    digit_counter_type *digit_counters[VALUES_PER_THREAD];
-    digit_counter_type *buffer =
-        reinterpret_cast<digit_counter_type *>(_local_memory);
-
-    reset_local_memory(item);
-
-    item.barrier(sycl::access::fence_space::local_space);
-
-#pragma unroll
-    for (int i = 0; i < VALUES_PER_THREAD; ++i) {
-      uint32_t digit = ::dpct::bfe(keys[i], current_bit, num_bits);
-      uint32_t sub_counter = digit >> LOG_COUNTER_LANES;
-      uint32_t counter_lane = digit & (COUNTER_LANES - 1);
-
-      if (DESCENDING) {
-        sub_counter = PACKING_RATIO - 1 - sub_counter;
-        counter_lane = COUNTER_LANES - 1 - counter_lane;
-      }
-
-      digit_counters[i] =
-          &buffer[counter_lane * item.get_local_range().size() * PACKING_RATIO +
-                  item.get_local_linear_id() * PACKING_RATIO + sub_counter];
-      thread_prefixes[i] = *digit_counters[i];
-      *digit_counters[i] = thread_prefixes[i] + 1;
-    }
-
-    item.barrier(sycl::access::fence_space::local_space);
-
-    scan_counters(item);
-
-    item.barrier(sycl::access::fence_space::local_space);
-
-    for (int i = 0; i < VALUES_PER_THREAD; ++i) {
-      ranks[i] = thread_prefixes[i] + *digit_counters[i];
-    }
-  }
-
-private:
-  template <typename Item>
-  __dpct_inline__ void reset_local_memory(const Item &item) {
-    packed_counter_type *ptr =
-        reinterpret_cast<packed_counter_type *>(_local_memory);
-
-#pragma unroll
-    for (int i = 0; i < PADDED_COUNTER_LANES; ++i) {
-      ptr[i * item.get_local_range().size() + item.get_local_linear_id()] = 0;
-    }
-  }
-
-  template <typename Item>
-  __dpct_inline__ packed_counter_type upsweep(const Item &item) {
-    packed_counter_type sum = 0;
-    packed_counter_type *ptr =
-        reinterpret_cast<packed_counter_type *>(_local_memory);
-
-#pragma unroll
-    for (int i = 0; i < PADDED_COUNTER_LANES; i++) {
-      cached_segment[i] =
-          ptr[item.get_local_linear_id() * PADDED_COUNTER_LANES + i];
-    }
-
-#pragma unroll
-    for (int i = 0; i < PADDED_COUNTER_LANES; ++i) {
-      sum += cached_segment[i];
-    }
-
-    return sum;
-  }
-
-  template <typename Item>
-  __dpct_inline__ void
-  exclusive_downsweep(const Item &item, packed_counter_type raking_partial) {
-    packed_counter_type *ptr =
-        reinterpret_cast<packed_counter_type *>(_local_memory);
-    packed_counter_type sum = raking_partial;
-
-#pragma unroll
-    for (int i = 0; i < PADDED_COUNTER_LANES; ++i) {
-      packed_counter_type value = cached_segment[i];
-      cached_segment[i] = sum;
-      sum += value;
-    }
-
-#pragma unroll
-    for (int i = 0; i < PADDED_COUNTER_LANES; ++i) {
-      ptr[item.get_local_linear_id() * PADDED_COUNTER_LANES + i] =
-          cached_segment[i];
-    }
-  }
-
-  struct prefix_callback {
-    __dpct_inline__ packed_counter_type
-    operator()(packed_counter_type block_aggregate) {
-      packed_counter_type block_prefix = 0;
-
-#pragma unroll
-      for (int packed = 1; packed < PACKING_RATIO; packed++) {
-        block_prefix += block_aggregate
-                        << (sizeof(digit_counter_type) * 8 * packed);
-      }
-
-      return block_prefix;
-    }
-  };
-
-  template <typename Item>
-  __dpct_inline__ void scan_counters(const Item &item) {
-    packed_counter_type raking_partial = upsweep(item);
-
-    prefix_callback callback;
-    packed_counter_type exclusive_partial = exclusive_scan(
-        item, raking_partial, sycl::ext::oneapi::plus<packed_counter_type>(),
-        callback);
-
-    exclusive_downsweep(item, exclusive_partial);
-  }
-
-private:
-  static constexpr int PACKING_RATIO =
-      sizeof(packed_counter_type) / sizeof(digit_counter_type);
-  static constexpr int LOG_PACKING_RATIO = log2<PACKING_RATIO>::VALUE;
-  static constexpr int LOG_COUNTER_LANES = RADIX_BITS - LOG_PACKING_RATIO;
-  static constexpr int COUNTER_LANES = 1 << LOG_COUNTER_LANES;
-  static constexpr int PADDED_COUNTER_LANES = COUNTER_LANES + 1;
-
-  packed_counter_type cached_segment[PADDED_COUNTER_LANES];
-  uint8_t *_local_memory;
-};
-
-template <typename T, typename U> struct base_traits {
-
-  static __dpct_inline__ U twiddle_in(U key) {
-    throw std::runtime_error("Not implemented");
-  }
-  static __dpct_inline__ U twiddle_out(U key) {
-    throw std::runtime_error("Not implemented");
-  }
-};
-
-template <typename U> struct base_traits<uint32_t, U> {
-  static __dpct_inline__ U twiddle_in(U key) { return key; }
-  static __dpct_inline__ U twiddle_out(U key) { return key; }
-};
-
-template <typename U> struct base_traits<int, U> {
-  static constexpr U HIGH_BIT = U(1) << ((sizeof(U) * 8) - 1);
-  static __dpct_inline__ U twiddle_in(U key) { return key ^ HIGH_BIT; }
-  static __dpct_inline__ U twiddle_out(U key) { return key ^ HIGH_BIT; }
-};
-
-template <typename U> struct base_traits<float, U> {
-  static constexpr U HIGH_BIT = U(1) << ((sizeof(U) * 8) - 1);
-  static __dpct_inline__ U twiddle_in(U key) {
-    U mask = (key & HIGH_BIT) ? U(-1) : HIGH_BIT;
-    return key ^ mask;
-  }
-  static __dpct_inline__ U twiddle_out(U key) {
-    U mask = (key & HIGH_BIT) ? HIGH_BIT : U(-1);
-    return key ^ mask;
-  }
-};
-
-template <typename T> struct traits : base_traits<T, T> {};
-template <> struct traits<uint32_t> : base_traits<uint32_t, uint32_t> {};
-template <> struct traits<int> : base_traits<int, uint32_t> {};
-template <> struct traits<float> : base_traits<float, uint32_t> {};
-
-} // namespace detail
-
-namespace detail {
-
-template <int N> struct power_of_two {
-  enum { VALUE = ((N & (N - 1)) == 0) };
-};
-
-__dpct_inline__ uint32_t shr_add(uint32_t x, uint32_t shift, uint32_t addend) {
-  return (x >> shift) + addend;
-}
-
-} // namespace detail
-
-/// Implements the scatter-to-blocked, blocked-to-striped, and
-/// striped-to-blocked exchange patterns used in the radix sort algorithm.
-///
-/// \tparam T type of the data elements exchanged
-/// \tparam VALUES_PER_THREAD number of data elements assigned to a thread
-template <typename T, int VALUES_PER_THREAD> class exchange {
-public:
-  static size_t get_local_memory_size(size_t group_threads) {
-    size_t padding_values =
-        (INSERT_PADDING)
-            ? ((group_threads * VALUES_PER_THREAD) >> LOG_LOCAL_MEMORY_BANKS)
-            : 0;
-    return (group_threads * VALUES_PER_THREAD + padding_values) * sizeof(T);
-  }
-
-  exchange(uint8_t *local_memory) : _local_memory(local_memory) {}
-
-  // TODO: Investigate if padding is required for performance,
-  // and if specializations are required for specific target hardware.
-  static size_t adjust_by_padding(size_t offset) {
-
-    if constexpr (INSERT_PADDING) {
-      offset = detail::shr_add(offset, LOG_LOCAL_MEMORY_BANKS, offset);
-    }
-    return offset;
-  }
-
-  struct blocked_offset {
-    template <typename Item> size_t operator()(Item item, size_t i) {
-      size_t offset = item.get_local_linear_id() * VALUES_PER_THREAD + i;
-      return adjust_by_padding(offset);
-    }
-  };
-
-  struct striped_offset {
-    template <typename Item> size_t operator()(Item item, size_t i) {
-      size_t offset = i * item.get_local_range(2) * item.get_local_range(1) *
-                          item.get_local_range(0) +
-                      item.get_local_linear_id();
-      return adjust_by_padding(offset);
-    }
-  };
-
-  template <typename Iterator> struct scatter_offset {
-    Iterator begin;
-    scatter_offset(const int (&ranks)[VALUES_PER_THREAD]) {
-      begin = std::begin(ranks);
-    }
-    template <typename Item> size_t operator()(Item item, size_t i) const {
-      // index i is expected to be within bounds [0, VALUES_PER_THREAD)
-      return adjust_by_padding(begin[i]);
-    }
-  };
-
-  template <typename Item, typename offsetFunctorTypeFW,
-            typename offsetFunctorTypeRV>
-  __dpct_inline__ void helper_exchange(Item item, T (&keys)[VALUES_PER_THREAD],
-                                       offsetFunctorTypeFW &offset_functor_fw,
-                                       offsetFunctorTypeRV &offset_functor_rv) {
-
-    T *buffer = reinterpret_cast<T *>(_local_memory);
-
-#pragma unroll
-    for (size_t i = 0; i < VALUES_PER_THREAD; i++) {
-      size_t offset = offset_functor_fw(item, i);
-      buffer[offset] = keys[i];
-    }
-
-    item.barrier(sycl::access::fence_space::local_space);
-
-#pragma unroll
-    for (size_t i = 0; i < VALUES_PER_THREAD; i++) {
-      size_t offset = offset_functor_rv(item, i);
-      keys[i] = buffer[offset];
-    }
-  }
-
-  /// Rearrange elements from blocked order to striped order
-  template <typename Item>
-  __dpct_inline__ void blocked_to_striped(Item item,
-                                          T (&keys)[VALUES_PER_THREAD]) {
-
-    striped_offset get_striped_offset;
-    blocked_offset get_blocked_offset;
-    helper_exchange(item, keys, get_blocked_offset, get_striped_offset);
-  }
-
-  /// Rearrange elements from striped order to blocked order
-  template <typename Item>
-  __dpct_inline__ void striped_to_blocked(Item item,
-                                          T (&keys)[VALUES_PER_THREAD]) {
-
-    blocked_offset get_blocked_offset;
-    striped_offset get_striped_offset;
-    helper_exchange(item, keys, get_striped_offset, get_blocked_offset);
-  }
-
-  /// Rearrange elements from rank order to blocked order
-  template <typename Item>
-  __dpct_inline__ void scatter_to_blocked(Item item,
-                                          T (&keys)[VALUES_PER_THREAD],
-                                          int (&ranks)[VALUES_PER_THREAD]) {
-
-    scatter_offset<const int *> get_scatter_offset(ranks);
-    blocked_offset get_blocked_offset;
-    helper_exchange(item, keys, get_scatter_offset, get_blocked_offset);
-  }
-
-  /// Rearrange elements from scatter order to striped order
-  template <typename Item>
-  __dpct_inline__ void scatter_to_striped(Item item,
-                                          T (&keys)[VALUES_PER_THREAD],
-                                          int (&ranks)[VALUES_PER_THREAD]) {
-
-    scatter_offset<const int *> get_scatter_offset(ranks);
-    striped_offset get_striped_offset;
-    helper_exchange(item, keys, get_scatter_offset, get_striped_offset);
-  }
-
-private:
-  static constexpr int LOG_LOCAL_MEMORY_BANKS = 4;
-  static constexpr bool INSERT_PADDING =
-      (VALUES_PER_THREAD > 4) &&
-      (detail::power_of_two<VALUES_PER_THREAD>::VALUE);
-
-  uint8_t *_local_memory;
-};
-
-/// Implements radix sort to sort integer data elements assigned to all threads
-/// in the group.
-///
-/// \tparam T type of the data elements exchanged
-/// \tparam VALUES_PER_THREAD number of data elements assigned to a thread
-/// \tparam DESCENDING boolean value indicating if data elements are sorted in
-/// descending order.
-template <typename T, int VALUES_PER_THREAD, bool DESCENDING = false>
-class radix_sort {
-public:
-  static size_t get_local_memory_size(size_t group_threads) {
-    size_t ranks_size =
-        detail::radix_rank<RADIX_BITS>::get_local_memory_size(group_threads);
-    size_t exchange_size =
-        exchange<T, VALUES_PER_THREAD>::get_local_memory_size(group_threads);
-    return sycl::max(ranks_size, exchange_size);
-  }
-
-  radix_sort(uint8_t *local_memory) : _local_memory(local_memory) {}
-
-  template <typename Item>
-  __dpct_inline__ void
-  helper_sort(const Item &item, T (&keys)[VALUES_PER_THREAD], int begin_bit = 0,
-              int end_bit = 8 * sizeof(T), bool is_striped = false) {
-
-    uint32_t(&unsigned_keys)[VALUES_PER_THREAD] =
-        reinterpret_cast<uint32_t(&)[VALUES_PER_THREAD]>(keys);
-
-#pragma unroll
-    for (int i = 0; i < VALUES_PER_THREAD; ++i) {
-      unsigned_keys[i] = detail::traits<T>::twiddle_in(unsigned_keys[i]);
-    }
-
-    for (int i = begin_bit; i < end_bit; i += RADIX_BITS) {
-      int pass_bits = sycl::min(RADIX_BITS, end_bit - begin_bit);
-
-      int ranks[VALUES_PER_THREAD];
-      detail::radix_rank<RADIX_BITS, DESCENDING>(_local_memory)
-          .template rank_keys<Item, VALUES_PER_THREAD>(item, unsigned_keys,
-                                                       ranks, i, pass_bits);
-
-      item.barrier(sycl::access::fence_space::local_space);
-
-      bool last_iter = i + RADIX_BITS > end_bit;
-      if (last_iter && is_striped) {
-        exchange<T, VALUES_PER_THREAD>(_local_memory)
-            .scatter_to_striped(item, keys, ranks);
-
-      } else {
-        exchange<T, VALUES_PER_THREAD>(_local_memory)
-            .scatter_to_blocked(item, keys, ranks);
-      }
-
-      item.barrier(sycl::access::fence_space::local_space);
-    }
-
-#pragma unroll
-    for (int i = 0; i < VALUES_PER_THREAD; ++i) {
-      unsigned_keys[i] = detail::traits<T>::twiddle_out(unsigned_keys[i]);
-    }
-  }
-
-  template <typename Item>
-  __dpct_inline__ void
-  sort_blocked(const Item &item, T (&keys)[VALUES_PER_THREAD],
-               int begin_bit = 0, int end_bit = 8 * sizeof(T)) {
-    helper_sort(item, keys, begin_bit, end_bit, false);
-  }
-
-  template <typename Item>
-  __dpct_inline__ void
-  sort_blocked_to_striped(const Item &item, T (&keys)[VALUES_PER_THREAD],
-                          int begin_bit = 0, int end_bit = 8 * sizeof(T)) {
-    helper_sort(item, keys, begin_bit, end_bit, true);
-  }
-
-private:
-  static constexpr int RADIX_BITS = 4;
-
-  uint8_t *_local_memory;
-};
-
-/// Loads a linear segment of items into a blocked arrangement across threads.
-/// Helper for Block Load
-enum load_algorithm {
-
-  BLOCK_LOAD_DIRECT,
-  BLOCK_LOAD_STRIPED,
-  // To-do: BLOCK_LOAD_WARP_TRANSPOSE
-
-};
-
-// Loads a linear segment of workgroup items into a blocked arrangement.
-template <size_t ITEMS_PER_WORK_ITEM, typename InputT, typename InputIteratorT,
-          typename Item>
-__dpct_inline__ void load_blocked(const Item &item, InputIteratorT block_itr,
-                                  InputT (&items)[ITEMS_PER_WORK_ITEM]) {
-
-  // This implementation does not take into account range loading across
-  // workgroup items. To-do: Decide whether range loading is required for
-  // group loading.
-  size_t linear_tid = item.get_local_linear_id();
-  uint32_t workgroup_offset = linear_tid * ITEMS_PER_WORK_ITEM;
-#pragma unroll
-  for (size_t idx = 0; idx < ITEMS_PER_WORK_ITEM; idx++) {
-    items[idx] = block_itr[workgroup_offset + idx];
-  }
-}
-
-// Loads a linear segment of workgroup items into a striped arrangement.
-template <size_t ITEMS_PER_WORK_ITEM, typename InputT, typename InputIteratorT,
-          typename Item>
-__dpct_inline__ void load_striped(const Item &item, InputIteratorT block_itr,
-                                  InputT (&items)[ITEMS_PER_WORK_ITEM]) {
-
-  // This implementation does not take into account range loading across
-  // workgroup items. To-do: Decide whether range loading is required for
-  // group loading.
-  size_t linear_tid = item.get_local_linear_id();
-  size_t group_work_items = item.get_local_range().size();
-#pragma unroll
-  for (size_t idx = 0; idx < ITEMS_PER_WORK_ITEM; idx++) {
-    items[idx] = block_itr[linear_tid + (idx * group_work_items)];
-  }
-}
-
-// Loads a linear segment of workgroup items into a subgroup striped
-// arrangement. Created as free function until exchange mechanism is
-// implemented.
-// To-do: inline this function with BLOCK_LOAD_WARP_TRANSPOSE mechanism
-template <size_t ITEMS_PER_WORK_ITEM, typename InputT, typename InputIteratorT,
-          typename Item>
-__dpct_inline__ void
-uninitialized_load_subgroup_striped(const Item &item, InputIteratorT block_itr,
-                                    InputT (&items)[ITEMS_PER_WORK_ITEM]) {
-
-  // This implementation does not take into account range loading across
-  // workgroup items. To-do: Decide whether range loading is required for
-  // group loading.
-  // This implementation uses uninitialized memory for loading linear segments
-  // into warp striped arrangement.
-  uint32_t subgroup_offset = item.get_sub_group().get_local_linear_id();
-  uint32_t subgroup_size = item.get_sub_group().get_local_linear_range();
-  uint32_t subgroup_idx = item.get_sub_group().get_group_linear_id();
-  uint32_t initial_offset =
-      (subgroup_idx * ITEMS_PER_WORK_ITEM * subgroup_size) + subgroup_offset;
-#pragma unroll
-  for (size_t idx = 0; idx < ITEMS_PER_WORK_ITEM; idx++) {
-    new (&items[idx]) InputT(block_itr[initial_offset + (idx * subgroup_size)]);
-  }
-}
-
-// Template parameters:
-// ITEMS_PER_WORK_ITEM: size_t variable controlling the number of items per
-// thread/work_item
-// ALGORITHM: load_algorithm variable controlling the type of load operation.
-// InputT: type for input sequence.
-// InputIteratorT: input iterator type
-// Item: typename parameter resembling sycl::nd_item<3>.
-template <size_t ITEMS_PER_WORK_ITEM, load_algorithm ALGORITHM, typename InputT,
-          typename InputIteratorT, typename Item>
-class workgroup_load {
-public:
-  static size_t get_local_memory_size(size_t group_work_items) { return 0; }
-  workgroup_load(uint8_t *local_memory) : _local_memory(local_memory) {}
-
-  __dpct_inline__ void load(const Item &item, InputIteratorT block_itr,
-                            InputT (&items)[ITEMS_PER_WORK_ITEM]) {
-
-    if constexpr (ALGORITHM == BLOCK_LOAD_DIRECT) {
-      load_blocked(item, block_itr, items);
-    } else if constexpr (ALGORITHM == BLOCK_LOAD_STRIPED) {
-      load_striped(item, block_itr, items);
-    }
-  }
-
-private:
-  uint8_t *_local_memory;
-};
-
 /// Perform a reduction of the data elements assigned to all threads in the
 /// group.
 ///
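The move is NFC: everything removed above reappears under the same dpct::group
namespace in the new header, so existing call sites compile unchanged. For
reviewers who want to sanity-check the relocated API, a minimal usage sketch
follows; the queue, the USM pointer `data`, the helper name `sort_tile`, and
the kernel scaffolding are illustrative assumptions, not part of this patch:

#include <sycl/sycl.hpp>
#include <dpct/dpct.hpp>
#include <dpct/dpl_utils.hpp> // now pulls in group_utils.hpp, see next hunk

constexpr int VALUES_PER_THREAD = 4;
using tile_sorter = dpct::group::radix_sort<int, VALUES_PER_THREAD>;

// One work-group cooperatively sorts group_size * VALUES_PER_THREAD keys.
void sort_tile(sycl::queue &q, int *data, size_t group_size) {
  size_t slm_bytes = tile_sorter::get_local_memory_size(group_size);
  q.submit([&](sycl::handler &cgh) {
    sycl::local_accessor<uint8_t, 1> slm(sycl::range<1>(slm_bytes), cgh);
    cgh.parallel_for(
        sycl::nd_range<3>(sycl::range<3>(1, 1, group_size),
                          sycl::range<3>(1, 1, group_size)),
        [=](sycl::nd_item<3> item) {
          int keys[VALUES_PER_THREAD];
          // Each work-item loads a contiguous run of keys (blocked order).
          dpct::group::load_blocked(item, data, keys);
          // Cooperative radix sort over the whole tile; keys stay blocked.
          tile_sorter(&slm[0]).sort_blocked(item, keys);
          // Store back in blocked order.
          size_t base = item.get_local_linear_id() * VALUES_PER_THREAD;
          for (int i = 0; i < VALUES_PER_THREAD; ++i)
            data[base + i] = keys[i];
        });
  });
}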
diff --git a/clang/runtime/dpct-rt/include/dpct/dpl_utils.hpp b/clang/runtime/dpct-rt/include/dpct/dpl_utils.hpp
index 1440b4250ece..efc5940c6665 100644
--- a/clang/runtime/dpct-rt/include/dpct/dpl_utils.hpp
+++ b/clang/runtime/dpct-rt/include/dpct/dpl_utils.hpp
@@ -23,6 +23,7 @@
 #include "dpl_extras/vector.h"
 
 #include "dpl_extras/dpcpp_extensions.h"
+#include "group_utils.hpp"
 // Only include iterator adaptor (and therefore boost) if necessary
 #ifdef ITERATOR_ADAPTOR_REQUIRED
 #include "dpl_extras/iterator_adaptor.h"
diff --git a/clang/runtime/dpct-rt/include/dpct/group_utils.hpp b/clang/runtime/dpct-rt/include/dpct/group_utils.hpp
new file mode 100644
index 000000000000..9f0be34ab87f
--- /dev/null
+++ b/clang/runtime/dpct-rt/include/dpct/group_utils.hpp
@@ -0,0 +1,533 @@
+//==---- group_utils.hpp ------------------*- C++ -*--------------------==//
+//
+// Copyright (C) Intel Corporation
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// See https://llvm.org/LICENSE.txt for license information.
+//
+//===------------------------------------------------------------------===//
+
+#ifndef __DPCT_GROUP_UTILS_HPP__
+#define __DPCT_GROUP_UTILS_HPP__
+
+#include <stdexcept>
+#include <sycl/sycl.hpp>
+
+#include "dpct.hpp"
+#include "dpl_extras/functional.h"
+
+namespace dpct {
+namespace group {
+
+namespace detail {
+
+typedef uint16_t digit_counter_type;
+typedef uint32_t packed_counter_type;
+
+template <int N, int CURRENT_VAL = N, int COUNT = 0> struct log2 {
+  enum { VALUE = log2<N, (CURRENT_VAL >> 1), COUNT + 1>::VALUE };
+};
+
+template <int N, int COUNT> struct log2<N, 0, COUNT> {
+  enum { VALUE = (1 << (COUNT - 1) < N) ? COUNT : COUNT - 1 };
+};
+
+template <int RADIX_BITS, bool DESCENDING = false> class radix_rank {
+public:
+  static size_t get_local_memory_size(size_t group_threads) {
+    return group_threads * PADDED_COUNTER_LANES * sizeof(packed_counter_type);
+  }
+
+  radix_rank(uint8_t *local_memory) : _local_memory(local_memory) {}
+
+  template <typename Item, int VALUES_PER_THREAD>
+  __dpct_inline__ void
+  rank_keys(const Item &item, uint32_t (&keys)[VALUES_PER_THREAD],
+            int (&ranks)[VALUES_PER_THREAD], int current_bit, int num_bits) {
+
+    digit_counter_type thread_prefixes[VALUES_PER_THREAD];
+    digit_counter_type *digit_counters[VALUES_PER_THREAD];
+    digit_counter_type *buffer =
+        reinterpret_cast<digit_counter_type *>(_local_memory);
+
+    reset_local_memory(item);
+
+    item.barrier(sycl::access::fence_space::local_space);
+
+#pragma unroll
+    for (int i = 0; i < VALUES_PER_THREAD; ++i) {
+      uint32_t digit = ::dpct::bfe(keys[i], current_bit, num_bits);
+      uint32_t sub_counter = digit >> LOG_COUNTER_LANES;
+      uint32_t counter_lane = digit & (COUNTER_LANES - 1);
+
+      if (DESCENDING) {
+        sub_counter = PACKING_RATIO - 1 - sub_counter;
+        counter_lane = COUNTER_LANES - 1 - counter_lane;
+      }
+
+      digit_counters[i] =
+          &buffer[counter_lane * item.get_local_range().size() * PACKING_RATIO +
+                  item.get_local_linear_id() * PACKING_RATIO + sub_counter];
+      thread_prefixes[i] = *digit_counters[i];
+      *digit_counters[i] = thread_prefixes[i] + 1;
+    }
+
+    item.barrier(sycl::access::fence_space::local_space);
+
+    scan_counters(item);
+
+    item.barrier(sycl::access::fence_space::local_space);
+
+    for (int i = 0; i < VALUES_PER_THREAD; ++i) {
+      ranks[i] = thread_prefixes[i] + *digit_counters[i];
+    }
+  }
+
+private:
+  template <typename Item>
+  __dpct_inline__ void reset_local_memory(const Item &item) {
+    packed_counter_type *ptr =
+        reinterpret_cast<packed_counter_type *>(_local_memory);
+
+#pragma unroll
+    for (int i = 0; i < PADDED_COUNTER_LANES; ++i) {
+      ptr[i * item.get_local_range().size() + item.get_local_linear_id()] = 0;
+    }
+  }
+
+  template <typename Item>
+  __dpct_inline__ packed_counter_type upsweep(const Item &item) {
+    packed_counter_type sum = 0;
+    packed_counter_type *ptr =
+        reinterpret_cast<packed_counter_type *>(_local_memory);
+
+#pragma unroll
+    for (int i = 0; i < PADDED_COUNTER_LANES; i++) {
+      cached_segment[i] =
+          ptr[item.get_local_linear_id() * PADDED_COUNTER_LANES + i];
+    }
+
+#pragma unroll
+    for (int i = 0; i < PADDED_COUNTER_LANES; ++i) {
+      sum += cached_segment[i];
+    }
+
+    return sum;
+  }
+
+  template <typename Item>
+  __dpct_inline__ void exclusive_downsweep(const Item &item,
+                                           packed_counter_type raking_partial) {
+    packed_counter_type *ptr =
+        reinterpret_cast<packed_counter_type *>(_local_memory);
+    packed_counter_type sum = raking_partial;
+
+#pragma unroll
+    for (int i = 0; i < PADDED_COUNTER_LANES; ++i) {
+      packed_counter_type value = cached_segment[i];
+      cached_segment[i] = sum;
+      sum += value;
+    }
+
+#pragma unroll
+    for (int i = 0; i < PADDED_COUNTER_LANES; ++i) {
+      ptr[item.get_local_linear_id() * PADDED_COUNTER_LANES + i] =
+          cached_segment[i];
+    }
+  }
+
+  struct prefix_callback {
+    __dpct_inline__ packed_counter_type
+    operator()(packed_counter_type block_aggregate) {
+      packed_counter_type block_prefix = 0;
+
+#pragma unroll
+      for (int packed = 1; packed < PACKING_RATIO; packed++) {
+        block_prefix += block_aggregate
+                        << (sizeof(digit_counter_type) * 8 * packed);
+      }
+
+      return block_prefix;
+    }
+  };
+
+  template <typename Item>
+  __dpct_inline__ void scan_counters(const Item &item) {
+    packed_counter_type raking_partial = upsweep(item);
+
+    prefix_callback callback;
+    packed_counter_type exclusive_partial = exclusive_scan(
+        item, raking_partial, sycl::ext::oneapi::plus<packed_counter_type>(),
+        callback);
+
+    exclusive_downsweep(item, exclusive_partial);
+  }
+
+private:
+  static constexpr int PACKING_RATIO =
+      sizeof(packed_counter_type) / sizeof(digit_counter_type);
+  static constexpr int LOG_PACKING_RATIO = log2<PACKING_RATIO>::VALUE;
+  static constexpr int LOG_COUNTER_LANES = RADIX_BITS - LOG_PACKING_RATIO;
+  static constexpr int COUNTER_LANES = 1 << LOG_COUNTER_LANES;
+  static constexpr int PADDED_COUNTER_LANES = COUNTER_LANES + 1;
+
+  packed_counter_type cached_segment[PADDED_COUNTER_LANES];
+  uint8_t *_local_memory;
+};
+
+template <typename T, typename U> struct base_traits {
+
+  static __dpct_inline__ U twiddle_in(U key) {
+    throw std::runtime_error("Not implemented");
+  }
+  static __dpct_inline__ U twiddle_out(U key) {
+    throw std::runtime_error("Not implemented");
+  }
+};
+
+template <typename U> struct base_traits<uint32_t, U> {
+  static __dpct_inline__ U twiddle_in(U key) { return key; }
+  static __dpct_inline__ U twiddle_out(U key) { return key; }
+};
+
+template <typename U> struct base_traits<int, U> {
+  static constexpr U HIGH_BIT = U(1) << ((sizeof(U) * 8) - 1);
+  static __dpct_inline__ U twiddle_in(U key) { return key ^ HIGH_BIT; }
+  static __dpct_inline__ U twiddle_out(U key) { return key ^ HIGH_BIT; }
+};
+
+template <typename U> struct base_traits<float, U> {
+  static constexpr U HIGH_BIT = U(1) << ((sizeof(U) * 8) - 1);
+  static __dpct_inline__ U twiddle_in(U key) {
+    U mask = (key & HIGH_BIT) ? U(-1) : HIGH_BIT;
+    return key ^ mask;
+  }
+  static __dpct_inline__ U twiddle_out(U key) {
+    U mask = (key & HIGH_BIT) ? HIGH_BIT : U(-1);
+    return key ^ mask;
+  }
+};
+
+template <typename T> struct traits : base_traits<T, T> {};
+template <> struct traits<uint32_t> : base_traits<uint32_t, uint32_t> {};
+template <> struct traits<int> : base_traits<int, uint32_t> {};
+template <> struct traits<float> : base_traits<float, uint32_t> {};
+
+template <int N> struct power_of_two {
+  enum { VALUE = ((N & (N - 1)) == 0) };
+};
+
+__dpct_inline__ uint32_t shr_add(uint32_t x, uint32_t shift, uint32_t addend) {
+  return (x >> shift) + addend;
+}
+
+} // namespace detail
+
+/// Implements the scatter-to-blocked, blocked-to-striped, and
+/// striped-to-blocked exchange patterns used in the radix sort algorithm.
+///
+/// \tparam T type of the data elements exchanged
+/// \tparam VALUES_PER_THREAD number of data elements assigned to a thread
+template <typename T, int VALUES_PER_THREAD> class exchange {
+public:
+  static size_t get_local_memory_size(size_t group_threads) {
+    size_t padding_values =
+        (INSERT_PADDING)
+            ? ((group_threads * VALUES_PER_THREAD) >> LOG_LOCAL_MEMORY_BANKS)
+            : 0;
+    return (group_threads * VALUES_PER_THREAD + padding_values) * sizeof(T);
+  }
+
+  exchange(uint8_t *local_memory) : _local_memory(local_memory) {}
+
+  // TODO: Investigate if padding is required for performance,
+  // and if specializations are required for specific target hardware.
+  static size_t adjust_by_padding(size_t offset) {
+
+    if constexpr (INSERT_PADDING) {
+      offset = detail::shr_add(offset, LOG_LOCAL_MEMORY_BANKS, offset);
+    }
+    return offset;
+  }
+
+  struct blocked_offset {
+    template <typename Item> size_t operator()(Item item, size_t i) {
+      size_t offset = item.get_local_linear_id() * VALUES_PER_THREAD + i;
+      return adjust_by_padding(offset);
+    }
+  };
+
+  struct striped_offset {
+    template <typename Item> size_t operator()(Item item, size_t i) {
+      size_t offset = i * item.get_local_range(2) * item.get_local_range(1) *
+                          item.get_local_range(0) +
+                      item.get_local_linear_id();
+      return adjust_by_padding(offset);
+    }
+  };
+
+  template <typename Iterator> struct scatter_offset {
+    Iterator begin;
+    scatter_offset(const int (&ranks)[VALUES_PER_THREAD]) {
+      begin = std::begin(ranks);
+    }
+    template <typename Item> size_t operator()(Item item, size_t i) const {
+      // index i is expected to be within bounds [0, VALUES_PER_THREAD)
+      return adjust_by_padding(begin[i]);
+    }
+  };
+
+  template <typename Item, typename offsetFunctorTypeFW,
+            typename offsetFunctorTypeRV>
+  __dpct_inline__ void helper_exchange(Item item, T (&keys)[VALUES_PER_THREAD],
+                                       offsetFunctorTypeFW &offset_functor_fw,
+                                       offsetFunctorTypeRV &offset_functor_rv) {
+
+    T *buffer = reinterpret_cast<T *>(_local_memory);
+
+#pragma unroll
+    for (size_t i = 0; i < VALUES_PER_THREAD; i++) {
+      size_t offset = offset_functor_fw(item, i);
+      buffer[offset] = keys[i];
+    }
+
+    item.barrier(sycl::access::fence_space::local_space);
+
+#pragma unroll
+    for (size_t i = 0; i < VALUES_PER_THREAD; i++) {
+      size_t offset = offset_functor_rv(item, i);
+      keys[i] = buffer[offset];
+    }
+  }
+
+  /// Rearrange elements from blocked order to striped order
+  template <typename Item>
+  __dpct_inline__ void blocked_to_striped(Item item,
+                                          T (&keys)[VALUES_PER_THREAD]) {
+
+    striped_offset get_striped_offset;
+    blocked_offset get_blocked_offset;
+    helper_exchange(item, keys, get_blocked_offset, get_striped_offset);
+  }
+
+  /// Rearrange elements from striped order to blocked order
+  template <typename Item>
+  __dpct_inline__ void striped_to_blocked(Item item,
+                                          T (&keys)[VALUES_PER_THREAD]) {
+
+    blocked_offset get_blocked_offset;
+    striped_offset get_striped_offset;
+    helper_exchange(item, keys, get_striped_offset, get_blocked_offset);
+  }
+
+  /// Rearrange elements from rank order to blocked order
+  template <typename Item>
+  __dpct_inline__ void scatter_to_blocked(Item item,
+                                          T (&keys)[VALUES_PER_THREAD],
+                                          int (&ranks)[VALUES_PER_THREAD]) {
+
+    scatter_offset<const int *> get_scatter_offset(ranks);
+    blocked_offset get_blocked_offset;
+    helper_exchange(item, keys, get_scatter_offset, get_blocked_offset);
+  }
+
+  /// Rearrange elements from scatter order to striped order
+  template <typename Item>
+  __dpct_inline__ void scatter_to_striped(Item item,
+                                          T (&keys)[VALUES_PER_THREAD],
+                                          int (&ranks)[VALUES_PER_THREAD]) {
+
+    scatter_offset<const int *> get_scatter_offset(ranks);
+    striped_offset get_striped_offset;
+    helper_exchange(item, keys, get_scatter_offset, get_striped_offset);
+  }
+
+private:
+  static constexpr int LOG_LOCAL_MEMORY_BANKS = 4;
+  static constexpr bool INSERT_PADDING =
+      (VALUES_PER_THREAD > 4) &&
+      (detail::power_of_two<VALUES_PER_THREAD>::VALUE);
+
+  uint8_t *_local_memory;
+};
+
+/// Implements radix sort to sort integer data elements assigned to all threads
+/// in the group.
+///
+/// \tparam T type of the data elements exchanged
+/// \tparam VALUES_PER_THREAD number of data elements assigned to a thread
+/// \tparam DESCENDING boolean value indicating if data elements are sorted in
+/// descending order.
+template <typename T, int VALUES_PER_THREAD, bool DESCENDING = false>
+class radix_sort {
+public:
+  static size_t get_local_memory_size(size_t group_threads) {
+    size_t ranks_size =
+        detail::radix_rank<RADIX_BITS>::get_local_memory_size(group_threads);
+    size_t exchange_size =
+        exchange<T, VALUES_PER_THREAD>::get_local_memory_size(group_threads);
+    return sycl::max(ranks_size, exchange_size);
+  }
+
+  radix_sort(uint8_t *local_memory) : _local_memory(local_memory) {}
+
+  template <typename Item>
+  __dpct_inline__ void
+  helper_sort(const Item &item, T (&keys)[VALUES_PER_THREAD], int begin_bit = 0,
+              int end_bit = 8 * sizeof(T), bool is_striped = false) {
+
+    uint32_t(&unsigned_keys)[VALUES_PER_THREAD] =
+        reinterpret_cast<uint32_t(&)[VALUES_PER_THREAD]>(keys);
+
+#pragma unroll
+    for (int i = 0; i < VALUES_PER_THREAD; ++i) {
+      unsigned_keys[i] = detail::traits<T>::twiddle_in(unsigned_keys[i]);
+    }
+
+    for (int i = begin_bit; i < end_bit; i += RADIX_BITS) {
+      int pass_bits = sycl::min(RADIX_BITS, end_bit - begin_bit);
+
+      int ranks[VALUES_PER_THREAD];
+      detail::radix_rank<RADIX_BITS, DESCENDING>(_local_memory)
+          .template rank_keys<Item, VALUES_PER_THREAD>(item, unsigned_keys,
+                                                       ranks, i, pass_bits);
+
+      item.barrier(sycl::access::fence_space::local_space);
+
+      bool last_iter = i + RADIX_BITS > end_bit;
+      if (last_iter && is_striped) {
+        exchange<T, VALUES_PER_THREAD>(_local_memory)
+            .scatter_to_striped(item, keys, ranks);
+
+      } else {
+        exchange<T, VALUES_PER_THREAD>(_local_memory)
+            .scatter_to_blocked(item, keys, ranks);
+      }
+
+      item.barrier(sycl::access::fence_space::local_space);
+    }
+
+#pragma unroll
+    for (int i = 0; i < VALUES_PER_THREAD; ++i) {
+      unsigned_keys[i] = detail::traits<T>::twiddle_out(unsigned_keys[i]);
+    }
+  }
+
+  template <typename Item>
+  __dpct_inline__ void
+  sort_blocked(const Item &item, T (&keys)[VALUES_PER_THREAD],
+               int begin_bit = 0, int end_bit = 8 * sizeof(T)) {
+    helper_sort(item, keys, begin_bit, end_bit, false);
+  }
+
+  template <typename Item>
+  __dpct_inline__ void
+  sort_blocked_to_striped(const Item &item, T (&keys)[VALUES_PER_THREAD],
+                          int begin_bit = 0, int end_bit = 8 * sizeof(T)) {
+    helper_sort(item, keys, begin_bit, end_bit, true);
+  }
+
+private:
+  static constexpr int RADIX_BITS = 4;
+
+  uint8_t *_local_memory;
+};
+
+/// Loads a linear segment of items into a blocked arrangement across threads.
+/// Helper for Block Load
+enum load_algorithm {
+
+  BLOCK_LOAD_DIRECT,
+  BLOCK_LOAD_STRIPED,
+  // To-do: BLOCK_LOAD_WARP_TRANSPOSE
+
+};
+
+// Loads a linear segment of workgroup items into a blocked arrangement.
+template <size_t ITEMS_PER_WORK_ITEM, typename InputT, typename InputIteratorT,
+          typename Item>
+__dpct_inline__ void load_blocked(const Item &item, InputIteratorT block_itr,
+                                  InputT (&items)[ITEMS_PER_WORK_ITEM]) {
+
+  // This implementation does not take into account range loading across
+  // workgroup items. To-do: Decide whether range loading is required for
+  // group loading.
+  size_t linear_tid = item.get_local_linear_id();
+  uint32_t workgroup_offset = linear_tid * ITEMS_PER_WORK_ITEM;
+#pragma unroll
+  for (size_t idx = 0; idx < ITEMS_PER_WORK_ITEM; idx++) {
+    items[idx] = block_itr[workgroup_offset + idx];
+  }
+}
+
+// Loads a linear segment of workgroup items into a striped arrangement.
+template <size_t ITEMS_PER_WORK_ITEM, typename InputT, typename InputIteratorT,
+          typename Item>
+__dpct_inline__ void load_striped(const Item &item, InputIteratorT block_itr,
+                                  InputT (&items)[ITEMS_PER_WORK_ITEM]) {
+
+  // This implementation does not take into account range loading across
+  // workgroup items. To-do: Decide whether range loading is required for
+  // group loading.
+  size_t linear_tid = item.get_local_linear_id();
+  size_t group_work_items = item.get_local_range().size();
+#pragma unroll
+  for (size_t idx = 0; idx < ITEMS_PER_WORK_ITEM; idx++) {
+    items[idx] = block_itr[linear_tid + (idx * group_work_items)];
+  }
+}
+
+// Loads a linear segment of workgroup items into a subgroup striped
+// arrangement. Created as free function until exchange mechanism is
+// implemented.
+// To-do: inline this function with BLOCK_LOAD_WARP_TRANSPOSE mechanism
+template <size_t ITEMS_PER_WORK_ITEM, typename InputT, typename InputIteratorT,
+          typename Item>
+__dpct_inline__ void
+uninitialized_load_subgroup_striped(const Item &item, InputIteratorT block_itr,
+                                    InputT (&items)[ITEMS_PER_WORK_ITEM]) {
+
+  // This implementation does not take into account range loading across
+  // workgroup items. To-do: Decide whether range loading is required for
+  // group loading.
+  // This implementation uses uninitialized memory for loading linear segments
+  // into warp striped arrangement.
+  uint32_t subgroup_offset = item.get_sub_group().get_local_linear_id();
+  uint32_t subgroup_size = item.get_sub_group().get_local_linear_range();
+  uint32_t subgroup_idx = item.get_sub_group().get_group_linear_id();
+  uint32_t initial_offset =
+      (subgroup_idx * ITEMS_PER_WORK_ITEM * subgroup_size) + subgroup_offset;
+#pragma unroll
+  for (size_t idx = 0; idx < ITEMS_PER_WORK_ITEM; idx++) {
+    new (&items[idx]) InputT(block_itr[initial_offset + (idx * subgroup_size)]);
+  }
+}
+
+// Template parameters:
+// ITEMS_PER_WORK_ITEM: size_t variable controlling the number of items per
+// thread/work_item
+// ALGORITHM: load_algorithm variable controlling the type of load operation.
+// InputT: type for input sequence.
+// InputIteratorT: input iterator type
+// Item: typename parameter resembling sycl::nd_item<3>.
+template <size_t ITEMS_PER_WORK_ITEM, load_algorithm ALGORITHM, typename InputT,
+          typename InputIteratorT, typename Item>
+class workgroup_load {
+public:
+  static size_t get_local_memory_size(size_t group_work_items) { return 0; }
+  workgroup_load(uint8_t *local_memory) : _local_memory(local_memory) {}
+
+  __dpct_inline__ void load(const Item &item, InputIteratorT block_itr,
+                            InputT (&items)[ITEMS_PER_WORK_ITEM]) {
+
+    if constexpr (ALGORITHM == BLOCK_LOAD_DIRECT) {
+      load_blocked(item, block_itr, items);
+    } else if constexpr (ALGORITHM == BLOCK_LOAD_STRIPED) {
+      load_striped(item, block_itr, items);
+    }
+  }
+
+private:
+  uint8_t *_local_memory;
+};
+
+} // namespace group
+} // namespace dpct
+
+#endif // __DPCT_GROUP_UTILS_HPP__
diff --git a/clang/test/dpct/check_header_files.cpp b/clang/test/dpct/check_header_files.cpp
index b2574d75228e..f0f154dbcfd0 100644
--- a/clang/test/dpct/check_header_files.cpp
+++ b/clang/test/dpct/check_header_files.cpp
@@ -81,6 +81,11 @@
 // RUN: echo "end" >> %T/check_header_files/diff_res.txt
 // RUN: FileCheck %s --match-full-lines --input-file %T/check_header_files/diff_res.txt
 
+// RUN: echo "begin" > %T/check_header_files/diff_res.txt
+// RUN: diff %T/check_header_files/out/include/dpct/group_utils.hpp %S/../../runtime/dpct-rt/include/dpct/group_utils.hpp >> %T/check_header_files/diff_res.txt
+// RUN: echo "end" >> %T/check_header_files/diff_res.txt
+// RUN: FileCheck %s --match-full-lines --input-file %T/check_header_files/diff_res.txt
+
 // RUN: echo "begin" > %T/check_header_files/diff_res.txt
 // RUN: diff %T/check_header_files/out/include/dpct/dpl_extras/algorithm.h %S/../../runtime/dpct-rt/include/dpct/dpl_extras/algorithm.h >> %T/check_header_files/diff_res.txt
 // RUN: echo "end" >> %T/check_header_files/diff_res.txt
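Finally, a small sketch of the load helpers that move with the file. It
assumes a kernel context with an nd_item and a USM pointer `in`; the function
name `load_example` is illustrative, not part of the patch. BLOCK_LOAD_STRIPED
reads coalesced (consecutive work-items touch consecutive addresses), which is
usually the better global-memory pattern; BLOCK_LOAD_DIRECT instead keeps each
work-item's ITEMS contiguous in memory:

#include <sycl/sycl.hpp>
#include <dpct/group_utils.hpp>

template <typename Item>
__dpct_inline__ void load_example(const Item &item, const float *in) {
  constexpr size_t ITEMS = 4;
  float items[ITEMS];
  // workgroup_load needs no local memory for either algorithm today
  // (get_local_memory_size returns 0), so a null scratch pointer suffices.
  dpct::group::workgroup_load<ITEMS, dpct::group::BLOCK_LOAD_STRIPED, float,
                              const float *, Item>(nullptr)
      .load(item, in, items);
  // items[] now holds in[tid], in[tid + N], in[tid + 2N], in[tid + 3N],
  // where N is the work-group size and tid the local linear id.
}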