Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[SYCLomatic] Block Store headers core #1819

Closed
wants to merge 27 commits into from
Closed
Changes from all commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
13d8b67
block store
abhilash1910 Mar 25, 2024
7517519
fix bug
abhilash1910 Mar 25, 2024
6b7fd09
update code
abhilash1910 May 10, 2024
454c453
fix template param
abhilash1910 May 10, 2024
9e75c62
Merge branch 'SYCLomatic' into block_store
abhilash1910 May 10, 2024
ffbd181
fix error
abhilash1910 May 14, 2024
a0007e1
Merge branch 'SYCLomatic' into block_store
abhilash1910 May 30, 2024
49147b8
add in group_utils
abhilash1910 May 30, 2024
18f826a
use class
abhilash1910 May 30, 2024
7149372
review commit
abhilash1910 May 30, 2024
431d4a4
format
abhilash1910 May 30, 2024
8cc73f1
review commit
abhilash1910 Jun 6, 2024
a677eb2
Merge branch 'oneapi-src:SYCLomatic' into block_store
abhilash1910 Jul 4, 2024
98d0193
clang-format
abhilash1910 Jul 4, 2024
79295f8
Merge branch 'oneapi-src:SYCLomatic' into block_store
abhilash1910 Jul 10, 2024
c4fe035
reorder template args for better visibility in parsing
abhilash1910 Jul 11, 2024
76ec684
revert template alignment
abhilash1910 Aug 12, 2024
41b1c8a
fix temps pointer
abhilash1910 Aug 12, 2024
b046dcc
rectify comment
abhilash1910 Aug 21, 2024
f86801d
Merge branch 'SYCLomatic' into block_store
abhilash1910 Aug 22, 2024
273d098
Update clang/runtime/dpct-rt/include/dpct/group_utils.hpp
abhilash1910 Aug 22, 2024
3185ceb
Update group_utils.hpp
abhilash1910 Aug 22, 2024
cc00403
fix review comments
abhilash1910 Aug 22, 2024
56c07e1
Merge branch 'SYCLomatic' into block_store
abhilash1910 Aug 26, 2024
28ff868
fix
abhilash1910 Aug 26, 2024
e87c0a6
Update clang/runtime/dpct-rt/include/dpct/group_utils.hpp
abhilash1910 Aug 26, 2024
1802fbe
update correct variables
abhilash1910 Aug 26, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 32 additions & 4 deletions clang/runtime/dpct-rt/include/dpct/group_utils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -708,11 +708,11 @@ class [[deprecated("Please use group_radix_sort instead")]] radix_sort {

/// Load linear segment items into block format across threads
/// Helper for Block Load
enum load_algorithm {
enum class load_algorithm {

BLOCK_LOAD_DIRECT,
BLOCK_LOAD_STRIPED,
};
abhilash1910 marked this conversation as resolved.
Show resolved Hide resolved

// loads a linear segment of workgroup items into a blocked arrangement.
template <size_t ITEMS_PER_WORK_ITEM, typename InputT, typename InputIteratorT,
typename Item>
Expand Down Expand Up @@ -869,6 +869,34 @@ uninitialized_load_subgroup_striped(const Item &item, InputIteratorT block_itr,
new (&items[idx]) InputT(block_itr[initial_offset + (idx * subgroup_size)]);
}
}

/// Stores each work-item's private array to memory in a sub-group striped
/// arrangement.
///
/// Every sub-group writes into its own window of
/// (ElementsPerWorkItem * sub-group size) consecutive positions, beginning at
/// block_itr + (sub-group index * ElementsPerWorkItem * sub-group size).
/// Inside that window, element k of a work-item is written at position
/// (k * sub-group size) + the work-item's linear id within the sub-group.
///
/// Kept as a free function until an exchange mechanism is implemented.
/// To-do: inline this function with the BLOCK_STORE_WARP_TRANSPOSE mechanism.
///
/// Note: no range guard is applied; all ElementsPerWorkItem elements of every
/// work-item are always written. To-do: decide whether a range-limited
/// variant is required for group stores.
template <typename T, size_t ElementsPerWorkItem, typename OutputIteratorT,
          typename ItemT>
__dpct_inline__ void
store_subgroup_striped(const ItemT &item, OutputIteratorT block_itr,
                       T (&data)[ElementsPerWorkItem]) {
  auto sg = item.get_sub_group();
  const uint32_t lane = sg.get_local_linear_id();
  const uint32_t lanes_per_sg = sg.get_local_linear_range();
  const uint32_t sg_index = sg.get_group_linear_id();

  // First output position owned by this work-item: start of the sub-group's
  // window plus this work-item's position within the sub-group.
  const uint32_t first_slot =
      (sg_index * ElementsPerWorkItem * lanes_per_sg) + lane;
  OutputIteratorT out = block_itr + first_slot;

  // Consecutive elements of one work-item are one sub-group size apart, so
  // the output index advances by lanes_per_sg on every iteration.
  uint32_t stride_offset = 0;
#pragma unroll
  for (uint32_t idx = 0; idx < ElementsPerWorkItem; ++idx) {
    out[stride_offset] = data[idx];
    stride_offset += lanes_per_sg;
  }
}

// template parameters :
// ITEMS_PER_WORK_ITEM: size_t variable controlling the number of items per
// thread/work_item
Expand All @@ -887,9 +915,9 @@ class [[deprecated(
__dpct_inline__ void load(const Item &item, InputIteratorT block_itr,
InputT (&items)[ITEMS_PER_WORK_ITEM]) {

if constexpr (ALGORITHM == BLOCK_LOAD_DIRECT) {
if constexpr (ALGORITHM == load_algorithm::BLOCK_LOAD_DIRECT) {
load_blocked<ITEMS_PER_WORK_ITEM>(item, block_itr, items);
} else if constexpr (ALGORITHM == BLOCK_LOAD_STRIPED) {
} else if constexpr (ALGORITHM == load_algorithm::BLOCK_LOAD_STRIPED) {
load_striped<ITEMS_PER_WORK_ITEM>(item, block_itr, items);
}
}
Expand Down