Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[SYCLomatic] Block Load headers core #1640

Merged
Merged
Changes from 61 commits
Commits
Show all changes
76 commits
Select commit Hold shift + click to select a range
bde928e
load_helpers
abhilash1910 Jan 18, 2024
8717079
add load apis
abhilash1910 Jan 29, 2024
1549642
clang-format
abhilash1910 Jan 29, 2024
41d994e
reviews 1
abhilash1910 Jan 30, 2024
99df7d9
fix lit
abhilash1910 Jan 30, 2024
789bd18
update warp striped logic
abhilash1910 Jan 30, 2024
7872841
rename warp to subgroup
abhilash1910 Jan 30, 2024
ec7a718
clang-format
abhilash1910 Jan 30, 2024
fe4c38e
review commits 2
abhilash1910 Jan 31, 2024
a4e2316
fix lit
abhilash1910 Jan 31, 2024
1a0b447
Merge branch 'oneapi-src:SYCLomatic' into block_load_store_headers
abhilash1910 Jan 31, 2024
c5e4fad
Merge branch 'oneapi-src:SYCLomatic' into block_load_store_headers
abhilash1910 Feb 1, 2024
95edd0e
clang-format
abhilash1910 Feb 1, 2024
1419253
update review 1
abhilash1910 Feb 2, 2024
6f99026
review commit 2
abhilash1910 Feb 2, 2024
eb5539a
fix dtype
abhilash1910 Feb 2, 2024
8ce2b68
Merge branch 'oneapi-src:SYCLomatic' into block_load_store_headers
abhilash1910 Feb 5, 2024
871c6c2
rm if stmt
abhilash1910 Feb 5, 2024
e8fc26e
change design , use class
abhilash1910 Feb 5, 2024
0f7b5e4
bug fix
abhilash1910 Feb 5, 2024
93db62a
remove loop assignment
abhilash1910 Feb 5, 2024
2d78e9a
use pseudocode
abhilash1910 Feb 6, 2024
1276698
review commit 1
abhilash1910 Feb 6, 2024
8d43351
add load method
abhilash1910 Feb 6, 2024
3d22cd7
refactor logic
abhilash1910 Feb 9, 2024
0b32a44
fix bug
abhilash1910 Feb 9, 2024
e24ebb6
refactor post review
abhilash1910 Feb 14, 2024
293bf14
compile time branch
abhilash1910 Feb 14, 2024
721e722
update comments
abhilash1910 Feb 14, 2024
a164256
fix bugs
abhilash1910 Feb 14, 2024
0dc3fa0
clang-format
abhilash1910 Feb 14, 2024
59b881e
review commits
abhilash1910 Feb 26, 2024
b6f123c
fix format
abhilash1910 Feb 26, 2024
c0d96f5
Merge branch 'oneapi-src:SYCLomatic' into block_load_store_headers
abhilash1910 Feb 26, 2024
5436755
review commit
abhilash1910 Feb 29, 2024
118bcc1
fix bugs
abhilash1910 Feb 29, 2024
7060821
review commits
abhilash1910 Mar 1, 2024
aa6268a
Merge branch 'oneapi-src:SYCLomatic' into block_load_store_headers
abhilash1910 Mar 4, 2024
48677b9
format
abhilash1910 Mar 4, 2024
2e66d3c
Merge branch 'oneapi-src:SYCLomatic' into block_load_store_headers
abhilash1910 Mar 5, 2024
70d5d27
Merge branch 'oneapi-src:SYCLomatic' into block_load_store_headers
abhilash1910 Mar 6, 2024
7e50327
remove redundant template arg
abhilash1910 Mar 11, 2024
73aab25
use auto cast
abhilash1910 Mar 11, 2024
f97e665
format
abhilash1910 Mar 11, 2024
b90f7d9
rename function
abhilash1910 Mar 11, 2024
fc0ce87
format
abhilash1910 Mar 11, 2024
c788856
Merge branch 'oneapi-src:SYCLomatic' into block_load_store_headers
abhilash1910 Mar 11, 2024
d4ce0b1
use uint32_t in place of auto
abhilash1910 Mar 18, 2024
29d4405
use item to get linear id
abhilash1910 Mar 18, 2024
a6a85ff
fix bug
abhilash1910 Mar 20, 2024
6ffd681
add tempstorage for load
abhilash1910 Mar 21, 2024
ee45991
fix bug
abhilash1910 Mar 21, 2024
7c8111d
simplify logic
abhilash1910 Mar 21, 2024
a406b15
fix id selection methods
abhilash1910 Mar 21, 2024
c4e125c
Merge branch 'oneapi-src:SYCLomatic' into block_load_store_headers
abhilash1910 Mar 22, 2024
c3bc942
rm local_memory unused
abhilash1910 Mar 25, 2024
69dbddc
clang format
abhilash1910 Mar 25, 2024
49f5d85
update based on discussion
abhilash1910 Mar 27, 2024
b1d8d70
format
abhilash1910 Mar 27, 2024
7cefbf8
update variable case
abhilash1910 Mar 28, 2024
fdc2f2f
use size_t
abhilash1910 Apr 10, 2024
95db67f
fix issues related to tests 619
abhilash1910 Apr 30, 2024
141ace7
remove ALGORITHM parameter
abhilash1910 Apr 30, 2024
540db29
format fix
abhilash1910 May 1, 2024
ebf6237
use local_range
abhilash1910 May 2, 2024
7f9d4e6
add comments
abhilash1910 May 6, 2024
cb87b67
format
abhilash1910 May 6, 2024
71d4047
format
abhilash1910 May 6, 2024
bd24713
Update clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h
abhilash1910 May 7, 2024
26a3ae2
Update clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h
abhilash1910 May 7, 2024
6222566
Update clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h
abhilash1910 May 7, 2024
236c7ba
Merge branch 'oneapi-src:SYCLomatic' into block_load_store_headers
abhilash1910 May 7, 2024
89cf7d3
format
abhilash1910 May 7, 2024
c4f0ca0
fix issue in referencing
abhilash1910 May 8, 2024
ec4f8ae
Merge branch 'oneapi-src:SYCLomatic' into block_load_store_headers
abhilash1910 May 8, 2024
d382f60
Update dpcpp_extensions.h
zhimingwang36 May 9, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -566,6 +566,96 @@ class radix_sort {
uint8_t *_local_memory;
};

/// Selects how a linear segment of items is distributed across the work-items
/// of a work-group when it is loaded. Helper for block load.
enum load_algorithm {
  /// Makes workgroup_load::load dispatch to load_blocked.
  BLOCK_LOAD_DIRECT,
  /// Makes workgroup_load::load dispatch to load_striped.
  BLOCK_LOAD_STRIPED,
  // To-do: BLOCK_LOAD_WARP_TRANSPOSE
};

/// Loads a linear segment of work-group items into a blocked arrangement.
///
/// The work-item whose local linear id is `t` copies the ITEMS_PER_WORK_ITEM
/// consecutive elements starting at `block_itr[t * ITEMS_PER_WORK_ITEM]` into
/// `items`.
///
/// This implementation does not take into account range (guarded) loading
/// across work-group items, so every accessed index must be valid.
/// To-do: decide whether range loading is required for group loading.
///
/// \tparam ITEMS_PER_WORK_ITEM Number of elements each work-item loads.
/// \tparam ALGORITHM Load algorithm tag; not used by this function.
/// \param item Work-item handle providing get_local_linear_id().
/// \param block_itr Random-access iterator to the start of the segment.
/// \param items Per-work-item destination array.
template <size_t ITEMS_PER_WORK_ITEM, load_algorithm ALGORITHM, typename InputT,
          typename InputIteratorT, typename Item>
__dpct_inline__ void load_blocked(const Item &item, InputIteratorT block_itr,
                                  InputT (&items)[ITEMS_PER_WORK_ITEM]) {
  const size_t linear_tid = item.get_local_linear_id();
  // Keep the offset in size_t: narrowing it to 32 bits would silently
  // truncate the product for large segments.
  const size_t workgroup_offset = linear_tid * ITEMS_PER_WORK_ITEM;
#pragma unroll
  for (size_t idx = 0; idx < ITEMS_PER_WORK_ITEM; idx++) {
    items[idx] = block_itr[workgroup_offset + idx];
  }
}

/// Loads a linear segment of work-group items into a striped arrangement.
///
/// The work-item whose local linear id is `t` copies the elements
/// `block_itr[t + idx * G]` for `idx` in `[0, ITEMS_PER_WORK_ITEM)`, where `G`
/// is the number of work-items in the work-group.
///
/// This implementation does not take into account range (guarded) loading
/// across work-group items, so every accessed index must be valid.
/// To-do: decide whether range loading is required for group loading.
///
/// \tparam ITEMS_PER_WORK_ITEM Number of elements each work-item loads.
/// \tparam ALGORITHM Load algorithm tag; not used by this function.
/// \param item Work-item handle providing get_local_linear_id() and
///        get_local_range() (presumably a sycl::nd_item -- confirm at callers).
/// \param block_itr Random-access iterator to the start of the segment.
/// \param items Per-work-item destination array.
template <size_t ITEMS_PER_WORK_ITEM, load_algorithm ALGORITHM, typename InputT,
          typename InputIteratorT, typename Item>
__dpct_inline__ void load_striped(const Item &item, InputIteratorT block_itr,
                                  InputT (&items)[ITEMS_PER_WORK_ITEM]) {
  const size_t linear_tid = item.get_local_linear_id();
  // The stride between consecutive items of one work-item is the work-group
  // size, not the global range: the segment being loaded belongs to a single
  // work-group.
  const size_t group_work_items = item.get_local_range().size();
#pragma unroll
  for (size_t idx = 0; idx < ITEMS_PER_WORK_ITEM; idx++) {
    items[idx] = block_itr[linear_tid + (idx * group_work_items)];
  }
}

/// Loads a linear segment of work-group items into a sub-group striped
/// arrangement, constructing each destination element in place.
///
/// Kept as a free function until the exchange mechanism is implemented.
/// To-do: inline this function with the BLOCK_LOAD_WARP_TRANSPOSE mechanism.
///
/// Each work-item starts reading at
/// `sub_group_index * ITEMS_PER_WORK_ITEM * sub_group_size + lane` and then
/// advances by `sub_group_size` for every further item.
///
/// This implementation does not take into account range (guarded) loading
/// across work-group items. The destination is treated as uninitialized
/// memory: elements are created with placement new rather than assigned.
/// To-do: decide whether range loading is required for group loading.
///
/// \tparam ITEMS_PER_WORK_ITEM Number of elements each work-item loads.
/// \param item Work-item handle providing get_sub_group().
/// \param block_itr Random-access iterator to the start of the segment.
/// \param items Per-work-item destination array.
template <size_t ITEMS_PER_WORK_ITEM, typename InputT, typename InputIteratorT,
          typename Item>
__dpct_inline__ void
uninitialized_load_subgroup_striped(const Item &item, InputIteratorT block_itr,
                                    InputT (&items)[ITEMS_PER_WORK_ITEM]) {
  auto sg = item.get_sub_group();
  const uint32_t lane = sg.get_local_linear_id();
  const uint32_t stride = sg.get_local_linear_range();
  const uint32_t sg_index = sg.get_group_linear_id();
  // Index of the first element this work-item reads.
  const uint32_t first = (sg_index * ITEMS_PER_WORK_ITEM * stride) + lane;

  size_t src = first;
#pragma unroll
  for (size_t slot = 0; slot != ITEMS_PER_WORK_ITEM; ++slot) {
    new (&items[slot]) InputT(block_itr[src]);
    src += stride;
  }
}

/// Work-group load front end: dispatches at compile time to the load helper
/// selected by ALGORITHM.
///
/// \tparam ITEMS_PER_WORK_ITEM Number of elements each work-item loads.
/// \tparam ALGORITHM BLOCK_LOAD_DIRECT selects load_blocked,
///         BLOCK_LOAD_STRIPED selects load_striped.
/// \tparam InputT Element type of the per-work-item destination array.
/// \tparam InputIteratorT Iterator type of the source segment.
/// \tparam Item Work-item handle type passed through to the helpers.
/// \tparam T Not used by this class.
template <size_t ITEMS_PER_WORK_ITEM, load_algorithm ALGORITHM, typename InputT,
          typename InputIteratorT, typename Item, typename T>
class workgroup_load {
public:
  /// Returns the number of bytes of local memory the load needs; the
  /// currently supported algorithms need none, so this is always 0.
  static size_t get_local_memory_size(size_t group_work_items) { return 0; }

  /// Stores the pointer to the local memory scratch area. The pointer is only
  /// kept; load() does not read or write through it.
  workgroup_load(uint8_t *local_memory) : _local_memory(local_memory) {}

  /// Loads ITEMS_PER_WORK_ITEM elements for the calling work-item from
  /// `block_itr` into `items`, using the arrangement selected by ALGORITHM.
  __dpct_inline__ void load(const Item &item, InputIteratorT block_itr,
                            InputT (&items)[ITEMS_PER_WORK_ITEM]) {
    // ALGORITHM cannot be deduced from the call arguments, so the leading
    // template arguments are spelled out. `items` is forwarded as the array
    // itself; `(&items)[N]` would designate memory past the end of it.
    if constexpr (ALGORITHM == BLOCK_LOAD_DIRECT) {
      load_blocked<ITEMS_PER_WORK_ITEM, ALGORITHM>(item, block_itr, items);
    } else if constexpr (ALGORITHM == BLOCK_LOAD_STRIPED) {
      load_striped<ITEMS_PER_WORK_ITEM, ALGORITHM>(item, block_itr, items);
    }
  }

private:
  uint8_t *_local_memory;
};

/// Perform a reduction of the data elements assigned to all threads in the
/// group.
///
Expand Down
Loading