Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[SYCLomatic] Block Store headers core #1819

Closed
wants to merge 27 commits into from
Closed
Changes from 8 commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
13d8b67
block store
abhilash1910 Mar 25, 2024
7517519
fix bug
abhilash1910 Mar 25, 2024
6b7fd09
update code
abhilash1910 May 10, 2024
454c453
fix template param
abhilash1910 May 10, 2024
9e75c62
Merge branch 'SYCLomatic' into block_store
abhilash1910 May 10, 2024
ffbd181
fix error
abhilash1910 May 14, 2024
a0007e1
Merge branch 'SYCLomatic' into block_store
abhilash1910 May 30, 2024
49147b8
add in group_utils
abhilash1910 May 30, 2024
18f826a
use class
abhilash1910 May 30, 2024
7149372
review commit
abhilash1910 May 30, 2024
431d4a4
format
abhilash1910 May 30, 2024
8cc73f1
review commit
abhilash1910 Jun 6, 2024
a677eb2
Merge branch 'oneapi-src:SYCLomatic' into block_store
abhilash1910 Jul 4, 2024
98d0193
clang-format
abhilash1910 Jul 4, 2024
79295f8
Merge branch 'oneapi-src:SYCLomatic' into block_store
abhilash1910 Jul 10, 2024
c4fe035
reorder template args for better visibility in parsing
abhilash1910 Jul 11, 2024
76ec684
revert template alignment
abhilash1910 Aug 12, 2024
41b1c8a
fix temps pointer
abhilash1910 Aug 12, 2024
b046dcc
rectify comment
abhilash1910 Aug 21, 2024
f86801d
Merge branch 'SYCLomatic' into block_store
abhilash1910 Aug 22, 2024
273d098
Update clang/runtime/dpct-rt/include/dpct/group_utils.hpp
abhilash1910 Aug 22, 2024
3185ceb
Update group_utils.hpp
abhilash1910 Aug 22, 2024
cc00403
fix review comments
abhilash1910 Aug 22, 2024
56c07e1
Merge branch 'SYCLomatic' into block_store
abhilash1910 Aug 26, 2024
28ff868
fix
abhilash1910 Aug 26, 2024
e87c0a6
Update clang/runtime/dpct-rt/include/dpct/group_utils.hpp
abhilash1910 Aug 26, 2024
1802fbe
update correct variables
abhilash1910 Aug 26, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
102 changes: 102 additions & 0 deletions clang/runtime/dpct-rt/include/dpct/group_utils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -527,6 +527,108 @@ class workgroup_load {
private:
uint8_t *_local_memory;
};

/// Store blocked/warped or striped work items into linear segment of items.
abhilash1910 marked this conversation as resolved.
Show resolved Hide resolved
/// Helper for Block Store
enum store_algorithm {
yihanwg marked this conversation as resolved.
Show resolved Hide resolved

BLOCK_STORE_DIRECT,
BLOCK_STORE_STRIPED,
// To-do: BLOCK_STORE_WARP_TRANSPOSE
yihanwg marked this conversation as resolved.
Show resolved Hide resolved
// To-do: BLOCK_STORE_VECTORIZE

};

/// Stores a blocked arrangement of work items linear segment of items.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd like to have a more detail comments like

/// Computes a CSR format sparse matrix-dense matrix product.
/// C = alpha * op(A) * B + beta * C
/// \param [in] queue The queue where the routine should be executed. It must
/// have the in_order property when using the USM mode.
/// \param [in] trans The operation applied to the matrix A.
/// \param [in] sparse_rows Number of rows of the matrix A.
/// \param [in] dense_cols Number of columns of the matrix op(B) or C.
/// \param [in] sparse_cols Number of columns of the matrix A.
/// \param [in] alpha Scaling factor for the matrix A.
/// \param [in] info Matrix info of the matrix A.
/// \param [in] val An array containing the non-zero elements of the matrix A.
/// \param [in] row_ptr An array of length \p num_rows + 1.
/// \param [in] col_ind An array containing the column indices in index-based
/// numbering.
/// \param [in] b Data of the matrix B.
/// \param [in] ldb Leading dimension of the matrix B.
/// \param [in] beta Scaling factor for the matrix B.
/// \param [in, out] c Data of the matrix C.
/// \param [in] ldc Leading dimension of the matrix C.
template <typename T>
void csrmm(sycl::queue &queue, oneapi::mkl::transpose trans, int sparse_rows,
int dense_cols, int sparse_cols, const T *alpha,
const std::shared_ptr<matrix_info> info, const T *val,
const int *row_ptr, const int *col_ind, const T *b, int ldb,
const T *beta, T *c, int ldc) {
csrmm<T>(queue, trans, oneapi::mkl::transpose::nontrans, sparse_rows,
dense_cols, sparse_cols, alpha, info, val, row_ptr, col_ind, b, ldb,
beta, c, ldc);
}

template <size_t ITEMS_PER_WORK_ITEM, typename InputT,
typename OutputIteratorT, typename Item>
__dpct_inline__ void store_blocked(const Item &item, OutputIteratorT block_itr,
InputT (&items)[ITEMS_PER_WORK_ITEM]) {

// This implementation does not take in account range storage across
// workgroup items To-do: Decide whether range storage is required for group
// storage
size_t linear_tid = item.get_local_linear_id();
OutputIteratorT workitem_itr = block_itr + (linear_tid * ITEMS_PER_WORK_ITEM);
#pragma unroll
for (uint32_t idx = 0; idx < ITEMS_PER_WORK_ITEM; idx++) {
workitem_itr[idx] = items[idx];
}
}

/// Stores a striped arrangement of work items linear segment of items.
template <size_t ITEMS_PER_WORK_ITEM, typename InputT,
typename OutputIteratorT, typename Item>
__dpct_inline__ void store_striped(const Item &item, OutputIteratorT block_itr,
InputT (&items)[ITEMS_PER_WORK_ITEM]) {

// This implementation does not take in account range storage across
// workgroup items To-do: Decide whether range storage is required for group
// storage
size_t linear_tid = item.get_local_linear_id();
OutputIteratorT workitem_itr = block_itr + linear_tid;
size_t GROUP_WORK_ITEMS = item.get_global_range().size();
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

get_global_range returns the global dimensions of the kernel, so when launching more than a single work-group this will be incorrect.

We can switch this to what we did in group load: size_t group_work_items = item.get_local_range().size();

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd recommend making sure we have testing coverage for such a case as well.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes local range should be used, all the tests are currently in 1 wg . Will extend tests for other wg sizes in a separate PR. thanks.

#pragma unroll
for (uint32_t idx = 0; idx < ITEMS_PER_WORK_ITEM; idx++) {
workitem_itr[(idx * GROUP_WORK_ITEMS)] = items[idx];
}
}

/// Stores a warp-striped arrangement of work items linear segment of items.
abhilash1910 marked this conversation as resolved.
Show resolved Hide resolved
// Created as free function until exchange mechanism is
// implemented.
// To-do: inline this function with BLOCK_STORE_WARP_TRANSPOSE mechanism
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not sure what this comment means exactly. But also, lets use our own terminology here.

template <size_t ITEMS_PER_WORK_ITEM, typename InputT, typename OutputIteratorT,
typename Item>
__dpct_inline__ void
store_subgroup_striped(const Item &item, OutputIteratorT block_itr,
InputT (&items)[ITEMS_PER_WORK_ITEM]) {

// This implementation does not take in account range loading across
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you describe what you mean by "range loading" means in this context?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Range loading/storing refers to loading/storing within bounded intervals across the warps. Not in the full scope .

// workgroup items To-do: Decide whether range loading is required for group
// loading
// This implementation uses unintialized memory for loading linear segments
abhilash1910 marked this conversation as resolved.
Show resolved Hide resolved
abhilash1910 marked this conversation as resolved.
Show resolved Hide resolved
// into warp striped arrangement.
uint32_t subgroup_offset = item.get_sub_group().get_local_linear_id();
uint32_t subgroup_size = item.get_sub_group().get_local_linear_range();
uint32_t subgroup_idx = item.get_sub_group().get_group_linear_id();
abhilash1910 marked this conversation as resolved.
Show resolved Hide resolved
uint32_t initial_offset =
(subgroup_idx * ITEMS_PER_WORK_ITEM * subgroup_size) + subgroup_offset;
OutputIteratorT workitem_itr = block_itr + initial_offset;
#pragma unroll
for (uint32_t idx = 0; idx < ITEMS_PER_WORK_ITEM; idx++) {
workitem_itr[(idx * subgroup_size)] = items[idx];
}
}

// template parameters :
// ITEMS_PER_WORK_ITEM: size_t variable controlling the number of items per
// thread/work_item
// ALGORITHM: store_algorithm variable controlling the type of store operation.
// InputT: type for input sequence.
// OutputIteratorT: output iterator type
// Item : typename parameter resembling sycl::nd_item<3> .
template <size_t ITEMS_PER_WORK_ITEM, store_algorithm ALGORITHM, typename InputT,
typename OutputIteratorT, typename Item>
class workgroup_store {
public:
static size_t get_local_memory_size(size_t group_work_items) { return 0; }
workgroup_store(uint8_t *local_memory) : _local_memory(local_memory) {}

__dpct_inline__ void store(const Item &item, OutputIteratorT block_itr,
InputT (&items)[ITEMS_PER_WORK_ITEM]) {

if constexpr (ALGORITHM == BLOCK_STORE_DIRECT) {
store_blocked<ITEMS_PER_WORK_ITEM>(item, block_itr, (&items)[ITEMS_PER_WORK_ITEM]);
} else if constexpr (ALGORITHM == BLOCK_STORE_STRIPED) {
store_striped<ITEMS_PER_WORK_ITEM>(item, block_itr, (&items)[ITEMS_PER_WORK_ITEM]);
}
}

private:
uint8_t *_local_memory;
danhoeflinger marked this conversation as resolved.
Show resolved Hide resolved
};

} // namespace group
} // namespace dpct

Expand Down
Loading