Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[SYCL] Fixed-size groups and partitions are renamed to "chunks" #16151

Draft
wants to merge 9 commits into
base: sycl
Choose a base branch
from
14 changes: 7 additions & 7 deletions llvm/include/llvm/SYCLLowerIR/DeviceConfigFile.td
Original file line number Diff line number Diff line change
Expand Up @@ -71,9 +71,9 @@ def AspectExt_oneapi_bindless_images_sample_1d_usm : Aspect<"ext_oneapi_bindless
def AspectExt_oneapi_bindless_images_sample_2d_usm : Aspect<"ext_oneapi_bindless_images_sample_2d_usm">;
def AspectExt_intel_esimd : Aspect<"ext_intel_esimd">;
def AspectExt_oneapi_ballot_group : Aspect<"ext_oneapi_ballot_group">;
def AspectExt_oneapi_fixed_size_group : Aspect<"ext_oneapi_fixed_size_group">;
def AspectExt_oneapi_chunk : Aspect<"ext_oneapi_chunk">;
def AspectExt_oneapi_opportunistic_group : Aspect<"ext_oneapi_opportunistic_group">;
def AspectExt_oneapi_tangle_group : Aspect<"ext_oneapi_tangle_group">;
def AspectExt_oneapi_tangle : Aspect<"ext_oneapi_tangle">;
def AspectExt_intel_matrix : Aspect<"ext_intel_matrix">;
def AspectExt_oneapi_is_composite : Aspect<"ext_oneapi_is_composite">;
def AspectExt_oneapi_is_component : Aspect<"ext_oneapi_is_component">;
Expand Down Expand Up @@ -144,8 +144,8 @@ def : TargetInfo<"__TestAspectList",
AspectExt_oneapi_bindless_sampled_image_fetch_2d_usm, AspectExt_oneapi_bindless_sampled_image_fetch_2d,
AspectExt_oneapi_bindless_sampled_image_fetch_3d,
AspectExt_intel_esimd,
AspectExt_oneapi_ballot_group, AspectExt_oneapi_fixed_size_group, AspectExt_oneapi_opportunistic_group,
AspectExt_oneapi_tangle_group, AspectExt_intel_matrix, AspectExt_oneapi_is_composite, AspectExt_oneapi_is_component,
AspectExt_oneapi_ballot_group, AspectExt_oneapi_chunk, AspectExt_oneapi_opportunistic_group,
AspectExt_oneapi_tangle, AspectExt_intel_matrix, AspectExt_oneapi_is_composite, AspectExt_oneapi_is_component,
AspectExt_oneapi_graph, AspectExt_oneapi_limited_graph, AspectExt_oneapi_private_alloca,
AspectExt_oneapi_queue_profiling_tag, AspectExt_oneapi_virtual_mem, AspectExt_oneapi_cuda_cluster_group,
AspectExt_intel_fpga_task_sequence,
Expand All @@ -163,8 +163,8 @@ defvar IntelCpuAspects = [
AspectCpu, AspectFp16, AspectFp64, AspectQueue_profiling, AspectAtomic64,
AspectExt_oneapi_srgb, AspectExt_oneapi_native_assert,
AspectExt_intel_legacy_image, AspectExt_oneapi_ballot_group,
AspectExt_oneapi_fixed_size_group, AspectExt_oneapi_opportunistic_group,
AspectExt_oneapi_tangle_group, AspectExt_oneapi_private_alloca
AspectExt_oneapi_chunk, AspectExt_oneapi_opportunistic_group,
AspectExt_oneapi_tangle, AspectExt_oneapi_private_alloca
] # AllUSMAspects;

def : TargetInfo<"spir64", [], [], "", "", 1>;
Expand Down Expand Up @@ -231,7 +231,7 @@ class CudaTargetInfo<string targetName, list<Aspect> aspectList, int subGroupSiz
defvar CudaMinAspects = !listconcat(AllUSMAspects, [AspectGpu, AspectFp64, AspectOnline_compiler, AspectOnline_linker,
AspectQueue_profiling, AspectExt_intel_pci_address, AspectExt_intel_max_mem_bandwidth, AspectExt_intel_memory_bus_width,
AspectExt_intel_device_info_uuid, AspectExt_oneapi_native_assert, AspectExt_intel_free_memory, AspectExt_intel_device_id,
AspectExt_intel_memory_clock_rate, AspectExt_oneapi_ballot_group, AspectExt_oneapi_fixed_size_group,
AspectExt_intel_memory_clock_rate, AspectExt_oneapi_ballot_group, AspectExt_oneapi_chunk,
AspectExt_oneapi_opportunistic_group, AspectExt_oneapi_graph, AspectExt_oneapi_limited_graph]);
// Bindless images aspects are partially supported on CUDA and disabled by default at the moment.
defvar CudaBindlessImagesAspects = [AspectExt_oneapi_bindless_images, AspectExt_oneapi_bindless_images_shared_usm,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -318,7 +318,7 @@ branches to safely communicate between all work-items executing the same
control flow.

NOTE: This differs from the `fragment` returned by `get_opportunistic_group()`
because a `tangle_group` requires the implementation to track group membership.
because a `tangle` requires the implementation to track group membership.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Feel free to submit this small change as a separate PR, we should be able to merge it almost immediately

Which group type to use will depend on a combination of
implementation/backend/device and programmer preference.

Expand Down
79 changes: 37 additions & 42 deletions sycl/include/sycl/detail/spirv.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,9 +26,9 @@ namespace oneapi {
struct sub_group;
namespace experimental {
template <typename ParentGroup> class ballot_group;
template <size_t PartitionSize, typename ParentGroup> class fixed_size_group;
template <size_t ChunkSize, typename ParentGroup> class chunk;
template <int Dimensions> class root_group;
template <typename ParentGroup> class tangle_group;
template <typename ParentGroup> class tangle;
class opportunistic_group;
} // namespace experimental
} // namespace oneapi
Expand Down Expand Up @@ -62,8 +62,7 @@ struct is_tangle_or_opportunistic_group : std::false_type {};

template <typename ParentGroup>
struct is_tangle_or_opportunistic_group<
sycl::ext::oneapi::experimental::tangle_group<ParentGroup>>
: std::true_type {};
sycl::ext::oneapi::experimental::tangle<ParentGroup>> : std::true_type {};

template <>
struct is_tangle_or_opportunistic_group<
Expand All @@ -76,11 +75,11 @@ struct is_ballot_group<
sycl::ext::oneapi::experimental::ballot_group<ParentGroup>>
: std::true_type {};

template <typename Group> struct is_fixed_size_group : std::false_type {};
template <typename Group> struct is_chunk : std::false_type {};

template <size_t PartitionSize, typename ParentGroup>
struct is_fixed_size_group<sycl::ext::oneapi::experimental::fixed_size_group<
PartitionSize, ParentGroup>> : std::true_type {};
template <size_t ChunkSize, typename ParentGroup>
struct is_chunk<sycl::ext::oneapi::experimental::chunk<ChunkSize, ParentGroup>>
: std::true_type {};

template <typename Group> struct group_scope {};

Expand All @@ -105,14 +104,14 @@ struct group_scope<sycl::ext::oneapi::experimental::ballot_group<ParentGroup>> {
static constexpr __spv::Scope::Flag value = group_scope<ParentGroup>::value;
};

template <size_t PartitionSize, typename ParentGroup>
struct group_scope<sycl::ext::oneapi::experimental::fixed_size_group<
PartitionSize, ParentGroup>> {
template <size_t ChunkSize, typename ParentGroup>
struct group_scope<
sycl::ext::oneapi::experimental::chunk<ChunkSize, ParentGroup>> {
static constexpr __spv::Scope::Flag value = group_scope<ParentGroup>::value;
};

template <typename ParentGroup>
struct group_scope<sycl::ext::oneapi::experimental::tangle_group<ParentGroup>> {
struct group_scope<sycl::ext::oneapi::experimental::tangle<ParentGroup>> {
static constexpr __spv::Scope::Flag value = group_scope<ParentGroup>::value;
};

Expand Down Expand Up @@ -174,18 +173,17 @@ bool GroupAll(ext::oneapi::experimental::ballot_group<ParentGroup> g,
return __spirv_GroupNonUniformAll(group_scope<ParentGroup>::value, pred);
}
}
template <size_t PartitionSize, typename ParentGroup>
bool GroupAll(
ext::oneapi::experimental::fixed_size_group<PartitionSize, ParentGroup>,
bool pred) {
template <size_t ChunkSize, typename ParentGroup>
bool GroupAll(ext::oneapi::experimental::chunk<ChunkSize, ParentGroup>,
bool pred) {
// GroupNonUniformAll doesn't support cluster size, so use a reduction
return __spirv_GroupNonUniformBitwiseAnd(
group_scope<ParentGroup>::value,
static_cast<uint32_t>(__spv::GroupOperation::ClusteredReduce),
static_cast<uint32_t>(pred), PartitionSize);
static_cast<uint32_t>(pred), ChunkSize);
}
template <typename ParentGroup>
bool GroupAll(ext::oneapi::experimental::tangle_group<ParentGroup>, bool pred) {
bool GroupAll(ext::oneapi::experimental::tangle<ParentGroup>, bool pred) {
return __spirv_GroupNonUniformAll(group_scope<ParentGroup>::value, pred);
}

Expand All @@ -210,18 +208,17 @@ bool GroupAny(ext::oneapi::experimental::ballot_group<ParentGroup> g,
return __spirv_GroupNonUniformAny(group_scope<ParentGroup>::value, pred);
}
}
template <size_t PartitionSize, typename ParentGroup>
bool GroupAny(
ext::oneapi::experimental::fixed_size_group<PartitionSize, ParentGroup>,
bool pred) {
template <size_t ChunkSize, typename ParentGroup>
bool GroupAny(ext::oneapi::experimental::chunk<ChunkSize, ParentGroup>,
bool pred) {
// GroupNonUniformAny doesn't support cluster size, so use a reduction
return __spirv_GroupNonUniformBitwiseOr(
group_scope<ParentGroup>::value,
static_cast<uint32_t>(__spv::GroupOperation::ClusteredReduce),
static_cast<uint32_t>(pred), PartitionSize);
static_cast<uint32_t>(pred), ChunkSize);
}
template <typename ParentGroup>
bool GroupAny(ext::oneapi::experimental::tangle_group<ParentGroup>, bool pred) {
bool GroupAny(ext::oneapi::experimental::tangle<ParentGroup>, bool pred) {
return __spirv_GroupNonUniformAny(group_scope<ParentGroup>::value, pred);
}
bool GroupAny(const ext::oneapi::experimental::opportunistic_group &,
Expand Down Expand Up @@ -327,12 +324,12 @@ GroupBroadcast(sycl::ext::oneapi::experimental::ballot_group<ParentGroup> g,
WideOCLX, OCLId);
}
}
template <size_t PartitionSize, typename ParentGroup, typename T, typename IdT>
EnableIfNativeBroadcast<T, IdT> GroupBroadcast(
ext::oneapi::experimental::fixed_size_group<PartitionSize, ParentGroup> g,
T x, IdT local_id) {
template <size_t ChunkSize, typename ParentGroup, typename T, typename IdT>
EnableIfNativeBroadcast<T, IdT>
GroupBroadcast(ext::oneapi::experimental::chunk<ChunkSize, ParentGroup> g, T x,
IdT local_id) {
// Remap local_id to its original numbering in ParentGroup
auto LocalId = g.get_group_linear_id() * PartitionSize + local_id;
auto LocalId = g.get_group_linear_id() * ChunkSize + local_id;

// TODO: Refactor to avoid duplication after design settles.
auto GroupLocalId = static_cast<typename GroupId<ParentGroup>::type>(LocalId);
Expand All @@ -341,15 +338,15 @@ EnableIfNativeBroadcast<T, IdT> GroupBroadcast(
auto OCLId = detail::convertToOpenCLType(GroupLocalId);

// NonUniformBroadcast requires Id to be dynamically uniform, which does not
// hold here; each partition is broadcasting a separate index. We could
// hold here; each chunk is broadcasting a separate index. We could
// fallback to either NonUniformShuffle or a NonUniformBroadcast per
// partition, and it's unclear which will be faster in practice.
// chunk, and it's unclear which will be faster in practice.
return __spirv_GroupNonUniformShuffle(group_scope<ParentGroup>::value,
WideOCLX, OCLId);
}
template <typename ParentGroup, typename T, typename IdT>
EnableIfNativeBroadcast<T, IdT>
GroupBroadcast(ext::oneapi::experimental::tangle_group<ParentGroup> g, T x,
GroupBroadcast(ext::oneapi::experimental::tangle<ParentGroup> g, T x,
IdT local_id) {
// Remap local_id to its original numbering in ParentGroup.
auto LocalId = detail::IdToMaskPosition(g, local_id);
Expand Down Expand Up @@ -888,7 +885,7 @@ inline uint32_t MapShuffleID(GroupT g, id<1> local_id) {
if constexpr (is_tangle_or_opportunistic_group<GroupT>::value ||
is_ballot_group<GroupT>::value)
return detail::IdToMaskPosition(g, local_id);
else if constexpr (is_fixed_size_group<GroupT>::value)
else if constexpr (is_chunk<GroupT>::value)
return g.get_group_linear_id() * g.get_local_range().size() + local_id;
else
return local_id.get(0);
Expand Down Expand Up @@ -983,7 +980,7 @@ EnableIfNativeShuffle<T> ShuffleXor(GroupT g, T x, id<1> mask) {
if constexpr (ext::oneapi::experimental::is_user_constructed_group_v<
GroupT>) {
auto MemberMask = detail::ExtractMask(detail::GetMask(g))[0];
if constexpr (is_fixed_size_group_v<GroupT>) {
if constexpr (is_chunk_v<GroupT>) {
return cuda_shfl_sync_bfly_i32(MemberMask, x,
static_cast<uint32_t>(mask.get(0)), 0x1f);

Expand Down Expand Up @@ -1031,7 +1028,7 @@ EnableIfNativeShuffle<T> ShuffleDown(GroupT g, T x, uint32_t delta) {
if constexpr (ext::oneapi::experimental::is_user_constructed_group_v<
GroupT>) {
auto MemberMask = detail::ExtractMask(detail::GetMask(g))[0];
if constexpr (is_fixed_size_group_v<GroupT>) {
if constexpr (is_chunk_v<GroupT>) {
return cuda_shfl_sync_down_i32(MemberMask, x, delta, 31);
} else {
unsigned localSetBit = g.get_local_id()[0] + 1;
Expand Down Expand Up @@ -1075,7 +1072,7 @@ EnableIfNativeShuffle<T> ShuffleUp(GroupT g, T x, uint32_t delta) {
if constexpr (ext::oneapi::experimental::is_user_constructed_group_v<
GroupT>) {
auto MemberMask = detail::ExtractMask(detail::GetMask(g))[0];
if constexpr (is_fixed_size_group_v<GroupT>) {
if constexpr (is_chunk_v<GroupT>) {
return cuda_shfl_sync_up_i32(MemberMask, x, delta, 0);
} else {
unsigned localSetBit = g.get_local_id()[0] + 1;
Expand Down Expand Up @@ -1298,12 +1295,10 @@ ControlBarrier(Group g, memory_scope FenceScope, memory_order Order) {
} \
} \
\
template <__spv::GroupOperation Op, size_t PartitionSize, \
typename ParentGroup, typename T> \
template <__spv::GroupOperation Op, size_t ChunkSize, typename ParentGroup, \
typename T> \
inline T Group##Instruction( \
ext::oneapi::experimental::fixed_size_group<PartitionSize, ParentGroup> \
g, \
T x) { \
ext::oneapi::experimental::chunk<ChunkSize, ParentGroup> g, T x) { \
using ConvertedT = detail::ConvertToOpenCLType_t<T>; \
\
using OCLT = std::conditional_t< \
Expand All @@ -1321,7 +1316,7 @@ ControlBarrier(Group g, memory_scope FenceScope, memory_order Order) {
constexpr auto OpInt = \
static_cast<unsigned int>(__spv::GroupOperation::ClusteredReduce); \
return __spirv_GroupNonUniform##Instruction(Scope, OpInt, Arg, \
PartitionSize); \
ChunkSize); \
} else { \
T tmp; \
for (size_t Cluster = 0; Cluster < g.get_group_linear_range(); \
Expand Down
7 changes: 3 additions & 4 deletions sycl/include/sycl/detail/type_traits.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,9 @@
namespace sycl {
inline namespace _V1 {
namespace detail {
template <class T> struct is_fixed_size_group : std::false_type {};
template <class T> struct is_chunk : std::false_type {};

template <class T>
inline constexpr bool is_fixed_size_group_v = is_fixed_size_group<T>::value;
template <class T> inline constexpr bool is_chunk_v = is_chunk<T>::value;

template <typename VecT, typename OperationLeftT, typename OperationRightT,
template <typename> class OperationCurrentT, int... Indexes>
Expand Down Expand Up @@ -157,7 +156,7 @@ template <typename T, int N> struct get_elem_type_unqual<vec<T, N>> {
template <typename VecT, typename OperationLeftT, typename OperationRightT,
template <typename> class OperationCurrentT, int... Indexes>
struct get_elem_type_unqual<SwizzleOp<VecT, OperationLeftT, OperationRightT,
OperationCurrentT, Indexes...>> {
OperationCurrentT, Indexes...>> {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Strictly speaking, there should be no unrelated formatting changes, but in this particular case it is more of a nitpicking and can be ignored.

Copy link
Contributor Author

@AndreiZibrov AndreiZibrov Nov 25, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

the formatting task failed until I called clang-format -i per changed file

upd:
not-strictly:

cd ../llvm &&
git diff --cached --name-only --diff-filter=ACM | grep '\.[ch]pp$' | xargs -I{} clang-format -i

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Topic will be resolved with next commit.

The clang-format tool could be used for essential code as Alexei Sachkov has shared

Previously used way is patching only the changed files but not the file regions

export PATH="$(pwd)/build/bin:$PATH"
git diff -U0 --no-color --relative HEAD^ | $(find -name clang-format-diff.py) -p1 -i

using type = typename get_elem_type_unqual<std::remove_cv_t<VecT>>::type;
};

Expand Down
Loading