-
Notifications
You must be signed in to change notification settings - Fork 787
Enhance querying kernels preferred wgsize #16186
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: sycl
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
|
@@ -144,8 +144,8 @@ __SYCL_EXPORT size_t reduGetMaxWGSize(std::shared_ptr<queue_impl> Queue, | |||||
size_t LocalMemBytesPerWorkItem); | ||||||
__SYCL_EXPORT size_t reduComputeWGSize(size_t NWorkItems, size_t MaxWGSize, | ||||||
size_t &NWorkGroups); | ||||||
__SYCL_EXPORT size_t reduGetPreferredWGSize(std::shared_ptr<queue_impl> &Queue, | ||||||
size_t LocalMemBytesPerWorkItem); | ||||||
__SYCL_EXPORT size_t reduGetPreferredDeviceWGSize( | ||||||
std::shared_ptr<queue_impl> &Queue, size_t LocalMemBytesPerWorkItem); | ||||||
|
||||||
template <typename T, class BinaryOperation, bool IsOptional> | ||||||
class ReducerElement; | ||||||
|
@@ -1200,6 +1200,25 @@ void reduSaveFinalResultToUserMem(handler &CGH, Reduction &Redu) { | |||||
}); | ||||||
} | ||||||
|
||||||
template <typename KernelName> | ||||||
size_t reduGetPreferredKernelWGSize(std::shared_ptr<queue_impl> &Queue) { | ||||||
using namespace info::kernel_device_specific; | ||||||
auto SyclQueue = createSyclObjFromImpl<queue>(Queue); | ||||||
auto Ctx = SyclQueue.get_context(); | ||||||
auto Dev = SyclQueue.get_device(); | ||||||
size_t MaxWGSize = SIZE_MAX; | ||||||
constexpr bool IsUndefinedKernelName{std::is_same_v<KernelName, auto_name>}; | ||||||
|
||||||
if (!IsUndefinedKernelName) { | ||||||
auto ExecBundle = | ||||||
get_kernel_bundle<KernelName, bundle_state::executable>(Ctx, {Dev}); | ||||||
kernel Kernel = ExecBundle.template get_kernel<KernelName>(); | ||||||
MaxWGSize = Kernel.template get_info<work_group_size>(Dev); | ||||||
} | ||||||
|
||||||
return MaxWGSize; | ||||||
} | ||||||
|
||||||
namespace reduction { | ||||||
template <typename KernelName, strategy S, class... Ts> struct MainKrn; | ||||||
template <typename KernelName, strategy S, class... Ts> struct AuxKrn; | ||||||
|
@@ -1302,6 +1321,8 @@ struct NDRangeReduction< | |||||
reduction::strategy::group_reduce_and_last_wg_detection, | ||||||
decltype(NWorkGroupsFinished)>; | ||||||
|
||||||
WGSize = std::min(WGSize, reduGetPreferredKernelWGSize<Name>(Queue)); | ||||||
|
||||||
CGH.parallel_for<Name>(NDRange, Properties, [=](nd_item<1> NDId) { | ||||||
// Call user's functions. Reducer.MValue gets initialized there. | ||||||
typename Reduction::reducer_type Reducer; | ||||||
|
@@ -1515,6 +1536,8 @@ template <> struct NDRangeReduction<reduction::strategy::range_basic> { | |||||
using Name = __sycl_reduction_kernel<reduction::MainKrn, KernelName, | ||||||
reduction::strategy::range_basic>; | ||||||
|
||||||
WGSize = std::min(WGSize, reduGetPreferredKernelWGSize<Name>(Queue)); | ||||||
|
||||||
CGH.parallel_for<Name>(NDRange, Properties, [=](nd_item<1> NDId) { | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If we are recalculating |
||||||
// Call user's functions. Reducer.MValue gets initialized there. | ||||||
reducer_type Reducer = reducer_type(IdentityContainer, BOp); | ||||||
|
@@ -1628,14 +1651,14 @@ struct NDRangeReduction< | |||||
using reducer_type = typename Reduction::reducer_type; | ||||||
using element_type = typename ReducerTraits<reducer_type>::element_type; | ||||||
|
||||||
std::ignore = Queue; | ||||||
using Name = __sycl_reduction_kernel< | ||||||
reduction::MainKrn, KernelName, | ||||||
reduction::strategy::local_mem_tree_and_atomic_cross_wg>; | ||||||
Redu.template withInitializedMem<Name>(CGH, [&](auto Out) { | ||||||
size_t NElements = Reduction::num_elements; | ||||||
size_t WGSize = NDRange.get_local_range().size(); | ||||||
|
||||||
WGSize = std::min(WGSize, reduGetPreferredKernelWGSize<Name>(Queue)); | ||||||
// Use local memory to reduce elements in work-groups into zero-th | ||||||
// element. | ||||||
local_accessor<element_type, 1> LocalReds{WGSize, CGH}; | ||||||
|
@@ -1722,6 +1745,8 @@ struct NDRangeReduction< | |||||
reduction::MainKrn, KernelName, | ||||||
reduction::strategy::group_reduce_and_multiple_kernels>; | ||||||
|
||||||
MaxWGSize = std::min(MaxWGSize, reduGetPreferredKernelWGSize<Name>(Queue)); | ||||||
|
||||||
CGH.parallel_for<Name>(NDRange, Properties, [=](nd_item<Dims> NDIt) { | ||||||
// Call user's functions. Reducer.MValue gets initialized there. | ||||||
typename Reduction::reducer_type Reducer; | ||||||
|
@@ -1781,6 +1806,8 @@ struct NDRangeReduction< | |||||
reduction::AuxKrn, KernelName, | ||||||
reduction::strategy::group_reduce_and_multiple_kernels>; | ||||||
|
||||||
WGSize = std::min(WGSize, reduGetPreferredKernelWGSize<Name>(Queue)); | ||||||
|
||||||
bool IsUpdateOfUserVar = !Reduction::is_usm && | ||||||
!Redu.initializeToIdentity() && | ||||||
NWorkGroups == 1; | ||||||
|
@@ -1874,6 +1901,9 @@ template <> struct NDRangeReduction<reduction::strategy::basic> { | |||||
reduction::strategy::basic, | ||||||
decltype(KernelTag)>; | ||||||
|
||||||
MaxWGSize = | ||||||
std::min(MaxWGSize, reduGetPreferredKernelWGSize<Name>(Queue)); | ||||||
|
||||||
CGH.parallel_for<Name>(NDRange, Properties, [=](nd_item<Dims> NDIt) { | ||||||
// Call user's functions. Reducer.MValue gets initialized there. | ||||||
typename Reduction::reducer_type Reducer = | ||||||
|
@@ -1978,6 +2008,8 @@ template <> struct NDRangeReduction<reduction::strategy::basic> { | |||||
reduction::strategy::basic, | ||||||
decltype(KernelTag)>; | ||||||
|
||||||
WGSize = std::min(WGSize, reduGetPreferredKernelWGSize<Name>(Queue)); | ||||||
|
||||||
range<1> GlobalRange = {UniformPow2WG ? NWorkItems | ||||||
: NWorkGroups * WGSize}; | ||||||
nd_range<1> Range{GlobalRange, range<1>(WGSize)}; | ||||||
|
@@ -2295,8 +2327,9 @@ template <class KernelName, class Accessor> struct NDRangeMulti; | |||||
} // namespace reduction::main_krn | ||||||
template <typename KernelName, typename KernelType, int Dims, | ||||||
typename PropertiesT, typename... Reductions, size_t... Is> | ||||||
void reduCGFuncMulti(handler &CGH, KernelType KernelFunc, | ||||||
const nd_range<Dims> &Range, PropertiesT Properties, | ||||||
void reduCGFuncMulti(handler &CGH, std::shared_ptr<detail::queue_impl> &Queue, | ||||||
KernelType KernelFunc, const nd_range<Dims> &Range, | ||||||
PropertiesT Properties, | ||||||
std::tuple<Reductions...> &ReduTuple, | ||||||
std::index_sequence<Is...> ReduIndices) { | ||||||
size_t WGSize = Range.get_local_range().size(); | ||||||
|
@@ -2334,6 +2367,8 @@ void reduCGFuncMulti(handler &CGH, KernelType KernelFunc, | |||||
reduction::strategy::multi, | ||||||
decltype(KernelTag)>; | ||||||
|
||||||
WGSize = std::min(WGSize, reduGetPreferredKernelWGSize<Name>(Queue)); | ||||||
|
||||||
CGH.parallel_for<Name>(Range, Properties, [=](nd_item<Dims> NDIt) { | ||||||
// We can deduce IsOneWG from the tag type. | ||||||
constexpr bool IsOneWG = | ||||||
|
@@ -2495,7 +2530,8 @@ template <class KernelName, class Predicate> struct Multi; | |||||
} // namespace reduction::aux_krn | ||||||
template <typename KernelName, typename KernelType, typename... Reductions, | ||||||
size_t... Is> | ||||||
size_t reduAuxCGFunc(handler &CGH, size_t NWorkItems, size_t MaxWGSize, | ||||||
size_t reduAuxCGFunc(handler &CGH, std::shared_ptr<queue_impl> &Queue, | ||||||
size_t NWorkItems, size_t MaxWGSize, | ||||||
std::tuple<Reductions...> &ReduTuple, | ||||||
std::index_sequence<Is...> ReduIndices) { | ||||||
size_t NWorkGroups; | ||||||
|
@@ -2533,6 +2569,8 @@ size_t reduAuxCGFunc(handler &CGH, size_t NWorkItems, size_t MaxWGSize, | |||||
using Name = __sycl_reduction_kernel<reduction::AuxKrn, KernelName, | ||||||
reduction::strategy::multi, | ||||||
decltype(Predicate)>; | ||||||
WGSize = std::min(WGSize, reduGetPreferredKernelWGSize<Name>(Queue)); | ||||||
|
||||||
// TODO: Opportunity to parallelize across number of elements | ||||||
range<1> GlobalRange = {HasUniformWG ? NWorkItems : NWorkGroups * WGSize}; | ||||||
nd_range<1> Range{GlobalRange, range<1>(WGSize)}; | ||||||
|
@@ -2617,15 +2655,15 @@ template <> struct NDRangeReduction<reduction::strategy::multi> { | |||||
" than " + | ||||||
std::to_string(MaxWGSize)); | ||||||
|
||||||
reduCGFuncMulti<KernelName>(CGH, KernelFunc, NDRange, Properties, ReduTuple, | ||||||
ReduIndices); | ||||||
reduCGFuncMulti<KernelName>(CGH, Queue, KernelFunc, NDRange, Properties, | ||||||
ReduTuple, ReduIndices); | ||||||
reduction::finalizeHandler(CGH); | ||||||
|
||||||
size_t NWorkItems = NDRange.get_group_range().size(); | ||||||
while (NWorkItems > 1) { | ||||||
reduction::withAuxHandler(CGH, [&](handler &AuxHandler) { | ||||||
NWorkItems = reduAuxCGFunc<KernelName, decltype(KernelFunc)>( | ||||||
AuxHandler, NWorkItems, MaxWGSize, ReduTuple, ReduIndices); | ||||||
AuxHandler, Queue, NWorkItems, MaxWGSize, ReduTuple, ReduIndices); | ||||||
}); | ||||||
} // end while (NWorkItems > 1) | ||||||
} | ||||||
|
@@ -2741,7 +2779,29 @@ void reduction_parallel_for(handler &CGH, range<Dims> Range, | |||||
// TODO: currently the preferred work group size is determined for the given | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should this TODO be updated based on the changes in this PR? |
||||||
// queue/device, while it is safer to use queries to the kernel pre-compiled | ||||||
// for the device. | ||||||
size_t PrefWGSize = reduGetPreferredWGSize(CGH.MQueue, OneElemSize); | ||||||
size_t PrefWGSize = reduGetPreferredDeviceWGSize(CGH.MQueue, OneElemSize); | ||||||
|
||||||
auto SyclQueue = createSyclObjFromImpl<queue>(CGH.MQueue); | ||||||
auto Ctx = SyclQueue.get_context(); | ||||||
auto Dev = SyclQueue.get_device(); | ||||||
|
||||||
// If the reduction kernel has no user-defined name, we cannot query the | ||||||
// exact kernel for the best wgsize, so we query all the reduction kernels for | ||||||
// their wgsize and use the minimum wgsize as a safe and approximate option. | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||
constexpr bool IsUndefinedKernelName{std::is_same_v<KernelName, auto_name>}; | ||||||
if (IsUndefinedKernelName) { | ||||||
std::vector<kernel_id> ReductionKernelIDs = get_kernel_ids(); | ||||||
for (auto KernelID : ReductionKernelIDs) { | ||||||
std::string ReduKernelName = KernelID.get_name(); | ||||||
if (ReduKernelName.find("reduction") != std::string::npos) { | ||||||
auto KB = get_kernel_bundle<bundle_state::executable>(Ctx, {KernelID}); | ||||||
kernel krn = KB.get_kernel(KernelID); | ||||||
using namespace info::kernel_device_specific; | ||||||
size_t MaxSize = krn.template get_info<work_group_size>(Dev); | ||||||
PrefWGSize = std::min(PrefWGSize, MaxSize); | ||||||
} | ||||||
} | ||||||
} | ||||||
|
||||||
size_t NWorkItems = Range.size(); | ||||||
size_t WGSize = std::min(NWorkItems, PrefWGSize); | ||||||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -3294,7 +3294,7 @@ _ZN4sycl3_V16detail22get_kernel_bundle_implERKNS0_7contextERKSt6vectorINS0_6devi | |
_ZN4sycl3_V16detail22get_kernel_bundle_implERKNS0_7contextERKSt6vectorINS0_6deviceESaIS6_EERKS5_INS0_9kernel_idESaISB_EENS0_12bundle_stateE | ||
_ZN4sycl3_V16detail22has_kernel_bundle_implERKNS0_7contextERKSt6vectorINS0_6deviceESaIS6_EENS0_12bundle_stateE | ||
_ZN4sycl3_V16detail22has_kernel_bundle_implERKNS0_7contextERKSt6vectorINS0_6deviceESaIS6_EERKS5_INS0_9kernel_idESaISB_EENS0_12bundle_stateE | ||
_ZN4sycl3_V16detail22reduGetPreferredWGSizeERSt10shared_ptrINS1_10queue_implEEm | ||
_ZN4sycl3_V16detail28reduGetPreferredDeviceWGSizeERSt10shared_ptrINS1_10queue_implEEm | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ABI breaking changes? If so, we need to put them under the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think there is little to no need to rename |
||
_ZN4sycl3_V16detail22removeDuplicateDevicesERKSt6vectorINS0_6deviceESaIS3_EE | ||
_ZN4sycl3_V16detail23constructorNotificationEPvS2_NS0_6access6targetENS3_4modeERKNS1_13code_locationE | ||
_ZN4sycl3_V16detail24find_device_intersectionERKSt6vectorINS0_13kernel_bundleILNS0_12bundle_stateE1EEESaIS5_EE | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Similarly to `reduGetPreferredWGSize` in reduction.cpp, I think this function should probably also respect the `SYCL_REDUCTION_PREFERRED_WORKGROUP_SIZE` environment variable value from `SYCLConfig`.