From 71739a8cd40b5f75c19318bcf9058b990f17d0f3 Mon Sep 17 00:00:00 2001 From: omarahmed1111 Date: Thu, 28 Nov 2024 14:53:15 +0000 Subject: [PATCH] Enhance querying kernels preferred wgsize Co-authored-by: Georgi Mirazchiyski --- sycl/include/sycl/reduction.hpp | 80 +++++++++++++++++++++---- sycl/source/detail/reduction.cpp | 4 +- sycl/test/abi/sycl_symbols_linux.dump | 2 +- sycl/test/abi/sycl_symbols_windows.dump | 2 +- 4 files changed, 74 insertions(+), 14 deletions(-) diff --git a/sycl/include/sycl/reduction.hpp b/sycl/include/sycl/reduction.hpp index f84940d81d162..d651446f28abe 100644 --- a/sycl/include/sycl/reduction.hpp +++ b/sycl/include/sycl/reduction.hpp @@ -144,8 +144,8 @@ __SYCL_EXPORT size_t reduGetMaxWGSize(std::shared_ptr Queue, size_t LocalMemBytesPerWorkItem); __SYCL_EXPORT size_t reduComputeWGSize(size_t NWorkItems, size_t MaxWGSize, size_t &NWorkGroups); -__SYCL_EXPORT size_t reduGetPreferredWGSize(std::shared_ptr &Queue, - size_t LocalMemBytesPerWorkItem); +__SYCL_EXPORT size_t reduGetPreferredDeviceWGSize( + std::shared_ptr &Queue, size_t LocalMemBytesPerWorkItem); template class ReducerElement; @@ -1200,6 +1200,25 @@ void reduSaveFinalResultToUserMem(handler &CGH, Reduction &Redu) { }); } +template +size_t reduGetPreferredKernelWGSize(std::shared_ptr &Queue) { + using namespace info::kernel_device_specific; + auto SyclQueue = createSyclObjFromImpl(Queue); + auto Ctx = SyclQueue.get_context(); + auto Dev = SyclQueue.get_device(); + size_t MaxWGSize = SIZE_MAX; + constexpr bool IsUndefinedKernelName{std::is_same_v}; + + if (!IsUndefinedKernelName) { + auto ExecBundle = + get_kernel_bundle(Ctx, {Dev}); + kernel Kernel = ExecBundle.template get_kernel(); + MaxWGSize = Kernel.template get_info(Dev); + } + + return MaxWGSize; +} + namespace reduction { template struct MainKrn; template struct AuxKrn; @@ -1302,6 +1321,8 @@ struct NDRangeReduction< reduction::strategy::group_reduce_and_last_wg_detection, decltype(NWorkGroupsFinished)>; + WGSize = std::min(WGSize, reduGetPreferredKernelWGSize(Queue)); + CGH.parallel_for(NDRange, Properties, [=](nd_item<1> NDId) { // Call user's functions. Reducer.MValue gets initialized there. typename Reduction::reducer_type Reducer; @@ -1515,6 +1536,8 @@ template <> struct NDRangeReduction { using Name = __sycl_reduction_kernel; + WGSize = std::min(WGSize, reduGetPreferredKernelWGSize(Queue)); + CGH.parallel_for(NDRange, Properties, [=](nd_item<1> NDId) { // Call user's functions. Reducer.MValue gets initialized there. reducer_type Reducer = reducer_type(IdentityContainer, BOp); @@ -1628,7 +1651,6 @@ struct NDRangeReduction< using reducer_type = typename Reduction::reducer_type; using element_type = typename ReducerTraits::element_type; - std::ignore = Queue; using Name = __sycl_reduction_kernel< reduction::MainKrn, KernelName, reduction::strategy::local_mem_tree_and_atomic_cross_wg>; @@ -1636,6 +1658,7 @@ struct NDRangeReduction< size_t NElements = Reduction::num_elements; size_t WGSize = NDRange.get_local_range().size(); + WGSize = std::min(WGSize, reduGetPreferredKernelWGSize(Queue)); // Use local memory to reduce elements in work-groups into zero-th // element. local_accessor LocalReds{WGSize, CGH}; @@ -1722,6 +1745,8 @@ struct NDRangeReduction< reduction::MainKrn, KernelName, reduction::strategy::group_reduce_and_multiple_kernels>; + MaxWGSize = std::min(MaxWGSize, reduGetPreferredKernelWGSize(Queue)); + CGH.parallel_for(NDRange, Properties, [=](nd_item NDIt) { // Call user's functions. Reducer.MValue gets initialized there. typename Reduction::reducer_type Reducer; @@ -1781,6 +1806,8 @@ struct NDRangeReduction< reduction::AuxKrn, KernelName, reduction::strategy::group_reduce_and_multiple_kernels>; + WGSize = std::min(WGSize, reduGetPreferredKernelWGSize(Queue)); + bool IsUpdateOfUserVar = !Reduction::is_usm && !Redu.initializeToIdentity() && NWorkGroups == 1; @@ -1874,6 +1901,9 @@ template <> struct NDRangeReduction { reduction::strategy::basic, decltype(KernelTag)>; + MaxWGSize = + std::min(MaxWGSize, reduGetPreferredKernelWGSize(Queue)); + CGH.parallel_for(NDRange, Properties, [=](nd_item NDIt) { // Call user's functions. Reducer.MValue gets initialized there. typename Reduction::reducer_type Reducer = @@ -1978,6 +2008,8 @@ template <> struct NDRangeReduction { reduction::strategy::basic, decltype(KernelTag)>; + WGSize = std::min(WGSize, reduGetPreferredKernelWGSize(Queue)); + range<1> GlobalRange = {UniformPow2WG ? NWorkItems : NWorkGroups * WGSize}; nd_range<1> Range{GlobalRange, range<1>(WGSize)}; @@ -2295,8 +2327,9 @@ template struct NDRangeMulti; } // namespace reduction::main_krn template -void reduCGFuncMulti(handler &CGH, KernelType KernelFunc, - const nd_range &Range, PropertiesT Properties, +void reduCGFuncMulti(handler &CGH, std::shared_ptr &Queue, + KernelType KernelFunc, const nd_range &Range, + PropertiesT Properties, std::tuple &ReduTuple, std::index_sequence ReduIndices) { size_t WGSize = Range.get_local_range().size(); @@ -2334,6 +2367,8 @@ void reduCGFuncMulti(handler &CGH, KernelType KernelFunc, reduction::strategy::multi, decltype(KernelTag)>; + WGSize = std::min(WGSize, reduGetPreferredKernelWGSize(Queue)); + CGH.parallel_for(Range, Properties, [=](nd_item NDIt) { // We can deduce IsOneWG from the tag type. constexpr bool IsOneWG = @@ -2495,7 +2530,8 @@ template struct Multi; } // namespace reduction::aux_krn template -size_t reduAuxCGFunc(handler &CGH, size_t NWorkItems, size_t MaxWGSize, +size_t reduAuxCGFunc(handler &CGH, std::shared_ptr &Queue, + size_t NWorkItems, size_t MaxWGSize, std::tuple &ReduTuple, std::index_sequence ReduIndices) { size_t NWorkGroups; @@ -2533,6 +2569,8 @@ size_t reduAuxCGFunc(handler &CGH, size_t NWorkItems, size_t MaxWGSize, using Name = __sycl_reduction_kernel; + WGSize = std::min(WGSize, reduGetPreferredKernelWGSize(Queue)); + // TODO: Opportunity to parallelize across number of elements range<1> GlobalRange = {HasUniformWG ? NWorkItems : NWorkGroups * WGSize}; nd_range<1> Range{GlobalRange, range<1>(WGSize)}; @@ -2617,15 +2655,15 @@ template <> struct NDRangeReduction { " than " + std::to_string(MaxWGSize)); - reduCGFuncMulti(CGH, KernelFunc, NDRange, Properties, ReduTuple, - ReduIndices); + reduCGFuncMulti(CGH, Queue, KernelFunc, NDRange, Properties, + ReduTuple, ReduIndices); reduction::finalizeHandler(CGH); size_t NWorkItems = NDRange.get_group_range().size(); while (NWorkItems > 1) { reduction::withAuxHandler(CGH, [&](handler &AuxHandler) { NWorkItems = reduAuxCGFunc( - AuxHandler, NWorkItems, MaxWGSize, ReduTuple, ReduIndices); + AuxHandler, Queue, NWorkItems, MaxWGSize, ReduTuple, ReduIndices); }); } // end while (NWorkItems > 1) } @@ -2741,7 +2779,29 @@ void reduction_parallel_for(handler &CGH, range Range, // TODO: currently the preferred work group size is determined for the given // queue/device, while it is safer to use queries to the kernel pre-compiled // for the device. - size_t PrefWGSize = reduGetPreferredWGSize(CGH.MQueue, OneElemSize); + size_t PrefWGSize = reduGetPreferredDeviceWGSize(CGH.MQueue, OneElemSize); + + auto SyclQueue = createSyclObjFromImpl(CGH.MQueue); + auto Ctx = SyclQueue.get_context(); + auto Dev = SyclQueue.get_device(); + + // If the reduction kernel is not name defined, we won't be able to query the + // exact kernel for the best wgsize, so we query all the reduction kernels for + // thier wgsize and use the minimum wgsize as a safe and approximate option. + constexpr bool IsUndefinedKernelName{std::is_same_v}; + if (IsUndefinedKernelName) { + std::vector ReductionKernelIDs = get_kernel_ids(); + for (auto KernelID : ReductionKernelIDs) { + std::string ReduKernelName = KernelID.get_name(); + if (ReduKernelName.find("reduction") != std::string::npos) { + auto KB = get_kernel_bundle(Ctx, {KernelID}); + kernel krn = KB.get_kernel(KernelID); + using namespace info::kernel_device_specific; + size_t MaxSize = krn.template get_info(Dev); + PrefWGSize = std::min(PrefWGSize, MaxSize); + } + } + } size_t NWorkItems = Range.size(); size_t WGSize = std::min(NWorkItems, PrefWGSize); diff --git a/sycl/source/detail/reduction.cpp b/sycl/source/detail/reduction.cpp index 6e2d49d909f29..ef02bdba2d966 100644 --- a/sycl/source/detail/reduction.cpp +++ b/sycl/source/detail/reduction.cpp @@ -113,8 +113,8 @@ reduGetMaxWGSize(std::shared_ptr Queue, return WGSize; } -__SYCL_EXPORT size_t reduGetPreferredWGSize(std::shared_ptr &Queue, - size_t LocalMemBytesPerWorkItem) { +__SYCL_EXPORT size_t reduGetPreferredDeviceWGSize( + std::shared_ptr &Queue, size_t LocalMemBytesPerWorkItem) { // TODO: Graphs extension explicit API uses a handler with a null queue to // process CGFs, in future we should have access to the device so we can // correctly calculate this. diff --git a/sycl/test/abi/sycl_symbols_linux.dump b/sycl/test/abi/sycl_symbols_linux.dump index a5134a7a524ca..2570f0f195dfd 100644 --- a/sycl/test/abi/sycl_symbols_linux.dump +++ b/sycl/test/abi/sycl_symbols_linux.dump @@ -3294,7 +3294,7 @@ _ZN4sycl3_V16detail22get_kernel_bundle_implERKNS0_7contextERKSt6vectorINS0_6devi _ZN4sycl3_V16detail22get_kernel_bundle_implERKNS0_7contextERKSt6vectorINS0_6deviceESaIS6_EERKS5_INS0_9kernel_idESaISB_EENS0_12bundle_stateE _ZN4sycl3_V16detail22has_kernel_bundle_implERKNS0_7contextERKSt6vectorINS0_6deviceESaIS6_EENS0_12bundle_stateE _ZN4sycl3_V16detail22has_kernel_bundle_implERKNS0_7contextERKSt6vectorINS0_6deviceESaIS6_EERKS5_INS0_9kernel_idESaISB_EENS0_12bundle_stateE -_ZN4sycl3_V16detail22reduGetPreferredWGSizeERSt10shared_ptrINS1_10queue_implEEm +_ZN4sycl3_V16detail28reduGetPreferredDeviceWGSizeERSt10shared_ptrINS1_10queue_implEEm _ZN4sycl3_V16detail22removeDuplicateDevicesERKSt6vectorINS0_6deviceESaIS3_EE _ZN4sycl3_V16detail23constructorNotificationEPvS2_NS0_6access6targetENS3_4modeERKNS1_13code_locationE _ZN4sycl3_V16detail24find_device_intersectionERKSt6vectorINS0_13kernel_bundleILNS0_12bundle_stateE1EEESaIS5_EE diff --git a/sycl/test/abi/sycl_symbols_windows.dump b/sycl/test/abi/sycl_symbols_windows.dump index a6e6a5e47c137..afd9f941a4eb7 100644 --- a/sycl/test/abi/sycl_symbols_windows.dump +++ b/sycl/test/abi/sycl_symbols_windows.dump @@ -4206,7 +4206,7 @@ ?reduComputeWGSize@detail@_V1@sycl@@YA_K_K0AEA_K@Z ?reduGetMaxNumConcurrentWorkGroups@detail@_V1@sycl@@YAIV?$shared_ptr@Vqueue_impl@detail@_V1@sycl@@@std@@@Z ?reduGetMaxWGSize@detail@_V1@sycl@@YA_KV?$shared_ptr@Vqueue_impl@detail@_V1@sycl@@@std@@_K@Z -?reduGetPreferredWGSize@detail@_V1@sycl@@YA_KAEAV?$shared_ptr@Vqueue_impl@detail@_V1@sycl@@@std@@_K@Z +?reduGetPreferredDeviceWGSize@detail@_V1@sycl@@YA_KAEAV?$shared_ptr@Vqueue_impl@detail@_V1@sycl@@@std@@_K@Z ?registerDynamicParameter@handler@_V1@sycl@@AEAAXAEAVdynamic_parameter_base@detail@experimental@oneapi@ext@23@H@Z ?release_external_memory@experimental@oneapi@ext@_V1@sycl@@YAXUexternal_mem@12345@AEBVdevice@45@AEBVcontext@45@@Z ?release_external_memory@experimental@oneapi@ext@_V1@sycl@@YAXUexternal_mem@12345@AEBVqueue@45@@Z