From 082e2c573f00dd6947136663b79d1a52838f87bb Mon Sep 17 00:00:00 2001 From: "Ptak, Slawomir" Date: Mon, 26 May 2025 08:09:46 +0000 Subject: [PATCH] [WIP] No handler submit --- .../sycl/khr/free_function_commands.hpp | 19 ++- sycl/include/sycl/queue.hpp | 113 ++++++++++++++++++ sycl/source/detail/queue_impl.cpp | 113 ++++++++++++++++++ sycl/source/detail/queue_impl.hpp | 12 ++ sycl/source/queue.cpp | 27 +++++ 5 files changed, 274 insertions(+), 10 deletions(-) diff --git a/sycl/include/sycl/khr/free_function_commands.hpp b/sycl/include/sycl/khr/free_function_commands.hpp index 4138edd5821e6..eb99c0d124525 100644 --- a/sycl/include/sycl/khr/free_function_commands.hpp +++ b/sycl/include/sycl/khr/free_function_commands.hpp @@ -5,6 +5,7 @@ namespace sycl { inline namespace _V1 { +#define __DPCPP_ENABLE_UNFINISHED_KHR_EXTENSIONS #ifdef __DPCPP_ENABLE_UNFINISHED_KHR_EXTENSIONS namespace khr { @@ -153,27 +154,24 @@ void launch_grouped(const queue &q, range<1> r, range<1> size, const KernelType &k, const sycl::detail::code_location &codeLoc = sycl::detail::code_location::current()) { - submit( - q, [&](handler &h) { launch_grouped(h, r, size, k); }, - codeLoc); + (void)codeLoc; + q.parallel_for_no_handler(nd_range<1>(r, size), k); } template void launch_grouped(const queue &q, range<2> r, range<2> size, const KernelType &k, const sycl::detail::code_location &codeLoc = sycl::detail::code_location::current()) { - submit( - q, [&](handler &h) { launch_grouped(h, r, size, k); }, - codeLoc); + (void)codeLoc; + q.parallel_for_no_handler(nd_range<2>(r, size), k); } template void launch_grouped(const queue &q, range<3> r, range<3> size, const KernelType &k, const sycl::detail::code_location &codeLoc = sycl::detail::code_location::current()) { - submit( - q, [&](handler &h) { launch_grouped(h, r, size, k); }, - codeLoc); + (void)codeLoc; + q.parallel_for_no_handler(nd_range<3>(r, size), k); } template @@ -283,7 +281,8 @@ template void launch_task(const sycl::queue &q, const KernelType &k, const sycl::detail::code_location &codeLoc = sycl::detail::code_location::current()) { - submit(q, [&](handler &h) { launch_task(h, k); }, codeLoc); + (void)codeLoc; + q.single_task_no_handler(k); } template diff --git a/sycl/include/sycl/queue.hpp b/sycl/include/sycl/queue.hpp index a0dbdf5c540e8..2ec6d2e46ddf4 100644 --- a/sycl/include/sycl/queue.hpp +++ b/sycl/include/sycl/queue.hpp @@ -2784,6 +2784,112 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { CodeLoc); } + // no_handler + +private: + // NOTE: the name of this function - "kernel_single_task" - is used by the + // Front End to determine kernel invocation kind. + template +#ifdef __SYCL_DEVICE_ONLY__ + [[__sycl_detail__::add_ir_attributes_function( + "sycl-single-task", + ext::oneapi::experimental::detail::PropertyMetaInfo::name..., + nullptr, + ext::oneapi::experimental::detail::PropertyMetaInfo::value...)]] +#endif + + __SYCL_KERNEL_ATTR__ static void + kernel_single_task(const KernelType &KernelFunc) { +#ifdef __SYCL_DEVICE_ONLY__ + KernelFunc(); +#else + (void)KernelFunc; +#endif + } + + // NOTE: the name of these functions - "kernel_parallel_for" - are used by the + // Front End to determine kernel invocation kind. + template +#ifdef __SYCL_DEVICE_ONLY__ + [[__sycl_detail__::add_ir_attributes_function( + ext::oneapi::experimental::detail::PropertyMetaInfo::name..., + ext::oneapi::experimental::detail::PropertyMetaInfo::value...)]] +#endif + __SYCL_KERNEL_ATTR__ static void + kernel_parallel_for(const KernelType &KernelFunc) { +#ifdef __SYCL_DEVICE_ONLY__ + KernelFunc(detail::Builder::getElement(detail::declptr())); +#else + (void)KernelFunc; +#endif + } + + template static sycl::range<3> padRange(sycl::range Range) { + if constexpr (Dims == 3) { + return Range; + } else { + sycl::range<3> Res{0, 0, 0}; + for (int I = 0; I < Dims; ++I) + Res[I] = Range[I]; + return Res; + } + } + + template static sycl::id<3> padId(sycl::id Id) { + if constexpr (Dims == 3) { + return Id; + } else { + sycl::id<3> Res{0, 0, 0}; + for (int I = 0; I < Dims; ++I) + Res[I] = Id[I]; + return Res; + } + } + + template + void submit_no_handler(nd_range Range, const KernelType &KernelFunc) const { + + using NameT = + typename detail::get_kernel_name_t::name; + + const char *KernelN = detail::getKernelName(); + KernelType Kernel = KernelFunc; + void *KernelFuncPtr = reinterpret_cast(&Kernel); + int KernelNumParams = detail::getKernelNumParams(); + detail::kernel_param_desc_t (*KernelParamDescGetter)(int) = &(detail::getKernelParamDesc); + bool IsKernelESIMD = detail::isKernelESIMD(); + bool HasSpecialCapt = detail::hasSpecialCaptures(); + detail::KernelNameBasedCacheT *KernelNameBasedCachePtr = detail::getKernelNameBasedCache(); + + assert(HasSpecialCapt == false); + assert(IsKernelESIMD == false); + + submit_no_handler_impl(Range, KernelN, KernelFuncPtr, KernelNumParams, KernelParamDescGetter, + KernelNameBasedCachePtr); + } + +public: + /// single_task version not using handler + template + void single_task_no_handler(const KernelType &KernelFunc) const { + + kernel_single_task(KernelFunc); + submit_no_handler(nd_range<1>{}, KernelFunc); + } + + template + void parallel_for_no_handler(nd_range Range, const KernelType &KernelFunc) const { + + kernel_parallel_for, KernelType, + ext::oneapi::experimental::empty_properties_t>(KernelFunc); + submit_no_handler(Range, KernelFunc); + } + + + /// parallel_for version with a kernel represented as a lambda + range that /// specifies global size only. /// @@ -3686,6 +3792,13 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { const detail::code_location &CodeLoc, bool IsTopCodeLoc) const; + // no_handler + + template + void submit_no_handler_impl(nd_range Range, const char *KernelName, void *KernelFunc, + int KernelNumParams, detail::kernel_param_desc_t (*KernelParamDescGetter)(int), + detail::KernelNameBasedCacheT *KernelNameBasedCachePtr) const; + /// Submits a command group function object to the queue, in order to be /// scheduled for execution on the device. /// diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index 193eee39b5ce4..9c37faf84327a 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -415,6 +415,119 @@ queue_impl::submit_impl(const detail::type_erased_cgfo_ty &CGF, } #endif +// no_handler + +void queue_impl::extractArgsAndReqsFromLambda( + char *LambdaPtr, detail::kernel_param_desc_t (*ParamDescGetter)(int), + size_t NumKernelParams, std::vector &Args) { + size_t IndexShift = 0; + + Args.reserve(NumKernelParams); + + for (size_t I = 0; I < NumKernelParams; ++I) { + detail::kernel_param_desc_t ParamDesc = ParamDescGetter(I); + void *Ptr = LambdaPtr + ParamDesc.offset; + const detail::kernel_param_kind_t &Kind = ParamDesc.kind; + const int &Size = ParamDesc.info; + + Args.emplace_back(Kind, Ptr, Size, I + IndexShift); + } +} + +void queue_impl::submit_no_handler( + const std::shared_ptr &Self, + detail::NDRDescT NDRDesc, const char *KernelName, + void *KernelFunc, int KernelNumParams, + detail::kernel_param_desc_t (*KernelParamDescGetter)(int), + detail::KernelNameBasedCacheT *KernelNameBasedCachePtr) { + + std::vector RawEvents; + std::vector Args; + + assert(!MQueue->hasCommandGraph()); + + // TODO external event + + bool KernelFastPath = true; + + std::unique_lock Lock(MMutex); + + EventImplPtr &LastEvent = MDefaultGraphDeps.LastEventPtr; + + if (isInOrder() && LastEvent && !Scheduler::CheckEventReadiness(MContext, LastEvent)) { + KernelFastPath = false; + } + + if (KernelFastPath) { + EventImplPtr &LastEvent = MDefaultGraphDeps.LastEventPtr; + + if (isInOrder() && LastEvent && LastEvent->getHandle()) { + RawEvents.push_back(LastEvent->getHandle()); + } + + enqueueImpKernel( + Self, + NDRDesc, // MNDRDesc + Args, + nullptr, // KernelBundleImpPtr + nullptr, // MKernel + KernelName, + KernelNameBasedCachePtr, // MKernelNameBasedCachePtr + RawEvents, + nullptr, // out event + nullptr, // getMemAllocationFunc + UR_KERNEL_CACHE_CONFIG_DEFAULT, // MKernelCacheConfig + false, // MKernelIsCooperative + false, // MKernelUsesClusterLaunch + 0, // MKernelWorkGroupMemorySize + nullptr, // BinImage + KernelFunc, // MKernelFuncPtr + KernelNumParams, // MKernelNumArgs + KernelParamDescGetter, // MKernelParamDescGetter + false); // MKernelHasSpecialCaptures + } else { + std::unique_ptr CommandGroup; + detail::CG::StorageInitHelper CGData; + std::vector Args; + std::vector> StreamStorage; + std::vector> AuxiliaryResources; + detail::code_location CodeLoc = {}; + + extractArgsAndReqsFromLambda((char *)KernelFunc, KernelParamDescGetter, + KernelNumParams, Args); + + EventImplPtr &LastEvent = MDefaultGraphDeps.LastEventPtr; + CGData.MEvents.push_back(LastEvent); + + CommandGroup.reset(new detail::CGExecKernel( + std::move(NDRDesc), + nullptr, // MHostKernel + nullptr, // MKernel + nullptr, // MKernelBundle + std::move(CGData), // CGData + std::move(Args), // MArgs + KernelName, // MKernelName + KernelNameBasedCachePtr, // MKernelNameBasedCachePtr + std::move(StreamStorage), // MStreamStorage + std::move(AuxiliaryResources), // MAuxiliaryResources + detail::CGType::Kernel, + UR_KERNEL_CACHE_CONFIG_DEFAULT, // MKernelCacheConfig + false, // MKernelIsCooperative + false, // MKernelUsesClusterLaunch + 0, // MKernelWorkGroupMemorySize + CodeLoc)); // MCodeLoc + + detail::EventImplPtr EventImpl = detail::Scheduler::getInstance().addCG( + std::move(CommandGroup), + Self, // MQueue + false); // MEventNeeded + + if (isInOrder()) { + MDefaultGraphDeps.LastEventPtr = EventImpl; + } + } +} + template event queue_impl::submitWithHandler(const std::vector &DepEvents, bool CallerNeedsEvent, diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp index 99490ba2851c4..8461c64f83dc0 100644 --- a/sycl/source/detail/queue_impl.hpp +++ b/sycl/source/detail/queue_impl.hpp @@ -378,6 +378,18 @@ class queue_impl : public std::enable_shared_from_this { /*CallerNeedsEvent=*/false, Loc, IsTopCodeLoc, SubmitInfo); } + // no_handler +private: + void extractArgsAndReqsFromLambda( + char *LambdaPtr, detail::kernel_param_desc_t (*ParamDescGetter)(int), + size_t NumKernelParams, std::vector &Args); + +public: + void submit_no_handler(const std::shared_ptr &Self, + detail::NDRDescT NDRDesc, const char *KernelName, void *KernelFunc, int KernelNumParams, + detail::kernel_param_desc_t (*KernelParamDescGetter)(int), + detail::KernelNameBasedCacheT *KernelNameBasedCachePtr); + /// Performs a blocking wait for the completion of all enqueued tasks in the /// queue. /// diff --git a/sycl/source/queue.cpp b/sycl/source/queue.cpp index 8b39334f5b432..ed2468fc2f8dd 100644 --- a/sycl/source/queue.cpp +++ b/sycl/source/queue.cpp @@ -319,6 +319,21 @@ void queue::submit_without_event_impl( impl->submit_without_event(CGH, SubmitInfo, CodeLoc, IsTopCodeLoc); } +// no_handler + +template +void queue::submit_no_handler_impl(nd_range Range, const char *KernelName, void *KernelFunc, + int KernelNumParams, detail::kernel_param_desc_t (*KernelParamDescGetter)(int), + detail::KernelNameBasedCacheT *KernelNameBasedCachePtr) const { + + detail::NDRDescT NDRDesc{padRange(Range.get_global_range()), + padRange(Range.get_local_range()), + padId(Range.get_offset()), Dims}; + + impl->submit_no_handler(impl, NDRDesc, KernelName, KernelFunc, KernelNumParams, + KernelParamDescGetter, KernelNameBasedCachePtr); +} + void queue::wait_proxy(const detail::code_location &CodeLoc) { impl->wait(CodeLoc); } @@ -474,6 +489,18 @@ void queue::ext_oneapi_set_external_event(const event &external_event) { const property_list &queue::getPropList() const { return impl->getPropList(); } +template void queue::submit_no_handler_impl<1>(nd_range<1> Range, const char *KernelName, void *KernelFunc, + int KernelNumParams, detail::kernel_param_desc_t (*KernelParamDescGetter)(int), + detail::KernelNameBasedCacheT *KernelNameBasedCachePtr) const; + +template void queue::submit_no_handler_impl<2>(nd_range<2> Range, const char *KernelName, void *KernelFunc, + int KernelNumParams, detail::kernel_param_desc_t (*KernelParamDescGetter)(int), + detail::KernelNameBasedCacheT *KernelNameBasedCachePtr) const; + +template void queue::submit_no_handler_impl<3>(nd_range<3> Range, const char *KernelName, void *KernelFunc, + int KernelNumParams, detail::kernel_param_desc_t (*KernelParamDescGetter)(int), + detail::KernelNameBasedCacheT *KernelNameBasedCachePtr) const; + } // namespace _V1 } // namespace sycl