From 9189aaddd66cc27b5169d9094304dada97f46f1a Mon Sep 17 00:00:00 2001 From: "Tracy, Benjamin" Date: Fri, 31 Jan 2025 14:29:50 +0100 Subject: [PATCH 1/9] [SYCL][Graph] Enable host-task update in graphs - Update spec wording to allow updating host-task function in graphs - Support host-tasks in dynamic command-groups - Support host-tasks in whole graph update - Add E2E tests for both scenarios - Fix passing incorrect accessors to graph update command after update --- .../sycl_ext_oneapi_graph.asciidoc | 92 +++++++---- sycl/source/detail/graph_impl.cpp | 126 ++++++++++----- sycl/source/detail/graph_impl.hpp | 75 ++++++--- sycl/source/detail/scheduler/commands.cpp | 51 ++++-- .../Graph/Inputs/whole_update_host_task.cpp | 152 ++++++++++++++++++ .../whole_update_host_task_accessor.cpp | 152 ++++++++++++++++++ .../Explicit/whole_update_host_task.cpp | 12 ++ .../whole_update_host_task_accessor.cpp | 10 ++ .../RecordReplay/whole_update_host_task.cpp | 12 ++ .../whole_update_host_task_accessor.cpp | 10 ++ .../Update/dyn_cgf_host_task_accessor.cpp | 152 ++++++++++++++++++ .../Graph/Update/dyn_cgf_host_task_usm.cpp | 151 +++++++++++++++++ 12 files changed, 891 insertions(+), 104 deletions(-) create mode 100644 sycl/test-e2e/Graph/Inputs/whole_update_host_task.cpp create mode 100644 sycl/test-e2e/Graph/Inputs/whole_update_host_task_accessor.cpp create mode 100644 sycl/test-e2e/Graph/Update/Explicit/whole_update_host_task.cpp create mode 100644 sycl/test-e2e/Graph/Update/Explicit/whole_update_host_task_accessor.cpp create mode 100644 sycl/test-e2e/Graph/Update/RecordReplay/whole_update_host_task.cpp create mode 100644 sycl/test-e2e/Graph/Update/RecordReplay/whole_update_host_task_accessor.cpp create mode 100644 sycl/test-e2e/Graph/Update/dyn_cgf_host_task_accessor.cpp create mode 100644 sycl/test-e2e/Graph/Update/dyn_cgf_host_task_usm.cpp diff --git a/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc b/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc index 649c00fb474b5..00c8eb8e0cdf4 100644 --- a/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc +++ b/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc @@ -551,7 +551,7 @@ Parameters: |=== -==== Dynamic Command Groups +==== Dynamic Command Groups [[dynamic-command-groups]] [source,c++] ---- @@ -570,12 +570,13 @@ public: Dynamic command-groups can be added as nodes to a graph. They provide a mechanism that allows updating the command-group function of a node after the graph is finalized. There is always one command-group function in the dynamic -command-group that is set as active, this is the kernel which will execute for -the node when the graph is finalized into an executable state `command_graph`, -and all the other command-group functions in `cgfList` will be ignored. The -executable `command_graph` node can then be updated to a different kernel in -`cgfList`, by selecting a new active index on the dynamic command-group object -and calling the `update(node& node)` method on the executable `command_graph`. +command-group that is set as active, this is the command-group which will +execute for the node when the graph is finalized into an executable state +`command_graph`, and all the other command-group functions in `cgfList` will be +ignored. The executable `command_graph` node can then be updated to a different +kernel in `cgfList`, by selecting a new active index on the dynamic +command-group object and calling the `update(node& node)` method on the +executable `command_graph`. The `dynamic_command_group` class provides the {crs}[common reference semantics]. @@ -584,9 +585,13 @@ about updating command-groups. ===== Limitations -Dynamic command-groups can only contain kernel operations. Trying to construct -a dynamic command-group with functions that contain other operations will -result in an error. +Dynamic command-groups can only contain the following operations: + +* Kernel operations +* <> + +Trying to construct a dynamic command-group with functions that contain other +operations will result in an error. All the command-group functions in a dynamic command-group must have identical dependencies. It is not allowed for a dynamic command-group to have command-group functions that would @@ -625,10 +630,13 @@ Exceptions: property for more information. * Throws with error code `invalid` if the `dynamic_command_group` is created with - command-group functions that are not kernel executions. + command-group functions that are not kernel executions or host-tasks. * Throws with error code `invalid` if `cgfList` is empty. +* Throws with error code `invalid` if the types of all command-groups in + `cgfList` do not match. + | [source,c++] ---- @@ -829,10 +837,12 @@ possible. ===== Supported Features -The only types of nodes that are currently able to be updated in a graph are -kernel execution nodes. +The only types of nodes that are currently able to be updated in a graph are: -There are two different API's that can be used to update a graph: +* Kernel executions +* <> + +There are two different APIs that can be used to update a graph: * <> which allows updating individual nodes of a command-graph. @@ -840,21 +850,41 @@ individual nodes of a command-graph. entirety of the graph simultaneously by using another graph as a reference. -The aspects of a kernel execution node that can be changed during update are -different depending on the API used to perform the update: +The following table illustrates the aspects of each supported node type that can be changed +depending on the API used to perform the update. + +Table {counter: tableNumber}. Graph update capabilites for supported node types. +[cols="1,2a,2a"] +|=== +|Node Type|<>|<> + +|`node_type::kernel` +| + +* Kernel function +* Kernel Parameters +* ND-range + +| +* Kernel Parameters +* ND-range -* For the <> API it's possible to update -the kernel function, the parameters to the kernel, and the ND-range. -* For the <> API, only the parameters of the kernel -and the ND-range can be updated. +|`node_type::host_task` +| +* Host-task function +| +* Host-task function + +|=== ===== Individual Node Update [[individual-node-update]] -Individual nodes of an executable graph can be updated directly. Depending on the attribute -of the node that requires updating, different API's should be used: +Individual nodes of an executable graph can be updated directly. Depending on the attribute or `node_type` of the node that requires updating, different API's should be used: ====== Parameter Updates +_Supported Node Types: Kernel_ + Parameters to individual nodes in a graph in the `executable` state can be updated between graph executions using dynamic parameters. A `dynamic_parameter` object is created with a modifiable state graph and an initial value for the @@ -884,6 +914,8 @@ will maintain the graphs data dependencies. ====== Execution Range Updates +_Supported Node Types: Kernel_ + Another configuration that can be updated is the execution range of the kernel, this can be set through `node::update_nd_range()` or `node::update_range()` but does not require any prior registration. @@ -897,10 +929,13 @@ code may be defined as operating in a different dimension. ====== Command Group Updates -The command-groups of a kernel node can be updated using dynamic command-groups. -Dynamic command-groups allow replacing the command-group function of a kernel -node with a different one. This effectively allows updating the kernel function -and/or the kernel execution range. +_Supported Node Types: Kernel, Host-task_ + +The command-groups of a kernel node can be updated using +<>. Dynamic command-groups allow +replacing the command-group function of a kernel node with a different one. This +effectively allows updating the kernel function and/or the kernel execution +range. Command-group updates are performed by creating an instance of the `dynamic_command_group` class. A dynamic command-group is created with a modifiable @@ -1972,7 +2007,7 @@ Any code like this should be moved to a separate host-task and added to the graph via the recording or explicit APIs in order to be compatible with this extension. -=== Host Tasks +=== Host Tasks [[host-tasks]] :host-task: https://registry.khronos.org/SYCL/specs/sycl-2020/html/sycl-2020.html#subsec:interfaces.hosttasks @@ -1992,6 +2027,9 @@ auto node = graph.add([&](sycl::handler& cgh){ }); ---- +Host-tasks can be updated using <>. + + === Queue Behavior In Recording Mode When a queue is placed in recording mode via a call to `command_graph::begin_recording`, diff --git a/sycl/source/detail/graph_impl.cpp b/sycl/source/detail/graph_impl.cpp index c89353efc0f4d..77a4ff9921849 100644 --- a/sycl/source/detail/graph_impl.cpp +++ b/sycl/source/detail/graph_impl.cpp @@ -581,9 +581,9 @@ graph_impl::add(std::shared_ptr &DynCGImpl, std::vector> &Deps) { // Set of Dependent nodes based on CG event and accessor dependencies. std::set> DynCGDeps = - getCGEdges(DynCGImpl->MKernels[0]); + getCGEdges(DynCGImpl->MCommandGroups[0]); for (unsigned i = 1; i < DynCGImpl->getNumCGs(); i++) { - auto &CG = DynCGImpl->MKernels[i]; + auto &CG = DynCGImpl->MCommandGroups[i]; auto CGEdges = getCGEdges(CG); if (CGEdges != DynCGDeps) { throw sycl::exception(make_error_code(sycl::errc::invalid), @@ -593,14 +593,16 @@ graph_impl::add(std::shared_ptr &DynCGImpl, } // Track and mark the memory objects being used by the graph. - for (auto &CG : DynCGImpl->MKernels) { + for (auto &CG : DynCGImpl->MCommandGroups) { markCGMemObjs(CG); } // Get active dynamic command-group CG and use to create a node object - const auto &ActiveKernel = DynCGImpl->getActiveKernel(); + const auto &ActiveKernel = DynCGImpl->getActiveCG(); + node_type NodeType = + ext::oneapi::experimental::detail::getNodeTypeFromCG(DynCGImpl->MCGType); std::shared_ptr NodeImpl = - add(node_type::kernel, ActiveKernel, Deps); + add(NodeType, ActiveKernel, Deps); // Add an event associated with this explicit node for mixed usage addEventForNode(std::make_shared(), NodeImpl); @@ -1400,11 +1402,11 @@ void exec_graph_impl::update( "Node passed to update() is not part of the graph."); } - if (!(Node->isEmpty() || Node->MCGType == sycl::detail::CGType::Kernel || - Node->MCGType == sycl::detail::CGType::Barrier)) { - throw sycl::exception(errc::invalid, - "Unsupported node type for update. Only kernel, " - "barrier and empty nodes are supported."); + if (!Node->isUpdatable()) { + throw sycl::exception( + errc::invalid, + "Unsupported node type for update. Only kernel, host_task, " + "barrier and empty nodes are supported."); } if (const auto &CG = Node->MCommandGroup; @@ -1445,23 +1447,46 @@ void exec_graph_impl::update( } } - // Rebuild cached requirements for this graph with updated nodes + // Rebuild cached requirements and accessor storage for this graph with + // updated nodes MRequirements.clear(); + MAccessors.clear(); for (auto &Node : MNodeStorage) { if (!Node->MCommandGroup) continue; MRequirements.insert(MRequirements.end(), Node->MCommandGroup->getRequirements().begin(), Node->MCommandGroup->getRequirements().end()); + MAccessors.insert(MAccessors.end(), + Node->MCommandGroup->getAccStorage().begin(), + Node->MCommandGroup->getAccStorage().end()); } } void exec_graph_impl::updateImpl(std::shared_ptr Node) { - // Kernel node update is the only command type supported in UR for update. - // Updating any other types of nodes, e.g. empty & barrier nodes is a no-op. - if (Node->MCGType != sycl::detail::CGType::Kernel) { + // Updating empty or barrier nodes is a no-op + if (Node->isEmpty() || Node->MNodeType == node_type::ext_oneapi_barrier) { + return; + } + + // Query the ID cache to find the equivalent exec node for the node passed to + // this function. + // TODO: Handle subgraphs or any other cases where multiple nodes may be + // associated with a single key, once those node types are supported for + // update. + auto ExecNode = MIDCache.find(Node->MID); + assert(ExecNode != MIDCache.end() && "Node ID was not found in ID cache"); + + // Update ExecNode with new values from Node, in case we ever need to + // rebuild the command buffers + ExecNode->second->updateFromOtherNode(Node); + + // Host task update only requires updating the node itself, so can return + // early + if (Node->MNodeType == node_type::host_task) { return; } + auto ContextImpl = sycl::detail::getSyclObjImpl(MContext); const sycl::detail::AdapterPtr &Adapter = ContextImpl->getAdapter(); auto DeviceImpl = sycl::detail::getSyclObjImpl(MGraphImpl->getDevice()); @@ -1614,18 +1639,6 @@ void exec_graph_impl::updateImpl(std::shared_ptr Node) { UpdateDesc.pNewLocalWorkSize = LocalSize; UpdateDesc.newWorkDim = NDRDesc.Dims; - // Query the ID cache to find the equivalent exec node for the node passed to - // this function. - // TODO: Handle subgraphs or any other cases where multiple nodes may be - // associated with a single key, once those node types are supported for - // update. - auto ExecNode = MIDCache.find(Node->MID); - assert(ExecNode != MIDCache.end() && "Node ID was not found in ID cache"); - - // Update ExecNode with new values from Node, in case we ever need to - // rebuild the command buffers - ExecNode->second->updateFromOtherNode(Node); - ur_exp_command_buffer_command_handle_t Command = MCommandMap[ExecNode->second]; ur_result_t Res = Adapter->call_nocheck< @@ -1929,7 +1942,7 @@ void dynamic_parameter_impl::updateValue(const void *NewValue, size_t Size) { for (auto &DynCGInfo : MDynCGs) { auto DynCG = DynCGInfo.DynCG.lock(); if (DynCG) { - auto &CG = DynCG->MKernels[DynCGInfo.CGIndex]; + auto &CG = DynCG->MCommandGroups[DynCGInfo.CGIndex]; dynamic_parameter_impl::updateCGArgValue(CG, DynCGInfo.ArgIndex, NewValue, Size); } @@ -1952,7 +1965,7 @@ void dynamic_parameter_impl::updateAccessor( for (auto &DynCGInfo : MDynCGs) { auto DynCG = DynCGInfo.DynCG.lock(); if (DynCG) { - auto &CG = DynCG->MKernels[DynCGInfo.CGIndex]; + auto &CG = DynCG->MCommandGroups[DynCGInfo.CGIndex]; dynamic_parameter_impl::updateCGAccessor(CG, DynCGInfo.ArgIndex, Acc); } } @@ -2040,38 +2053,67 @@ void dynamic_command_group_impl::finalizeCGFList( sycl::handler Handler{MGraph}; CGF(Handler); - if (Handler.getType() != sycl::detail::CGType::Kernel) { + if (Handler.getType() != sycl::detail::CGType::Kernel && + Handler.getType() != sycl::detail::CGType::CodeplayHostTask) { throw sycl::exception( make_error_code(errc::invalid), - "The only type of command-groups that can be used in " - "dynamic command-groups is kernels."); + "The only types of command-groups that can be used in " + "dynamic command-groups are kernels and host-tasks."); + } + + // We need to store the first CG's type so we can check they are all the + // same + if (CGFIndex == 0) { + MCGType = Handler.getType(); + } else if (MCGType != Handler.getType()) { + throw sycl::exception(make_error_code(errc::invalid), + "Command-groups in a dynamic command-group must " + "all be the same type."); } Handler.finalize(); // Take unique_ptr object from handler and convert to - // shared_ptr to store + // shared_ptr to store sycl::detail::CG *RawCGPtr = Handler.impl->MGraphNodeCG.release(); - auto RawCGExecPtr = static_cast(RawCGPtr); - MKernels.push_back( - std::shared_ptr(RawCGExecPtr)); + MCommandGroups.push_back(std::shared_ptr(RawCGPtr)); - // Track dynamic_parameter usage in command-list + // Track dynamic_parameter usage in command-group auto &DynamicParams = Handler.impl->MDynamicParameters; + + if (DynamicParams.size() > 0 && + Handler.getType() == sycl::detail::CGType::CodeplayHostTask) { + throw sycl::exception(make_error_code(errc::invalid), + "Cannot use dynamic parameters in a host_task"); + } for (auto &[DynamicParam, ArgIndex] : DynamicParams) { DynamicParam->registerDynCG(shared_from_this(), CGFIndex, ArgIndex); } } - // For each CGExecKernel store the list of alternative kernels, not + // Host tasks don't need to store alternative kernels + if (MCGType == sycl::detail::CGType::CodeplayHostTask) { + return; + } + + // For each Kernel CG store the list of alternative kernels, not // including itself. using CGExecKernelSP = std::shared_ptr; using CGExecKernelWP = std::weak_ptr; - for (auto KernelCG : MKernels) { + for (std::shared_ptr CommandGroup : MCommandGroups) { + CGExecKernelSP KernelCG = + std::dynamic_pointer_cast(CommandGroup); std::vector Alternatives; - std::copy_if( - MKernels.begin(), MKernels.end(), std::back_inserter(Alternatives), - [&KernelCG](const CGExecKernelSP &K) { return K != KernelCG; }); + + // Add all other command groups except for the current one to the list of + // alternatives + for (auto &OtherCG : MCommandGroups) { + CGExecKernelSP OtherKernelCG = + std::dynamic_pointer_cast(OtherCG); + if (KernelCG != OtherKernelCG) { + Alternatives.push_back(OtherKernelCG); + } + } KernelCG->MAlternativeKernels = std::move(Alternatives); } @@ -2087,7 +2129,7 @@ void dynamic_command_group_impl::setActiveIndex(size_t Index) { // Update nodes using the dynamic command-group to use the new active CG for (auto &Node : MNodes) { if (auto NodeSP = Node.lock()) { - NodeSP->MCommandGroup = getActiveKernel(); + NodeSP->MCommandGroup = getActiveCG(); } } } diff --git a/sycl/source/detail/graph_impl.hpp b/sycl/source/detail/graph_impl.hpp index 1a1f2ef9cf55f..05798a6332d8d 100644 --- a/sycl/source/detail/graph_impl.hpp +++ b/sycl/source/detail/graph_impl.hpp @@ -446,25 +446,58 @@ class node_impl : public std::enable_shared_from_this { } void updateFromOtherNode(const std::shared_ptr &Other) { - auto ExecCG = - static_cast(MCommandGroup.get()); - auto OtherExecCG = - static_cast(Other->MCommandGroup.get()); - - ExecCG->MArgs = OtherExecCG->MArgs; - ExecCG->MNDRDesc = OtherExecCG->MNDRDesc; - ExecCG->MKernelName = OtherExecCG->MKernelName; - ExecCG->getAccStorage() = OtherExecCG->getAccStorage(); - ExecCG->getRequirements() = OtherExecCG->getRequirements(); - - auto &OldArgStorage = OtherExecCG->getArgsStorage(); - auto &NewArgStorage = ExecCG->getArgsStorage(); - // Rebuild the arg storage and update the args - rebuildArgStorage(ExecCG->MArgs, OldArgStorage, NewArgStorage); + switch (MNodeType) { + case node_type::kernel: { + auto ExecCG = + static_cast(MCommandGroup.get()); + auto OtherExecCG = + static_cast(Other->MCommandGroup.get()); + + ExecCG->MArgs = OtherExecCG->MArgs; + ExecCG->MNDRDesc = OtherExecCG->MNDRDesc; + ExecCG->MKernelName = OtherExecCG->MKernelName; + ExecCG->getAccStorage() = OtherExecCG->getAccStorage(); + ExecCG->getRequirements() = OtherExecCG->getRequirements(); + + auto &OldArgStorage = OtherExecCG->getArgsStorage(); + auto &NewArgStorage = ExecCG->getArgsStorage(); + // Rebuild the arg storage and update the args + rebuildArgStorage(ExecCG->MArgs, OldArgStorage, NewArgStorage); + break; + } + case node_type::host_task: { + auto HostTaskCG = + static_cast(MCommandGroup.get()); + auto OtherHostTaskCG = + static_cast(Other->MCommandGroup.get()); + + HostTaskCG->MArgs = OtherHostTaskCG->MArgs; + HostTaskCG->getAccStorage() = OtherHostTaskCG->getAccStorage(); + HostTaskCG->getRequirements() = OtherHostTaskCG->getRequirements(); + HostTaskCG->MHostTask = OtherHostTaskCG->MHostTask; + break; + } + default: + break; + } } id_type getID() const { return MID; } + /// Returns true if this node can be updated + bool isUpdatable() const { + switch (MNodeType) { + case node_type::kernel: + case node_type::host_task: + case node_type::ext_oneapi_barrier: + case node_type::empty: + return true; + + default: + return false; + } + } + private: void rebuildArgStorage(std::vector &Args, const std::vector> &OldArgStorage, @@ -1533,7 +1566,7 @@ class dynamic_command_group_impl size_t getActiveIndex() const { return MActiveCGF; } /// Returns the number of CGs in the dynamic command-group. - size_t getNumCGs() const { return MKernels.size(); } + size_t getNumCGs() const { return MCommandGroups.size(); } /// Set the index of the active command-group. /// @param Index The new index. @@ -1546,8 +1579,8 @@ class dynamic_command_group_impl /// Retrieve CG at the currently active index /// @param Shared pointer to the active CG object. - std::shared_ptr getActiveKernel() const { - return MKernels[MActiveCGF]; + std::shared_ptr getActiveCG() const { + return MCommandGroups[MActiveCGF]; } /// Graph this dynamic command-group is associated with. @@ -1556,14 +1589,16 @@ class dynamic_command_group_impl /// Index of active command-group std::atomic MActiveCGF; - /// List of kernel command-groups for dynamic command-group nodes - std::vector> MKernels; + /// List of command-groups for dynamic command-group nodes + std::vector> MCommandGroups; /// List of nodes using this dynamic command-group. std::vector> MNodes; unsigned long long getID() const { return MID; } + /// Type of the CGs in this dynamic command-group + sycl::detail::CGType MCGType = sycl::detail::CGType::None; private: unsigned long long MID; // Used for std::hash in order to create a unique hash for the instance. diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index 005008a74ebd0..3486045196d31 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -3709,25 +3709,46 @@ ur_result_t UpdateCommandBufferCommand::enqueueImp() { Command::waitForEvents(MQueue, EventImpls, UREvent); MEvent->setHandle(UREvent); + auto CheckAndFindAlloca = [](Requirement *Req, const DepDesc &Dep) { + if (Dep.MDepRequirement == Req) { + if (Dep.MAllocaCmd) { + Req->MData = Dep.MAllocaCmd->getMemAllocation(); + } else { + throw sycl::exception(make_error_code(errc::invalid), + "No allocation available for accessor when " + "updating command buffer!"); + } + } + }; + for (auto &Node : MNodes) { - auto CG = static_cast(Node->MCommandGroup.get()); - for (auto &Arg : CG->MArgs) { - if (Arg.MType != kernel_param_kind_t::kind_accessor) { - continue; + CG *CG = Node->MCommandGroup.get(); + switch (Node->MNodeType) { + case ext::oneapi::experimental::node_type::kernel: { + auto CGExec = static_cast(CG); + for (auto &Arg : CGExec->MArgs) { + if (Arg.MType != kernel_param_kind_t::kind_accessor) { + continue; + } + // Search through deps to get actual allocation for accessor args. + for (const DepDesc &Dep : MDeps) { + Requirement *Req = static_cast(Arg.MPtr); + CheckAndFindAlloca(Req, Dep); + } } - // Search through deps to get actual allocation for accessor args. - for (const DepDesc &Dep : MDeps) { - Requirement *Req = static_cast(Arg.MPtr); - if (Dep.MDepRequirement == Req) { - if (Dep.MAllocaCmd) { - Req->MData = Dep.MAllocaCmd->getMemAllocation(); - } else { - throw sycl::exception(make_error_code(errc::invalid), - "No allocation available for accessor when " - "updating command buffer!"); - } + break; + } + case ext::oneapi::experimental::node_type::host_task: { + for (auto &Req : CG->getRequirements()) { + for (const DepDesc &Dep : MDeps) { + CheckAndFindAlloca(Req, Dep); } } + + break; + } + default: + break; } MGraph->updateImpl(Node); } diff --git a/sycl/test-e2e/Graph/Inputs/whole_update_host_task.cpp b/sycl/test-e2e/Graph/Inputs/whole_update_host_task.cpp new file mode 100644 index 0000000000000..994363371a7ef --- /dev/null +++ b/sycl/test-e2e/Graph/Inputs/whole_update_host_task.cpp @@ -0,0 +1,152 @@ +// Tests whole graph update with a host-task in the graph + +#include "../graph_common.hpp" + +using T = int; + +void calculate_reference_data(size_t Iterations, size_t Size, + std::vector &ReferenceA, + std::vector &ReferenceB, + std::vector &ReferenceC, T ModValue) { + for (size_t n = 0; n < Iterations; n++) { + for (size_t i = 0; i < Size; i++) { + ReferenceA[i]++; + ReferenceB[i] += ReferenceA[i]; + ReferenceC[i] -= ReferenceA[i]; + ReferenceB[i]--; + ReferenceC[i]--; + ReferenceC[i] += ModValue; + ReferenceC[i] *= 2; + } + } +} + +void add_nodes_to_graph( + exp_ext::command_graph &Graph, + queue &Queue, T *PtrA, T *PtrB, T *PtrC, T ModValue) { + // Add some commands to the graph + auto LastOperation = add_nodes(Graph, Queue, Size, PtrA, PtrB, PtrC); + + // Add a host task which modifies PtrC + auto HostTaskOp = add_node( + Graph, Queue, + [&](handler &CGH) { + depends_on_helper(CGH, LastOperation); + CGH.host_task([=]() { + for (size_t i = 0; i < Size; i++) { + PtrC[i] += ModValue; + } + }); + }, + LastOperation); + + // Add another node that depends on the host-task + add_node( + Graph, Queue, + [&](handler &CGH) { + depends_on_helper(CGH, HostTaskOp); + CGH.parallel_for(range<1>(Size), [=](item<1> Item) { + PtrC[Item.get_linear_id()] *= 2; + }); + }, + HostTaskOp); +} + +int main() { + queue Queue{}; + + std::vector DataA(Size), DataB(Size), DataC(Size); + + std::iota(DataA.begin(), DataA.end(), 1); + std::iota(DataB.begin(), DataB.end(), 10); + std::iota(DataC.begin(), DataC.end(), 1000); + + auto DataA2 = DataA; + auto DataB2 = DataB; + auto DataC2 = DataC; + + const T ModValue = 7; + std::vector ReferenceA(DataA), ReferenceB(DataB), ReferenceC(DataC); + calculate_reference_data(Iterations, Size, ReferenceA, ReferenceB, ReferenceC, + ModValue); + + exp_ext::command_graph GraphA{Queue.get_context(), Queue.get_device()}; + + T *PtrA = malloc_device(Size, Queue); + T *PtrB = malloc_device(Size, Queue); + T *PtrC = malloc_shared(Size, Queue); + + Queue.copy(DataA.data(), PtrA, Size); + Queue.copy(DataB.data(), PtrB, Size); + Queue.copy(DataC.data(), PtrC, Size); + Queue.wait_and_throw(); + + // Fill graphA with nodes + add_nodes_to_graph(GraphA, Queue, PtrA, PtrB, PtrC, ModValue); + + auto GraphExec = GraphA.finalize(exp_ext::property::graph::updatable{}); + + exp_ext::command_graph GraphB{Queue.get_context(), Queue.get_device()}; + + T *PtrA2 = malloc_device(Size, Queue); + T *PtrB2 = malloc_device(Size, Queue); + T *PtrC2 = malloc_shared(Size, Queue); + + Queue.copy(DataA2.data(), PtrA2, Size); + Queue.copy(DataB2.data(), PtrB2, Size); + Queue.copy(DataC2.data(), PtrC2, Size); + Queue.wait_and_throw(); + + // Fill graphB with nodes, with a different set of pointers + add_nodes_to_graph(GraphB, Queue, PtrA2, PtrB2, PtrC2, ModValue); + + // Execute several Iterations of the graph for 1st set of buffers + event Event; + for (unsigned n = 0; n < Iterations; n++) { + Event = Queue.submit([&](handler &CGH) { + CGH.depends_on(Event); + CGH.ext_oneapi_graph(GraphExec); + }); + } + + GraphExec.update(GraphB); + + // Execute several Iterations of the graph for 2nd set of buffers + for (unsigned n = 0; n < Iterations; n++) { + Event = Queue.submit([&](handler &CGH) { + CGH.depends_on(Event); + CGH.ext_oneapi_graph(GraphExec); + }); + } + + Queue.wait_and_throw(); + + Queue.copy(PtrA, DataA.data(), Size); + Queue.copy(PtrB, DataB.data(), Size); + Queue.copy(PtrC, DataC.data(), Size); + + Queue.copy(PtrA2, DataA2.data(), Size); + Queue.copy(PtrB2, DataB2.data(), Size); + Queue.copy(PtrC2, DataC2.data(), Size); + Queue.wait_and_throw(); + + free(PtrA, Queue); + free(PtrB, Queue); + free(PtrC, Queue); + + free(PtrA2, Queue); + free(PtrB2, Queue); + free(PtrC2, Queue); + + for (size_t i = 0; i < Size; i++) { + assert(check_value(i, ReferenceA[i], DataA[i], "DataA")); + assert(check_value(i, ReferenceB[i], DataB[i], "DataB")); + assert(check_value(i, ReferenceC[i], DataC[i], "DataC")); + + assert(check_value(i, ReferenceA[i], DataA2[i], "DataA2")); + assert(check_value(i, ReferenceB[i], DataB2[i], "DataB2")); + assert(check_value(i, ReferenceC[i], DataC2[i], "DataC2")); + } + + return 0; +} diff --git a/sycl/test-e2e/Graph/Inputs/whole_update_host_task_accessor.cpp b/sycl/test-e2e/Graph/Inputs/whole_update_host_task_accessor.cpp new file mode 100644 index 0000000000000..4da9ca8245f47 --- /dev/null +++ b/sycl/test-e2e/Graph/Inputs/whole_update_host_task_accessor.cpp @@ -0,0 +1,152 @@ + +// Tests whole graph update with a host-task in the graph + +#include "../graph_common.hpp" + +using T = int; + +void calculate_reference_data(size_t Iterations, size_t Size, + std::vector &ReferenceA, + std::vector &ReferenceB, + std::vector &ReferenceC, T ModValue) { + for (size_t n = 0; n < Iterations; n++) { + for (size_t i = 0; i < Size; i++) { + ReferenceA[i]++; + ReferenceB[i] += ReferenceA[i]; + ReferenceC[i] -= ReferenceA[i]; + ReferenceB[i]--; + ReferenceC[i]--; + ReferenceC[i] += ModValue; + ReferenceC[i] *= 2; + } + } +} + +void add_nodes_to_graph( + exp_ext::command_graph &Graph, + queue &Queue, buffer &BufferA, buffer &BufferB, buffer &BufferC, + const T &ModValue) { + // Add some commands to the graph + auto LastOperation = add_nodes(Graph, Queue, Size, BufferA, BufferB, BufferC); + + // Add a host task which modifies PtrC + auto HostTaskOp = add_node( + Graph, Queue, + [&](handler &CGH) { + auto AccC = BufferC.get_access(CGH); + depends_on_helper(CGH, LastOperation); + CGH.host_task([=]() { + // std::cout << "AccC[0] " << AccC[0] << std::endl; + for (size_t i = 0; i < Size; i++) { + AccC[i] += ModValue; + } + }); + }, + LastOperation); + + // Add another node that depends on the host-task + add_node( + Graph, Queue, + [&](handler &CGH) { + auto AccC = BufferC.get_access(CGH); + depends_on_helper(CGH, HostTaskOp); + CGH.parallel_for(range<1>(Size), [=](item<1> Item) { + AccC[Item.get_linear_id()] *= 2; + }); + }, + HostTaskOp); +} + +int main() { + queue Queue{}; + + std::vector DataA(Size), DataB(Size), DataC(Size); + + std::iota(DataA.begin(), DataA.end(), 1); + std::iota(DataB.begin(), DataB.end(), 10); + std::iota(DataC.begin(), DataC.end(), 1000); + + auto DataA2 = DataA; + auto DataB2 = DataB; + auto DataC2 = DataC; + + const T ModValue = 7; + std::vector ReferenceA(DataA), ReferenceB(DataB), ReferenceC(DataC); + calculate_reference_data(Iterations, Size, ReferenceA, ReferenceB, ReferenceC, + ModValue); + + buffer BufferA{DataA.data(), range<1>{DataA.size()}}; + buffer BufferB{DataB.data(), range<1>{DataB.size()}}; + buffer BufferC{DataC.data(), range<1>{DataC.size()}}; + buffer BufferA2{DataA2.data(), range<1>{DataA2.size()}}; + buffer BufferB2{DataB2.data(), range<1>{DataB2.size()}}; + buffer BufferC2{DataC2.data(), range<1>{DataC2.size()}}; + BufferA.set_write_back(false); + BufferB.set_write_back(false); + BufferC.set_write_back(false); + BufferA2.set_write_back(false); + BufferB2.set_write_back(false); + BufferC2.set_write_back(false); + + { + exp_ext::command_graph GraphA{ + Queue.get_context(), + Queue.get_device(), + {exp_ext::property::graph::assume_buffer_outlives_graph{}}}; + + // Fill graphA with nodes + add_nodes_to_graph(GraphA, Queue, BufferA, BufferB, BufferC, ModValue); + + auto GraphExec = GraphA.finalize(exp_ext::property::graph::updatable{}); + + exp_ext::command_graph GraphB{ + Queue.get_context(), + Queue.get_device(), + {exp_ext::property::graph::assume_buffer_outlives_graph{}}}; + + // Fill graphB with nodes, with a different set of pointers + add_nodes_to_graph(GraphB, Queue, BufferA2, BufferB2, BufferC2, ModValue); + + // Execute several Iterations of the graph for 1st set of buffers + event Event; + for (unsigned n = 0; n < Iterations; n++) { + Event = Queue.submit([&](handler &CGH) { + // CGH.depends_on(Event); + CGH.ext_oneapi_graph(GraphExec); + }); + } + Queue.wait_and_throw(); + + GraphExec.update(GraphB); + + // Execute several Iterations of the graph for 2nd set of buffers + for (unsigned n = 0; n < Iterations; n++) { + Event = Queue.submit([&](handler &CGH) { + CGH.depends_on(Event); + CGH.ext_oneapi_graph(GraphExec); + }); + } + + Queue.wait_and_throw(); + } + Queue.copy(BufferA.get_access(), DataA.data()); + Queue.copy(BufferB.get_access(), DataB.data()); + Queue.copy(BufferC.get_access(), DataC.data()); + Queue.copy(BufferA2.get_access(), DataA2.data()); + Queue.copy(BufferB2.get_access(), DataB2.data()); + Queue.copy(BufferC2.get_access(), DataC2.data()); + Queue.wait_and_throw(); + + for (size_t i = 0; i < Size; i++) { + assert(check_value(i, ReferenceA[i], DataA[i], "DataA")); + assert(check_value(i, ReferenceB[i], DataB[i], "DataB")); + assert(check_value(i, ReferenceC[i], DataC[i], "DataC")); + + assert(check_value(i, ReferenceA[i], DataA2[i], "DataA2")); + assert(check_value(i, ReferenceB[i], DataB2[i], "DataB2")); + assert(check_value(i, ReferenceC[i], DataC2[i], "DataC2")); + } + + return 0; +} diff --git a/sycl/test-e2e/Graph/Update/Explicit/whole_update_host_task.cpp b/sycl/test-e2e/Graph/Update/Explicit/whole_update_host_task.cpp new file mode 100644 index 0000000000000..ffcfbff01348e --- /dev/null +++ b/sycl/test-e2e/Graph/Update/Explicit/whole_update_host_task.cpp @@ -0,0 +1,12 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} + +// REQUIRES: aspect-usm_shared_allocations + +#define GRAPH_E2E_EXPLICIT + +#include "../../Inputs/whole_update_host_task.cpp" diff --git a/sycl/test-e2e/Graph/Update/Explicit/whole_update_host_task_accessor.cpp b/sycl/test-e2e/Graph/Update/Explicit/whole_update_host_task_accessor.cpp new file mode 100644 index 0000000000000..3bd87ba061c73 --- /dev/null +++ b/sycl/test-e2e/Graph/Update/Explicit/whole_update_host_task_accessor.cpp @@ -0,0 +1,10 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} + +#define GRAPH_E2E_EXPLICIT + +#include "../../Inputs/whole_update_host_task_accessor.cpp" diff --git a/sycl/test-e2e/Graph/Update/RecordReplay/whole_update_host_task.cpp b/sycl/test-e2e/Graph/Update/RecordReplay/whole_update_host_task.cpp new file mode 100644 index 0000000000000..7d09c89cbc591 --- /dev/null +++ b/sycl/test-e2e/Graph/Update/RecordReplay/whole_update_host_task.cpp @@ -0,0 +1,12 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} + +// REQUIRES: aspect-usm_shared_allocations + +#define GRAPH_E2E_RECORD_REPLAY + +#include "../../Inputs/whole_update_host_task.cpp" diff --git a/sycl/test-e2e/Graph/Update/RecordReplay/whole_update_host_task_accessor.cpp b/sycl/test-e2e/Graph/Update/RecordReplay/whole_update_host_task_accessor.cpp new file mode 100644 index 0000000000000..ed13e566eca7d --- /dev/null +++ b/sycl/test-e2e/Graph/Update/RecordReplay/whole_update_host_task_accessor.cpp @@ -0,0 +1,10 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} + +#define GRAPH_E2E_RECORD_REPLAY + +#include "../../Inputs/whole_update_host_task_accessor.cpp" diff --git a/sycl/test-e2e/Graph/Update/dyn_cgf_host_task_accessor.cpp b/sycl/test-e2e/Graph/Update/dyn_cgf_host_task_accessor.cpp new file mode 100644 index 0000000000000..8bb9b8e7252bc --- /dev/null +++ b/sycl/test-e2e/Graph/Update/dyn_cgf_host_task_accessor.cpp @@ -0,0 +1,152 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} + +// Tests using dynamic command groups to update a host task node that also uses +// buffers/accessors +#include "../graph_common.hpp" + +using T = int; + +void calculate_reference_data(size_t Iterations, size_t Size, + std::vector &ReferenceA, + std::vector &ReferenceB, + std::vector &ReferenceC, T ModValue) { + for (size_t n = 0; n < Iterations; n++) { + for (size_t i = 0; i < Size; i++) { + ReferenceA[i]++; + ReferenceB[i] += ReferenceA[i]; + ReferenceC[i] -= ReferenceA[i]; + ReferenceB[i]--; + ReferenceC[i]--; + ReferenceC[i] += ModValue; + ReferenceC[i] *= 2; + } + } +} + +int main() { + queue Queue{}; + + std::vector DataA(Size), DataB(Size), DataC(Size); + + std::iota(DataA.begin(), DataA.end(), 1); + std::iota(DataB.begin(), DataB.end(), 10); + std::iota(DataC.begin(), DataC.end(), 1000); + + auto DataA2 = DataA; + auto DataB2 = DataB; + auto DataC2 = DataC; + + const T ModValue = 7; + std::vector ReferenceA(DataA), ReferenceB(DataB), ReferenceC(DataC); + calculate_reference_data(Iterations, Size, ReferenceA, ReferenceB, ReferenceC, + ModValue); + // We're not updating the buffers here so we need to stack the reference + // calculations for after the host-task update + std::vector ReferenceA2(ReferenceA), ReferenceB2(ReferenceB), + ReferenceC2(ReferenceC); + calculate_reference_data(Iterations, Size, ReferenceA2, ReferenceB2, + ReferenceC2, ModValue * 2); + + buffer BufferA{DataA.data(), range<1>{DataA.size()}}; + BufferA.set_write_back(false); + buffer BufferB{DataB.data(), range<1>{DataB.size()}}; + BufferB.set_write_back(false); + buffer BufferC{DataC.data(), range<1>{DataC.size()}}; + BufferC.set_write_back(false); + + { + exp_ext::command_graph Graph{ + Queue.get_context(), + Queue.get_device(), + {exp_ext::property::graph::assume_buffer_outlives_graph{}}}; + + // Add some commands to the graph + auto LastOperation = add_kernels(Graph, Size, BufferA, BufferB, BufferC); + + // Create two different command groups for the host task + auto CGFA = [&](handler &CGH) { + auto AccC = BufferC.get_access(CGH); + CGH.host_task([=]() { + for (size_t i = 0; i < Size; i++) { + AccC[i] += ModValue; + } + }); + }; + auto CGFB = [&](handler &CGH) { + auto AccC = BufferC.get_access(CGH); + CGH.host_task([=]() { + for (size_t i = 0; i < Size; i++) { + AccC[i] += ModValue * 2; + } + }); + }; + + auto DynamicCG = exp_ext::dynamic_command_group(Graph, {CGFA, CGFB}); + // Add a host task which modifies PtrC + auto HostTaskNode = Graph.add( + DynamicCG, exp_ext::property::node::depends_on{LastOperation}); + + // Add another node that depends on the host-task + Graph.add( + [&](handler &CGH) { + auto AccC = BufferC.get_access(CGH); + CGH.parallel_for(range<1>(Size), [=](item<1> Item) { + AccC[Item.get_linear_id()] *= 2; + }); + }, + exp_ext::property::node::depends_on{HostTaskNode}); + + auto GraphExec = Graph.finalize(exp_ext::property::graph::updatable{}); + + // Execute several Iterations of the graph with the first host-task (CGFA) + event Event; + for (unsigned n = 0; n < Iterations; n++) { + Event = Queue.submit([&](handler &CGH) { + CGH.depends_on(Event); + CGH.ext_oneapi_graph(GraphExec); + }); + } + Queue.wait_and_throw(); + Queue.copy(BufferA.get_access(), DataA.data()); + Queue.copy(BufferB.get_access(), DataB.data()); + Queue.copy(BufferC.get_access(), DataC.data()); + Queue.wait_and_throw(); + for (size_t i = 0; i < Size; i++) { + assert(check_value(i, ReferenceA[i], DataA[i], "DataA")); + assert(check_value(i, ReferenceB[i], DataB[i], "DataB")); + assert(check_value(i, ReferenceC[i], DataC[i], "DataC")); + } + + // Update to CGFB + DynamicCG.set_active_index(1); + GraphExec.update(HostTaskNode); + + // Execute several Iterations of the graph for second host-task (CGFB) + for (unsigned n = 0; n < Iterations; n++) { + Event = Queue.submit([&](handler &CGH) { + CGH.depends_on(Event); + CGH.ext_oneapi_graph(GraphExec); + }); + } + + Queue.wait_and_throw(); + Queue.copy(BufferA.get_access(), DataA2.data()); + Queue.copy(BufferB.get_access(), DataB2.data()); + Queue.copy(BufferC.get_access(), DataC2.data()); + Queue.wait_and_throw(); + for (size_t i = 0; i < Size; i++) { + assert(check_value(i, ReferenceA2[i], DataA2[i], "DataA2")); + assert(check_value(i, ReferenceB2[i], DataB2[i], "DataB2")); + assert(check_value(i, ReferenceC2[i], DataC2[i], "DataC2")); + } + } + + return 0; +} diff --git a/sycl/test-e2e/Graph/Update/dyn_cgf_host_task_usm.cpp b/sycl/test-e2e/Graph/Update/dyn_cgf_host_task_usm.cpp new file mode 100644 index 0000000000000..dadc3606d0854 --- /dev/null +++ b/sycl/test-e2e/Graph/Update/dyn_cgf_host_task_usm.cpp @@ -0,0 +1,151 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} + +// REQUIRES: aspect-usm_shared_allocations + +// Tests using dynamic command groups to update a host task node +#include "../graph_common.hpp" + +using T = int; + +void calculate_reference_data(size_t Iterations, size_t Size, + std::vector &ReferenceA, + std::vector &ReferenceB, + std::vector &ReferenceC, T ModValue) { + for (size_t n = 0; n < Iterations; n++) { + for (size_t i = 0; i < Size; i++) { + ReferenceA[i]++; + ReferenceB[i] += ReferenceA[i]; + ReferenceC[i] -= ReferenceA[i]; + ReferenceB[i]--; + ReferenceC[i]--; + ReferenceC[i] += ModValue; + ReferenceC[i] *= 2; + } + } +} + +int main() { + queue Queue{}; + + std::vector DataA(Size), DataB(Size), DataC(Size); + + std::iota(DataA.begin(), DataA.end(), 1); + std::iota(DataB.begin(), DataB.end(), 10); + std::iota(DataC.begin(), DataC.end(), 1000); + + auto DataA2 = DataA; + auto DataB2 = DataB; + auto DataC2 = DataC; + + const T ModValue = 7; + std::vector ReferenceA(DataA), ReferenceB(DataB), ReferenceC(DataC); + calculate_reference_data(Iterations, Size, ReferenceA, ReferenceB, ReferenceC, + ModValue); + // We're not updating the buffers here so we need to stack the reference + // calculations for after the host-task update + std::vector ReferenceA2(ReferenceA), ReferenceB2(ReferenceB), + ReferenceC2(ReferenceC); + calculate_reference_data(Iterations, Size, ReferenceA2, ReferenceB2, + ReferenceC2, ModValue * 2); + + T *PtrA = sycl::malloc_device(Size, Queue); + T *PtrB = sycl::malloc_device(Size, Queue); + T *PtrC = sycl::malloc_shared(Size, Queue); + + Queue.copy(DataA.data(), PtrA, Size); + Queue.copy(DataB.data(), PtrB, Size); + Queue.copy(DataC.data(), PtrC, Size); + Queue.wait_and_throw(); + + exp_ext::command_graph Graph{ + Queue.get_context(), + Queue.get_device(), + {exp_ext::property::graph::assume_buffer_outlives_graph{}}}; + + // Add some commands to the graph + auto LastOperation = add_kernels_usm(Graph, Size, PtrA, PtrB, PtrC); + + // Create two different command groups for the host task + auto CGFA = [&](handler &CGH) { + CGH.host_task([=]() { + for (size_t i = 0; i < Size; i++) { + PtrC[i] += ModValue; + } + }); + }; + auto CGFB = [&](handler &CGH) { + CGH.host_task([=]() { + for (size_t i = 0; i < Size; i++) { + PtrC[i] += ModValue * 2; + } + }); + }; + + auto DynamicCG = exp_ext::dynamic_command_group(Graph, {CGFA, CGFB}); + // Add a host task which modifies PtrC using the dynamic CG + auto HostTaskNode = + Graph.add(DynamicCG, exp_ext::property::node::depends_on{LastOperation}); + + // Add another node that depends on the host-task + Graph.add( + [&](handler &CGH) { + CGH.parallel_for(range<1>(Size), [=](item<1> Item) { + PtrC[Item.get_linear_id()] *= 2; + }); + }, + exp_ext::property::node::depends_on{HostTaskNode}); + + auto GraphExec = Graph.finalize(exp_ext::property::graph::updatable{}); + + // Execute several Iterations of the graph with the first host-task (CGFA) + event Event; + for (unsigned n = 0; n < Iterations; n++) { + Event = Queue.submit([&](handler &CGH) { + CGH.depends_on(Event); + CGH.ext_oneapi_graph(GraphExec); + }); + } + Queue.wait_and_throw(); + Queue.copy(PtrA, DataA.data(), Size); + Queue.copy(PtrB, DataB.data(), Size); + Queue.copy(PtrC, DataC.data(), Size); + Queue.wait_and_throw(); + for (size_t i = 0; i < Size; i++) { + assert(check_value(i, ReferenceA[i], DataA[i], "DataA")); + assert(check_value(i, ReferenceB[i], DataB[i], "DataB")); + assert(check_value(i, ReferenceC[i], DataC[i], "DataC")); + } + + // Update to CGFB + DynamicCG.set_active_index(1); + GraphExec.update(HostTaskNode); + + // Execute several Iterations of the graph for second host-task (CGFB) + for (unsigned n = 0; n < Iterations; n++) { + Event = Queue.submit([&](handler &CGH) { + CGH.depends_on(Event); + CGH.ext_oneapi_graph(GraphExec); + }); + } + + Queue.wait_and_throw(); + Queue.copy(PtrA, DataA2.data(), Size); + Queue.copy(PtrB, DataB2.data(), Size); + Queue.copy(PtrC, DataC2.data(), Size); + Queue.wait_and_throw(); + for (size_t i = 0; i < Size; i++) { + assert(check_value(i, ReferenceA2[i], DataA2[i], "DataA2")); + assert(check_value(i, ReferenceB2[i], DataB2[i], "DataB2")); + assert(check_value(i, ReferenceC2[i], DataC2[i], "DataC2")); + } + + sycl::free(PtrA, Queue); + sycl::free(PtrB, Queue); + sycl::free(PtrC, Queue); + return 0; +} From 373d3d2b6bd54da74d5f95e0c61f105dd0ab1bff Mon Sep 17 00:00:00 2001 From: Ben Tracy Date: Thu, 6 Feb 2025 18:34:10 +0000 Subject: [PATCH 2/9] Fix host-tasks being enqueued before they should be updated. --- sycl/source/detail/graph_impl.cpp | 25 ++++++++++++++++--- sycl/source/detail/graph_impl.hpp | 7 ++++++ .../Graph/Inputs/whole_update_host_task.cpp | 11 +++----- 3 files changed, 33 insertions(+), 10 deletions(-) diff --git a/sycl/source/detail/graph_impl.cpp b/sycl/source/detail/graph_impl.cpp index 77a4ff9921849..51fedb09ffbfc 100644 --- a/sycl/source/detail/graph_impl.cpp +++ b/sycl/source/detail/graph_impl.cpp @@ -196,6 +196,7 @@ void exec_graph_impl::makePartitions() { } } + MContainsHostTask = HostTaskList.size() > 0; // Annotate nodes // The first step in graph partitioning is to annotate all nodes of the graph // with a temporary partition or group number. This step allows us to group @@ -1080,6 +1081,16 @@ exec_graph_impl::enqueue(const std::shared_ptr &Queue, } else if ((CurrentPartition->MSchedule.size() > 0) && (CurrentPartition->MSchedule.front()->MCGType == sycl::detail::CGType::CodeplayHostTask)) { + // If we have pending updates then we need to make sure that they are + // completed before the host-task is enqueued, to ensure it has received + // those updates prior to calling node->getCGCopy() + if (MUpdateEvents.size() > 0) { + for (auto &Event : MUpdateEvents) { + Event->wait_and_throw(Event); + } + MUpdateEvents.clear(); + } + auto NodeImpl = CurrentPartition->MSchedule.front(); // Schedule host task NodeImpl->MCommandGroup->getEvents().insert( @@ -1438,9 +1449,17 @@ void exec_graph_impl::update( sycl::detail::getSyclObjImpl(MGraphImpl->getDevice()), sycl::detail::getSyclObjImpl(MGraphImpl->getContext()), sycl::async_handler{}, sycl::property_list{}); - // Don't need to care about the return event here because it is synchronous - sycl::detail::Scheduler::getInstance().addCommandGraphUpdate( - this, Nodes, AllocaQueue, UpdateRequirements, MExecutionEvents); + + auto UpdateEvent = + sycl::detail::Scheduler::getInstance().addCommandGraphUpdate( + this, Nodes, AllocaQueue, UpdateRequirements, MExecutionEvents); + + // If the graph contains host-task(s) we need to track update events so we + // can explicitly wait on them before enqueue further host-tasks to ensure + // updates have taken effect. + if (MContainsHostTask) { + MUpdateEvents.push_back(UpdateEvent); + } } else { for (auto &Node : Nodes) { updateImpl(Node); diff --git a/sycl/source/detail/graph_impl.hpp b/sycl/source/detail/graph_impl.hpp index 05798a6332d8d..bd0609b120e14 100644 --- a/sycl/source/detail/graph_impl.hpp +++ b/sycl/source/detail/graph_impl.hpp @@ -475,6 +475,7 @@ class node_impl : public std::enable_shared_from_this { HostTaskCG->getAccStorage() = OtherHostTaskCG->getAccStorage(); HostTaskCG->getRequirements() = OtherHostTaskCG->getRequirements(); HostTaskCG->MHostTask = OtherHostTaskCG->MHostTask; + HostTaskCG->getEvents() = OtherHostTaskCG->getEvents(); break; } default: @@ -1453,6 +1454,12 @@ class exec_graph_impl { unsigned long long MID; // Used for std::hash in order to create a unique hash for the instance. inline static std::atomic NextAvailableID = 0; + // True if this graph contains any host-tasks, controls whether we store + // events in MUpdateEvents. + bool MContainsHostTask = false; + // Contains events for updates submitted through the scheduler as we need to + // wait on them when enqueuing host-tasks. + std::vector MUpdateEvents; }; class dynamic_parameter_impl { diff --git a/sycl/test-e2e/Graph/Inputs/whole_update_host_task.cpp b/sycl/test-e2e/Graph/Inputs/whole_update_host_task.cpp index 994363371a7ef..41759b3a76452 100644 --- a/sycl/test-e2e/Graph/Inputs/whole_update_host_task.cpp +++ b/sycl/test-e2e/Graph/Inputs/whole_update_host_task.cpp @@ -100,23 +100,20 @@ int main() { // Fill graphB with nodes, with a different set of pointers add_nodes_to_graph(GraphB, Queue, PtrA2, PtrB2, PtrC2, ModValue); - // Execute several Iterations of the graph for 1st set of buffers + // Execute several Iterations of the graph, updating in between each + // execution. event Event; for (unsigned n = 0; n < Iterations; n++) { Event = Queue.submit([&](handler &CGH) { CGH.depends_on(Event); CGH.ext_oneapi_graph(GraphExec); }); - } - - GraphExec.update(GraphB); - - // Execute several Iterations of the graph for 2nd set of buffers - for (unsigned n = 0; n < Iterations; n++) { + GraphExec.update(GraphB); Event = Queue.submit([&](handler &CGH) { CGH.depends_on(Event); CGH.ext_oneapi_graph(GraphExec); }); + GraphExec.update(GraphA); } Queue.wait_and_throw(); From bc20c3279f9ce5e0fe646ed06f8c4cf60630bad8 Mon Sep 17 00:00:00 2001 From: Ben Tracy Date: Thu, 13 Feb 2025 16:50:23 +0000 Subject: [PATCH 3/9] Address issues with previous solution - Instead of trying to update host tasks through scheduler, simply do host-task updates immediately before scheduling the rest of the commands. --- sycl/source/detail/graph_impl.cpp | 49 ++++++++++++++++++++----------- sycl/source/detail/graph_impl.hpp | 10 +++---- 2 files changed, 36 insertions(+), 23 deletions(-) diff --git a/sycl/source/detail/graph_impl.cpp b/sycl/source/detail/graph_impl.cpp index 51fedb09ffbfc..0a7819acdc037 100644 --- a/sycl/source/detail/graph_impl.cpp +++ b/sycl/source/detail/graph_impl.cpp @@ -1081,15 +1081,6 @@ exec_graph_impl::enqueue(const std::shared_ptr &Queue, } else if ((CurrentPartition->MSchedule.size() > 0) && (CurrentPartition->MSchedule.front()->MCGType == sycl::detail::CGType::CodeplayHostTask)) { - // If we have pending updates then we need to make sure that they are - // completed before the host-task is enqueued, to ensure it has received - // those updates prior to calling node->getCGCopy() - if (MUpdateEvents.size() > 0) { - for (auto &Event : MUpdateEvents) { - Event->wait_and_throw(Event); - } - MUpdateEvents.clear(); - } auto NodeImpl = CurrentPartition->MSchedule.front(); // Schedule host task @@ -1389,7 +1380,7 @@ void exec_graph_impl::update(std::shared_ptr Node) { } void exec_graph_impl::update( - const std::vector> Nodes) { + const std::vector> &Nodes) { if (!MIsUpdatable) { throw sycl::exception(sycl::make_error_code(errc::invalid), @@ -1445,6 +1436,34 @@ void exec_graph_impl::update( NeedScheduledUpdate |= MExecutionEvents.size() > 0; if (NeedScheduledUpdate) { + // Copy the list of nodes as we may need to modify it + auto NodesCopy = Nodes; + + // If the graph contains host tasks we need special handling here because + // their state lives in the graph object itself, so we must do the update + // immediately here. Whereas all other command state lives in the backend so + // it can be scheduled along with other commands. + if (MContainsHostTask) { + std::vector> HostTasks; + // Remove any nodes that are host tasks and put them in HostTasks + auto RemovedIter = std::remove_if( + NodesCopy.begin(), NodesCopy.end(), + [&HostTasks](const std::shared_ptr &Node) -> bool { + if (Node->MNodeType == node_type::host_task) { + HostTasks.push_back(Node); + return true; + } + return false; + }); + // Clean up extra elements in NodesCopy after the remove + NodesCopy.erase(RemovedIter, NodesCopy.end()); + + // Update host-tasks synchronously + for (auto &HostTaskNode : HostTasks) { + updateImpl(HostTaskNode); + } + } + auto AllocaQueue = std::make_shared( sycl::detail::getSyclObjImpl(MGraphImpl->getDevice()), sycl::detail::getSyclObjImpl(MGraphImpl->getContext()), @@ -1452,14 +1471,10 @@ void exec_graph_impl::update( auto UpdateEvent = sycl::detail::Scheduler::getInstance().addCommandGraphUpdate( - this, Nodes, AllocaQueue, UpdateRequirements, MExecutionEvents); + this, std::move(NodesCopy), AllocaQueue, UpdateRequirements, + MExecutionEvents); - // If the graph contains host-task(s) we need to track update events so we - // can explicitly wait on them before enqueue further host-tasks to ensure - // updates have taken effect. - if (MContainsHostTask) { - MUpdateEvents.push_back(UpdateEvent); - } + MExecutionEvents.push_back(UpdateEvent); } else { for (auto &Node : Nodes) { updateImpl(Node); diff --git a/sycl/source/detail/graph_impl.hpp b/sycl/source/detail/graph_impl.hpp index bd0609b120e14..ee7d94fe7ba32 100644 --- a/sycl/source/detail/graph_impl.hpp +++ b/sycl/source/detail/graph_impl.hpp @@ -1333,7 +1333,7 @@ class exec_graph_impl { void update(std::shared_ptr GraphImpl); void update(std::shared_ptr Node); - void update(const std::vector> Nodes); + void update(const std::vector> &Nodes); void updateImpl(std::shared_ptr NodeImpl); @@ -1454,12 +1454,10 @@ class exec_graph_impl { unsigned long long MID; // Used for std::hash in order to create a unique hash for the instance. inline static std::atomic NextAvailableID = 0; - // True if this graph contains any host-tasks, controls whether we store - // events in MUpdateEvents. + + // True if this graph contains any host-tasks, indicates we need special + // handling for them during update(). bool MContainsHostTask = false; - // Contains events for updates submitted through the scheduler as we need to - // wait on them when enqueuing host-tasks. - std::vector MUpdateEvents; }; class dynamic_parameter_impl { From 535de535434e44a091250e97914b353dee80fe5d Mon Sep 17 00:00:00 2001 From: Ben Tracy Date: Fri, 14 Feb 2025 11:49:24 +0000 Subject: [PATCH 4/9] Fix formatting --- sycl/source/detail/graph_impl.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/sycl/source/detail/graph_impl.hpp b/sycl/source/detail/graph_impl.hpp index ee7d94fe7ba32..66b07babd0b35 100644 --- a/sycl/source/detail/graph_impl.hpp +++ b/sycl/source/detail/graph_impl.hpp @@ -1604,6 +1604,7 @@ class dynamic_command_group_impl /// Type of the CGs in this dynamic command-group sycl::detail::CGType MCGType = sycl::detail::CGType::None; + private: unsigned long long MID; // Used for std::hash in order to create a unique hash for the instance. From f7856a587d573fe147561dc26b5dd719c9e10571 Mon Sep 17 00:00:00 2001 From: Ben Tracy Date: Fri, 14 Feb 2025 13:30:54 +0000 Subject: [PATCH 5/9] Removed unused code from commands.cpp --- sycl/source/detail/scheduler/commands.cpp | 9 --------- 1 file changed, 9 deletions(-) diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index 3486045196d31..175e1b3937259 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -3738,15 +3738,6 @@ ur_result_t UpdateCommandBufferCommand::enqueueImp() { } break; } - case ext::oneapi::experimental::node_type::host_task: { - for (auto &Req : CG->getRequirements()) { - for (const DepDesc &Dep : MDeps) { - CheckAndFindAlloca(Req, Dep); - } - } - - break; - } default: break; } From df5dd4697869ef2b07430771c0e788c906eeaed0 Mon Sep 17 00:00:00 2001 From: Ben Tracy Date: Fri, 14 Feb 2025 13:38:15 +0000 Subject: [PATCH 6/9] Addressing minor comments - Fix errors in spec - Fix commented out code in test --- .../extensions/experimental/sycl_ext_oneapi_graph.asciidoc | 4 ++-- .../Graph/Inputs/whole_update_host_task_accessor.cpp | 6 ++---- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc b/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc index 00c8eb8e0cdf4..88c027beeab55 100644 --- a/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc +++ b/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc @@ -862,11 +862,11 @@ Table {counter: tableNumber}. Graph update capabilites for supported node types. | * Kernel function -* Kernel Parameters +* Kernel parameters * ND-range | -* Kernel Parameters +* Kernel parameters * ND-range |`node_type::host_task` diff --git a/sycl/test-e2e/Graph/Inputs/whole_update_host_task_accessor.cpp b/sycl/test-e2e/Graph/Inputs/whole_update_host_task_accessor.cpp index 4da9ca8245f47..d94e8e3fe2284 100644 --- a/sycl/test-e2e/Graph/Inputs/whole_update_host_task_accessor.cpp +++ b/sycl/test-e2e/Graph/Inputs/whole_update_host_task_accessor.cpp @@ -1,5 +1,4 @@ - -// Tests whole graph update with a host-task in the graph +// Tests whole graph update with a host-task in the graph using accessors #include "../graph_common.hpp" @@ -37,7 +36,6 @@ void add_nodes_to_graph( access::target::host_task>(CGH); depends_on_helper(CGH, LastOperation); CGH.host_task([=]() { - // std::cout << "AccC[0] " << AccC[0] << std::endl; for (size_t i = 0; i < Size; i++) { AccC[i] += ModValue; } @@ -112,7 +110,7 @@ int main() { event Event; for (unsigned n = 0; n < Iterations; n++) { Event = Queue.submit([&](handler &CGH) { - // CGH.depends_on(Event); + CGH.depends_on(Event); CGH.ext_oneapi_graph(GraphExec); }); } From 019a2708b1b8ea4456bae30d161ffbdcc1572379 Mon Sep 17 00:00:00 2001 From: Ben Tracy Date: Fri, 14 Feb 2025 14:48:56 +0000 Subject: [PATCH 7/9] Simplify implementation of node_impl::updateFromOtherNode --- sycl/source/detail/graph_impl.hpp | 40 ++++--------------------------- 1 file changed, 4 insertions(+), 36 deletions(-) diff --git a/sycl/source/detail/graph_impl.hpp b/sycl/source/detail/graph_impl.hpp index 66b07babd0b35..b958e74a0d43a 100644 --- a/sycl/source/detail/graph_impl.hpp +++ b/sycl/source/detail/graph_impl.hpp @@ -444,43 +444,11 @@ class node_impl : public std::enable_shared_from_this { NDRDesc = sycl::detail::NDRDescT{ExecutionRange}; } - + /// Update this node with the command-group from another node. + /// @param Other The other node to update, must be of the same node type. void updateFromOtherNode(const std::shared_ptr &Other) { - switch (MNodeType) { - case node_type::kernel: { - auto ExecCG = - static_cast(MCommandGroup.get()); - auto OtherExecCG = - static_cast(Other->MCommandGroup.get()); - - ExecCG->MArgs = OtherExecCG->MArgs; - ExecCG->MNDRDesc = OtherExecCG->MNDRDesc; - ExecCG->MKernelName = OtherExecCG->MKernelName; - ExecCG->getAccStorage() = OtherExecCG->getAccStorage(); - ExecCG->getRequirements() = OtherExecCG->getRequirements(); - - auto &OldArgStorage = OtherExecCG->getArgsStorage(); - auto &NewArgStorage = ExecCG->getArgsStorage(); - // Rebuild the arg storage and update the args - rebuildArgStorage(ExecCG->MArgs, OldArgStorage, NewArgStorage); - break; - } - case node_type::host_task: { - auto HostTaskCG = - static_cast(MCommandGroup.get()); - auto OtherHostTaskCG = - static_cast(Other->MCommandGroup.get()); - - HostTaskCG->MArgs = OtherHostTaskCG->MArgs; - HostTaskCG->getAccStorage() = OtherHostTaskCG->getAccStorage(); - HostTaskCG->getRequirements() = OtherHostTaskCG->getRequirements(); - HostTaskCG->MHostTask = OtherHostTaskCG->MHostTask; - HostTaskCG->getEvents() = OtherHostTaskCG->getEvents(); - break; - } - default: - break; - } + assert(MNodeType == Other->MNodeType); + MCommandGroup = std::move(Other->getCGCopy()); } id_type getID() const { return MID; } From 84b5d7cd8554d6f47a773de9d4169a28fe8be09e Mon Sep 17 00:00:00 2001 From: Ben Tracy Date: Fri, 14 Feb 2025 14:58:40 +0000 Subject: [PATCH 8/9] Remove std::move --- sycl/source/detail/graph_impl.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sycl/source/detail/graph_impl.hpp b/sycl/source/detail/graph_impl.hpp index b958e74a0d43a..915002c5f8483 100644 --- a/sycl/source/detail/graph_impl.hpp +++ b/sycl/source/detail/graph_impl.hpp @@ -448,7 +448,7 @@ class node_impl : public std::enable_shared_from_this { /// @param Other The other node to update, must be of the same node type. void updateFromOtherNode(const std::shared_ptr &Other) { assert(MNodeType == Other->MNodeType); - MCommandGroup = std::move(Other->getCGCopy()); + MCommandGroup = Other->getCGCopy(); } id_type getID() const { return MID; } From 62f93afc1bc637987e1de8f1c862bd74055f8bab Mon Sep 17 00:00:00 2001 From: Ben Tracy Date: Mon, 17 Feb 2025 12:05:25 +0000 Subject: [PATCH 9/9] Add comment about tracking update event --- sycl/source/detail/graph_impl.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sycl/source/detail/graph_impl.cpp b/sycl/source/detail/graph_impl.cpp index 0a7819acdc037..e06cfc368f6fe 100644 --- a/sycl/source/detail/graph_impl.cpp +++ b/sycl/source/detail/graph_impl.cpp @@ -1469,6 +1469,8 @@ void exec_graph_impl::update( sycl::detail::getSyclObjImpl(MGraphImpl->getContext()), sycl::async_handler{}, sycl::property_list{}); + // Track the event for the update command since execution may be blocked by + // other scheduler commands auto UpdateEvent = sycl::detail::Scheduler::getInstance().addCommandGraphUpdate( this, std::move(NodesCopy), AllocaQueue, UpdateRequirements,