From a61fe8e0d2ff5b1afc0854f8f52aa87ca04e5e10 Mon Sep 17 00:00:00 2001
From: "Wu, Yingcong" <yingcong.wu@intel.com>
Date: Mon, 28 Oct 2024 07:28:00 +0100
Subject: [PATCH 01/36] add option

---
 .../loader/layers/sanitizer/asan_interceptor.cpp | 16 +++++++++-------
 source/loader/layers/sanitizer/asan_options.cpp  |  1 +
 source/loader/layers/sanitizer/asan_options.hpp  |  1 +
 3 files changed, 11 insertions(+), 7 deletions(-)
diff --git a/source/loader/layers/sanitizer/asan_interceptor.cpp b/source/loader/layers/sanitizer/asan_interceptor.cpp
index 4a315588fd..4219601895 100644
--- a/source/loader/layers/sanitizer/asan_interceptor.cpp
+++ b/source/loader/layers/sanitizer/asan_interceptor.cpp
@@ -837,13 +837,15 @@ ContextInfo::~ContextInfo() {
         getContext()->urDdiTable.Context.pfnRelease(Handle);
     assert(Result == UR_RESULT_SUCCESS);
 
-    // check memory leaks
-    std::vector<AllocationIterator> AllocInfos =
-        getContext()->interceptor->findAllocInfoByContext(Handle);
-    for (const auto &It : AllocInfos) {
-        const auto &[_, AI] = *It;
-        if (!AI->IsReleased) {
-            ReportMemoryLeak(AI);
+    if (getOptions().DetectLeaks) {
+        // check memory leaks
+        std::vector<AllocationIterator> AllocInfos =
+            getContext()->interceptor->findAllocInfoByContext(Handle);
+        for (const auto &It : AllocInfos) {
+            const auto &[_, AI] = *It;
+            if (!AI->IsReleased) {
+                ReportMemoryLeak(AI);
+            }
         }
     }
 }
diff --git a/source/loader/layers/sanitizer/asan_options.cpp b/source/loader/layers/sanitizer/asan_options.cpp
index 5c42ab8fca..e785cb37b7 100644
--- a/source/loader/layers/sanitizer/asan_options.cpp
+++ b/source/loader/layers/sanitizer/asan_options.cpp
@@ -85,6 +85,7 @@ AsanOptions::AsanOptions() {
     SetBoolOption("detect_locals", DetectLocals);
     SetBoolOption("detect_privates", DetectPrivates);
     SetBoolOption("print_stats", PrintStats);
+    SetBoolOption("detect_leaks", DetectLeaks);
 
     auto KV = OptionsEnvMap->find("quarantine_size_mb");
     if (KV != OptionsEnvMap->end()) {
diff --git a/source/loader/layers/sanitizer/asan_options.hpp b/source/loader/layers/sanitizer/asan_options.hpp
index 4c515e28fe..8a72a14cea 100644
--- a/source/loader/layers/sanitizer/asan_options.hpp
+++ b/source/loader/layers/sanitizer/asan_options.hpp
@@ -25,6 +25,7 @@ struct AsanOptions {
     bool DetectPrivates = true;
     bool PrintStats = false;
     bool DetectKernelArguments = true;
+    bool DetectLeaks = true;
 
     explicit AsanOptions();
 };

From cacc7696ef1045d35bab5c5e0d826debbcf981f3 Mon Sep 17 00:00:00 2001
From: "Wu, Yingcong" <yingcong.wu@intel.com>
Date: Mon, 28 Oct 2024 08:05:35 +0100
Subject: [PATCH 02/36] fix

---
 source/loader/layers/sanitizer/asan_interceptor.cpp | 2 +-
 source/loader/layers/sanitizer/asan_interceptor.hpp | 3 +--
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/source/loader/layers/sanitizer/asan_interceptor.cpp b/source/loader/layers/sanitizer/asan_interceptor.cpp
index 4219601895..01e37aecdd 100644
--- a/source/loader/layers/sanitizer/asan_interceptor.cpp
+++ b/source/loader/layers/sanitizer/asan_interceptor.cpp
@@ -837,7 +837,7 @@ ContextInfo::~ContextInfo() {
         getContext()->urDdiTable.Context.pfnRelease(Handle);
     assert(Result == UR_RESULT_SUCCESS);
 
-    if (getOptions().DetectLeaks) {
+    if (getContext()->interceptor->getOptions().DetectLeaks) {
         // check memory leaks
         std::vector<AllocationIterator> AllocInfos =
             getContext()->interceptor->findAllocInfoByContext(Handle);
diff --git a/source/loader/layers/sanitizer/asan_interceptor.hpp b/source/loader/layers/sanitizer/asan_interceptor.hpp
index e5429acd56..90e0d385b1 100644
--- a/source/loader/layers/sanitizer/asan_interceptor.hpp
+++ b/source/loader/layers/sanitizer/asan_interceptor.hpp
@@ -291,6 +291,7 @@ class SanitizerInterceptor {
                                   std::shared_ptr<DeviceInfo> &DeviceInfo);
 
   private:
+    AsanOptions m_Options;
     std::unordered_map<ur_context_handle_t, std::shared_ptr<ContextInfo>>
         m_ContextMap;
     ur_shared_mutex m_ContextMapMutex;
@@ -316,8 +317,6 @@ class SanitizerInterceptor {
 
     std::unique_ptr<Quarantine> m_Quarantine;
 
-    AsanOptions m_Options;
-
     std::unordered_set<ur_adapter_handle_t> m_Adapters;
     ur_shared_mutex m_AdaptersMutex;
 };

From 830992005aa8428aee6319708fb26ea0a4cbb758 Mon Sep 17 00:00:00 2001
From: "Wu, Yingcong" <yingcong.wu@intel.com>
Date: Mon, 28 Oct 2024 09:27:31 +0100
Subject: [PATCH 03/36] add comment

---
 source/loader/layers/sanitizer/asan_interceptor.hpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/source/loader/layers/sanitizer/asan_interceptor.hpp b/source/loader/layers/sanitizer/asan_interceptor.hpp
index 90e0d385b1..cc926ed545 100644
--- a/source/loader/layers/sanitizer/asan_interceptor.hpp
+++ b/source/loader/layers/sanitizer/asan_interceptor.hpp
@@ -291,6 +291,7 @@ class SanitizerInterceptor {
                                   std::shared_ptr<DeviceInfo> &DeviceInfo);
 
   private:
+    // m_Options may be used in other places, place it at the top
     AsanOptions m_Options;
     std::unordered_map<ur_context_handle_t, std::shared_ptr<ContextInfo>>
         m_ContextMap;

From d263f9d0e214c85e97afdba9aa52db5bbf6723ad Mon Sep 17 00:00:00 2001
From: Igor Chorazewicz <igor.chorazewicz@intel.com>
Date: Wed, 6 Nov 2024 00:07:16 +0100
Subject: [PATCH 04/36] [Common] fix parseDisjointPoolConfig

returning {} when EnableBuffers is false means
returning a defult-constructed DisjointPoolAllConfigs struct which
has EnableBuffers set to 1. Fix this by always returning properly
constructed DisjointPoolAllConfigs.
---
 source/common/umf_pools/disjoint_pool_config_parser.cpp | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/source/common/umf_pools/disjoint_pool_config_parser.cpp b/source/common/umf_pools/disjoint_pool_config_parser.cpp
index f1bb7cd40c..0e82072ae2 100644
--- a/source/common/umf_pools/disjoint_pool_config_parser.cpp
+++ b/source/common/umf_pools/disjoint_pool_config_parser.cpp
@@ -215,6 +215,8 @@ DisjointPoolAllConfigs parseDisjointPoolConfig(const std::string &config,
         }
     }
 
+    AllConfigs.EnableBuffers = EnableBuffers;
+
     AllConfigs.limits = std::shared_ptr<umf_disjoint_pool_shared_limits_t>(
         umfDisjointPoolSharedLimitsCreate(MaxSize),
         umfDisjointPoolSharedLimitsDestroy);
@@ -224,10 +226,6 @@ DisjointPoolAllConfigs parseDisjointPoolConfig(const std::string &config,
         Config.PoolTrace = trace;
     }
 
-    if (!EnableBuffers) {
-        return {};
-    }
-
     if (!trace) {
         return AllConfigs;
     }

From 7222f79b160681434250bac74f3aae0e4abc46f0 Mon Sep 17 00:00:00 2001
From: Victor Lomuller <victor@codeplay.com>
Date: Wed, 13 Nov 2024 15:50:28 +0000
Subject: [PATCH 05/36] Add new launch property to support
 work_group_scratch_memory

intel/llvm#15061 introduces a new property work_group_scratch_memory which allow the user to set a given amount of local memory to be used.

In order to pass this information to the adaptor, the patch adds a new launch property to urEnqueueKernelLaunchCustomExp.

The patch also changes the signature of urEnqueueKernelLaunchCustomExp to add global offset in order to maintain features when using this extension.

Signed-off-by: Victor Lomuller <victor@codeplay.com>
---
 include/ur_api.h                              |  15 ++-
 include/ur_ddi.h                              |   1 +
 include/ur_print.hpp                          |  16 +++
 scripts/core/exp-launch-properties.yml        |  13 +-
 source/adapters/cuda/enqueue.cpp              | 111 +++++++++++-------
 source/adapters/cuda/kernel.hpp               |  45 +++++--
 source/adapters/level_zero/queue.cpp          |   5 +-
 .../level_zero/ur_interface_loader.hpp        |   4 +-
 source/adapters/level_zero/v2/queue_api.cpp   |   6 +-
 source/adapters/level_zero/v2/queue_api.hpp   |   6 +-
 .../v2/queue_immediate_in_order.cpp           |   4 +-
 .../v2/queue_immediate_in_order.hpp           |   4 +-
 source/adapters/mock/ur_mockddi.cpp           |  23 +++-
 source/loader/layers/tracing/ur_trcddi.cpp    |  25 ++--
 source/loader/layers/validation/ur_valddi.cpp |  13 +-
 source/loader/ur_ldrddi.cpp                   |  35 +++++-
 source/loader/ur_libapi.cpp                   |  12 +-
 source/ur_api.cpp                             |   4 +
 .../launch_properties.cpp                     |   4 +-
 19 files changed, 252 insertions(+), 94 deletions(-)

diff --git a/include/ur_api.h b/include/ur_api.h
index 3205fcb207..8d8f03175a 100644
--- a/include/ur_api.h
+++ b/include/ur_api.h
@@ -9537,6 +9537,7 @@ typedef enum ur_exp_launch_property_id_t {
     UR_EXP_LAUNCH_PROPERTY_ID_IGNORE = 0,            ///< The property has no effect
     UR_EXP_LAUNCH_PROPERTY_ID_COOPERATIVE = 1,       ///< Whether to launch a cooperative kernel
     UR_EXP_LAUNCH_PROPERTY_ID_CLUSTER_DIMENSION = 2, ///< work-group cluster dimensions
+    UR_EXP_LAUNCH_PROPERTY_ID_WORK_GROUP_MEMORY = 3, ///< Implicit work group memory allocation
     /// @cond
     UR_EXP_LAUNCH_PROPERTY_ID_FORCE_UINT32 = 0x7fffffff
     /// @endcond
@@ -9550,10 +9551,12 @@ typedef enum ur_exp_launch_property_id_t {
 ///   _Analogues_
 ///     - **CUlaunchAttributeValue**
 typedef union ur_exp_launch_property_value_t {
-    uint32_t clusterDim[3]; ///< [in] dimensions of the cluster (units of work-group) (x, y, z). Each
-                            ///< value must be a divisor of the corresponding global work-size
-                            ///< dimension (in units of work-group).
-    int cooperative;        ///< [in] non-zero value indicates a cooperative kernel
+    uint32_t clusterDim[3];    ///< [in] dimensions of the cluster (units of work-group) (x, y, z). Each
+                               ///< value must be a divisor of the corresponding global work-size
+                               ///< dimension (in units of work-group).
+    int cooperative;           ///< [in] non-zero value indicates a cooperative kernel
+    size_t workgroup_mem_size; ///< [in] non-zero value indicates the amount of work group memory to
+                               ///< allocate in bytes
 
 } ur_exp_launch_property_value_t;
 
@@ -9594,6 +9597,7 @@ typedef struct ur_exp_launch_property_t {
 ///         + NULL == hQueue
 ///         + NULL == hKernel
 ///     - ::UR_RESULT_ERROR_INVALID_NULL_POINTER
+///         + `NULL == pGlobalWorkOffset`
 ///         + `NULL == pGlobalWorkSize`
 ///         + `NULL == launchPropList`
 ///         + NULL == pGlobalWorkSize
@@ -9622,6 +9626,8 @@ urEnqueueKernelLaunchCustomExp(
     ur_kernel_handle_t hKernel,                     ///< [in] handle of the kernel object
     uint32_t workDim,                               ///< [in] number of dimensions, from 1 to 3, to specify the global and
                                                     ///< work-group work-items
+    const size_t *pGlobalWorkOffset,                ///< [in] pointer to an array of workDim unsigned values that specify the
+                                                    ///< offset used to calculate the global ID of a work-item
     const size_t *pGlobalWorkSize,                  ///< [in] pointer to an array of workDim unsigned values that specify the
                                                     ///< number of global work-items in workDim that will execute the kernel
                                                     ///< function
@@ -11531,6 +11537,7 @@ typedef struct ur_enqueue_kernel_launch_custom_exp_params_t {
     ur_queue_handle_t *phQueue;
     ur_kernel_handle_t *phKernel;
     uint32_t *pworkDim;
+    const size_t **ppGlobalWorkOffset;
     const size_t **ppGlobalWorkSize;
     const size_t **ppLocalWorkSize;
     uint32_t *pnumPropsInLaunchPropList;
diff --git a/include/ur_ddi.h b/include/ur_ddi.h
index 40a6c5c269..cdf90eda6d 100644
--- a/include/ur_ddi.h
+++ b/include/ur_ddi.h
@@ -1467,6 +1467,7 @@ typedef ur_result_t(UR_APICALL *ur_pfnEnqueueKernelLaunchCustomExp_t)(
     uint32_t,
     const size_t *,
     const size_t *,
+    const size_t *,
     uint32_t,
     const ur_exp_launch_property_t *,
     uint32_t,
diff --git a/include/ur_print.hpp b/include/ur_print.hpp
index 1d28b8eac0..36eac7fe74 100644
--- a/include/ur_print.hpp
+++ b/include/ur_print.hpp
@@ -10319,6 +10319,9 @@ inline std::ostream &operator<<(std::ostream &os, enum ur_exp_launch_property_id
     case UR_EXP_LAUNCH_PROPERTY_ID_CLUSTER_DIMENSION:
         os << "UR_EXP_LAUNCH_PROPERTY_ID_CLUSTER_DIMENSION";
         break;
+    case UR_EXP_LAUNCH_PROPERTY_ID_WORK_GROUP_MEMORY:
+        os << "UR_EXP_LAUNCH_PROPERTY_ID_WORK_GROUP_MEMORY";
+        break;
     default:
         os << "unknown enumerator";
         break;
@@ -10355,6 +10358,13 @@ inline ur_result_t printUnion(
 
         os << (params.cooperative);
 
+        break;
+    case UR_EXP_LAUNCH_PROPERTY_ID_WORK_GROUP_MEMORY:
+
+        os << ".workgroup_mem_size = ";
+
+        os << (params.workgroup_mem_size);
+
         break;
     default:
         os << "<unknown>";
@@ -15022,6 +15032,12 @@ inline std::ostream &operator<<(std::ostream &os, [[maybe_unused]] const struct
 
     os << *(params->pworkDim);
 
+    os << ", ";
+    os << ".pGlobalWorkOffset = ";
+
+    ur::details::printPtr(os,
+                          *(params->ppGlobalWorkOffset));
+
     os << ", ";
     os << ".pGlobalWorkSize = ";
 
diff --git a/scripts/core/exp-launch-properties.yml b/scripts/core/exp-launch-properties.yml
index 9e66e9ea06..ca28421815 100644
--- a/scripts/core/exp-launch-properties.yml
+++ b/scripts/core/exp-launch-properties.yml
@@ -29,6 +29,8 @@ etors:
       desc: "Whether to launch a cooperative kernel"
     - name: CLUSTER_DIMENSION
       desc: "work-group cluster dimensions"
+    - name: WORK_GROUP_MEMORY
+      desc: "Implicit work group memory allocation"
 --- #--------------------------------------------------------------------------
 type: union
 desc: "Specifies a launch property value"
@@ -45,6 +47,10 @@ members:
       name: cooperative
       desc: "[in] non-zero value indicates a cooperative kernel"
       tag: $X_EXP_LAUNCH_PROPERTY_ID_COOPERATIVE
+    - type: size_t
+      name: workgroup_mem_size
+      desc: "[in] non-zero value indicates the amount of work group memory to allocate in bytes"
+      tag: $X_EXP_LAUNCH_PROPERTY_ID_WORK_GROUP_MEMORY
 --- #--------------------------------------------------------------------------
 type: struct
 desc: "Kernel launch property"
@@ -82,6 +88,9 @@ params:
     - type: uint32_t
       name: workDim
       desc: "[in] number of dimensions, from 1 to 3, to specify the global and work-group work-items"
+    - type: "const size_t*"
+      name: pGlobalWorkOffset
+      desc: "[in] pointer to an array of workDim unsigned values that specify the offset used to calculate the global ID of a work-item"
     - type: const size_t*
       name: pGlobalWorkSize
       desc: "[in] pointer to an array of workDim unsigned values that specify the number of global work-items in workDim that will execute the kernel function"
@@ -97,10 +106,10 @@ params:
     - type: uint32_t
       name: numEventsInWaitList
       desc: "[in] size of the event wait list"
-    - type: const ur_event_handle_t*
+    - type: const $x_event_handle_t*
       name: phEventWaitList
       desc: "[in][optional][range(0, numEventsInWaitList)] pointer to a list of events that must be complete before the kernel execution. If nullptr, the numEventsInWaitList must be 0, indicating that no wait event. "
-    - type: ur_event_handle_t*
+    - type: $x_event_handle_t*
       name: phEvent
       desc: "[out][optional] return an event object that identifies this particular kernel execution instance. If phEventWaitList and phEvent are not NULL, phEvent must not refer to an element of the phEventWaitList array."
 returns:
diff --git a/source/adapters/cuda/enqueue.cpp b/source/adapters/cuda/enqueue.cpp
index fc3d0220e8..a3fc58dfb9 100644
--- a/source/adapters/cuda/enqueue.cpp
+++ b/source/adapters/cuda/enqueue.cpp
@@ -422,11 +422,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueEventsWait(
                                         phEventWaitList, phEvent);
 }
 
-UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
-    ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim,
-    const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize,
-    const size_t *pLocalWorkSize, uint32_t numEventsInWaitList,
-    const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) {
+static ur_result_t
+enqueueKernelLaunch(ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel,
+                    uint32_t workDim, const size_t *pGlobalWorkOffset,
+                    const size_t *pGlobalWorkSize, const size_t *pLocalWorkSize,
+                    uint32_t numEventsInWaitList,
+                    const ur_event_handle_t *phEventWaitList,
+                    ur_event_handle_t *phEvent, size_t WorkGroupMemory) {
   // Preconditions
   UR_ASSERT(hQueue->getDevice() == hKernel->getProgram()->getDevice(),
             UR_RESULT_ERROR_INVALID_KERNEL);
@@ -444,6 +446,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
   size_t ThreadsPerBlock[3] = {32u, 1u, 1u};
   size_t BlocksPerGrid[3] = {1u, 1u, 1u};
 
+  // Set work group memory so we can compute the whole memory requirement
+  if (WorkGroupMemory)
+    hKernel->setWorkGroupMemory(WorkGroupMemory);
   uint32_t LocalSize = hKernel->getLocalSize();
   CUfunction CuFunc = hKernel->get();
 
@@ -506,6 +511,17 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
   return UR_RESULT_SUCCESS;
 }
 
+UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
+    ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim,
+    const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize,
+    const size_t *pLocalWorkSize, uint32_t numEventsInWaitList,
+    const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) {
+  return enqueueKernelLaunch(hQueue, hKernel, workDim, pGlobalWorkOffset,
+                             pGlobalWorkSize, pLocalWorkSize,
+                             numEventsInWaitList, phEventWaitList, phEvent,
+                             /*WorkGroupMemory=*/0);
+}
+
 UR_APIEXPORT ur_result_t UR_APICALL urEnqueueCooperativeKernelLaunchExp(
     ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim,
     const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize,
@@ -516,8 +532,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueCooperativeKernelLaunchExp(
     coop_prop.id = UR_EXP_LAUNCH_PROPERTY_ID_COOPERATIVE;
     coop_prop.value.cooperative = 1;
     return urEnqueueKernelLaunchCustomExp(
-        hQueue, hKernel, workDim, pGlobalWorkSize, pLocalWorkSize, 1,
-        &coop_prop, numEventsInWaitList, phEventWaitList, phEvent);
+        hQueue, hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize,
+        pLocalWorkSize, 1, &coop_prop, numEventsInWaitList, phEventWaitList,
+        phEvent);
   }
   return urEnqueueKernelLaunch(hQueue, hKernel, workDim, pGlobalWorkOffset,
                                pGlobalWorkSize, pLocalWorkSize,
@@ -526,16 +543,29 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueCooperativeKernelLaunchExp(
 
 UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
     ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim,
-    const size_t *pGlobalWorkSize, const size_t *pLocalWorkSize,
-    uint32_t numPropsInLaunchPropList,
+    const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize,
+    const size_t *pLocalWorkSize, uint32_t numPropsInLaunchPropList,
     const ur_exp_launch_property_t *launchPropList,
     uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList,
     ur_event_handle_t *phEvent) {
 
-  if (numPropsInLaunchPropList == 0) {
-    urEnqueueKernelLaunch(hQueue, hKernel, workDim, nullptr, pGlobalWorkSize,
-                          pLocalWorkSize, numEventsInWaitList, phEventWaitList,
-                          phEvent);
+  size_t WorkGroupMemory = [&]() -> size_t {
+    const ur_exp_launch_property_t *WorkGroupMemoryProp = std::find_if(
+        launchPropList, launchPropList + numPropsInLaunchPropList,
+        [](const ur_exp_launch_property_t &Prop) {
+          return Prop.id == UR_EXP_LAUNCH_PROPERTY_ID_WORK_GROUP_MEMORY;
+        });
+    if (WorkGroupMemoryProp != launchPropList + numPropsInLaunchPropList)
+      return WorkGroupMemoryProp->value.workgroup_mem_size;
+    return 0;
+  }();
+
+  if (numPropsInLaunchPropList == 0 ||
+      (WorkGroupMemory && numPropsInLaunchPropList == 1)) {
+    return enqueueKernelLaunch(hQueue, hKernel, workDim, pGlobalWorkOffset,
+                               pGlobalWorkSize, pLocalWorkSize,
+                               numEventsInWaitList, phEventWaitList, phEvent,
+                               WorkGroupMemory);
   }
 #if CUDA_VERSION >= 11080
   // Preconditions
@@ -548,7 +578,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
     return UR_RESULT_ERROR_INVALID_NULL_POINTER;
   }
 
-  std::vector<CUlaunchAttribute> launch_attribute(numPropsInLaunchPropList);
+  std::vector<CUlaunchAttribute> launch_attribute;
+  launch_attribute.reserve(numPropsInLaunchPropList);
 
   // Early exit for zero size kernel
   if (*pGlobalWorkSize == 0) {
@@ -561,40 +592,35 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
   size_t ThreadsPerBlock[3] = {32u, 1u, 1u};
   size_t BlocksPerGrid[3] = {1u, 1u, 1u};
 
+  // Set work group memory so we can compute the whole memory requirement
+  if (WorkGroupMemory)
+    hKernel->setWorkGroupMemory(WorkGroupMemory);
   uint32_t LocalSize = hKernel->getLocalSize();
   CUfunction CuFunc = hKernel->get();
 
   for (uint32_t i = 0; i < numPropsInLaunchPropList; i++) {
     switch (launchPropList[i].id) {
     case UR_EXP_LAUNCH_PROPERTY_ID_IGNORE: {
-      launch_attribute[i].id = CU_LAUNCH_ATTRIBUTE_IGNORE;
+      auto &attr = launch_attribute.emplace_back();
+      attr.id = CU_LAUNCH_ATTRIBUTE_IGNORE;
       break;
     }
     case UR_EXP_LAUNCH_PROPERTY_ID_CLUSTER_DIMENSION: {
-
-      launch_attribute[i].id = CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION;
+      auto &attr = launch_attribute.emplace_back();
+      attr.id = CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION;
       // Note that cuda orders from right to left wrt SYCL dimensional order.
       if (workDim == 3) {
-        launch_attribute[i].value.clusterDim.x =
-            launchPropList[i].value.clusterDim[2];
-        launch_attribute[i].value.clusterDim.y =
-            launchPropList[i].value.clusterDim[1];
-        launch_attribute[i].value.clusterDim.z =
-            launchPropList[i].value.clusterDim[0];
+        attr.value.clusterDim.x = launchPropList[i].value.clusterDim[2];
+        attr.value.clusterDim.y = launchPropList[i].value.clusterDim[1];
+        attr.value.clusterDim.z = launchPropList[i].value.clusterDim[0];
       } else if (workDim == 2) {
-        launch_attribute[i].value.clusterDim.x =
-            launchPropList[i].value.clusterDim[1];
-        launch_attribute[i].value.clusterDim.y =
-            launchPropList[i].value.clusterDim[0];
-        launch_attribute[i].value.clusterDim.z =
-            launchPropList[i].value.clusterDim[2];
+        attr.value.clusterDim.x = launchPropList[i].value.clusterDim[1];
+        attr.value.clusterDim.y = launchPropList[i].value.clusterDim[0];
+        attr.value.clusterDim.z = launchPropList[i].value.clusterDim[2];
       } else {
-        launch_attribute[i].value.clusterDim.x =
-            launchPropList[i].value.clusterDim[0];
-        launch_attribute[i].value.clusterDim.y =
-            launchPropList[i].value.clusterDim[1];
-        launch_attribute[i].value.clusterDim.z =
-            launchPropList[i].value.clusterDim[2];
+        attr.value.clusterDim.x = launchPropList[i].value.clusterDim[0];
+        attr.value.clusterDim.y = launchPropList[i].value.clusterDim[1];
+        attr.value.clusterDim.z = launchPropList[i].value.clusterDim[2];
       }
 
       UR_CHECK_ERROR(cuFuncSetAttribute(
@@ -603,9 +629,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
       break;
     }
     case UR_EXP_LAUNCH_PROPERTY_ID_COOPERATIVE: {
-      launch_attribute[i].id = CU_LAUNCH_ATTRIBUTE_COOPERATIVE;
-      launch_attribute[i].value.cooperative =
-          launchPropList[i].value.cooperative;
+      auto &attr = launch_attribute.emplace_back();
+      attr.id = CU_LAUNCH_ATTRIBUTE_COOPERATIVE;
+      attr.value.cooperative = launchPropList[i].value.cooperative;
+      break;
+    }
+    case UR_EXP_LAUNCH_PROPERTY_ID_WORK_GROUP_MEMORY: {
       break;
     }
     default: {
@@ -618,8 +647,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
   // using the standard UR_CHECK_ERROR
   if (ur_result_t Ret =
           setKernelParams(hQueue->getContext(), hQueue->Device, workDim,
-                          nullptr, pGlobalWorkSize, pLocalWorkSize, hKernel,
-                          CuFunc, ThreadsPerBlock, BlocksPerGrid);
+                          pGlobalWorkOffset, pGlobalWorkSize, pLocalWorkSize,
+                          hKernel, CuFunc, ThreadsPerBlock, BlocksPerGrid);
       Ret != UR_RESULT_SUCCESS)
     return Ret;
 
@@ -667,7 +696,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
     launch_config.sharedMemBytes = LocalSize;
     launch_config.hStream = CuStream;
     launch_config.attrs = &launch_attribute[0];
-    launch_config.numAttrs = numPropsInLaunchPropList;
+    launch_config.numAttrs = launch_attribute.size();
 
     UR_CHECK_ERROR(cuLaunchKernelEx(&launch_config, CuFunc,
                                     const_cast<void **>(ArgIndices.data()),
diff --git a/source/adapters/cuda/kernel.hpp b/source/adapters/cuda/kernel.hpp
index 7ad20a4f0e..c3b9fae09a 100644
--- a/source/adapters/cuda/kernel.hpp
+++ b/source/adapters/cuda/kernel.hpp
@@ -65,6 +65,8 @@ struct ur_kernel_handle_t_ {
     args_size_t ParamSizes;
     args_index_t Indices;
     args_size_t OffsetPerIndex;
+    size_t WorkGroupMemory = 0;
+
     // A struct to keep track of memargs so that we can do dependency analysis
     // at urEnqueueKernelLaunch
     struct mem_obj_arg {
@@ -105,22 +107,28 @@ struct ur_kernel_handle_t_ {
       OffsetPerIndex[Index] = LocalSize;
     }
 
-    void addLocalArg(size_t Index, size_t Size) {
-      size_t LocalOffset = this->getLocalSize();
-
-      // maximum required alignment is the size of the largest vector type
-      const size_t MaxAlignment = sizeof(double) * 16;
+    // maximum required alignment is the size of the largest vector type
+    static constexpr size_t MaxAlignment = sizeof(double) * 16;
 
+    static size_t alignMemoryAllocation(size_t Size, size_t Offset) {
       // for arguments smaller than the maximum alignment simply align to the
       // size of the argument
       const size_t Alignment = std::min(MaxAlignment, Size);
 
       // align the argument
-      size_t AlignedLocalOffset = LocalOffset;
-      size_t Pad = LocalOffset % Alignment;
+      size_t AlignedLocalOffset = Offset;
+      size_t Pad = Offset % Alignment;
       if (Pad != 0) {
         AlignedLocalOffset += Alignment - Pad;
       }
+      return AlignedLocalOffset;
+    }
+
+    void addLocalArg(size_t Index, size_t Size) {
+      size_t LocalOffset = this->getLocalSize();
+
+      // align the argument
+      size_t AlignedLocalOffset = alignMemoryAllocation(Size, LocalOffset);
 
       addArg(Index, sizeof(size_t), (const void *)&(AlignedLocalOffset),
              Size + (AlignedLocalOffset - LocalOffset));
@@ -140,6 +148,24 @@ struct ur_kernel_handle_t_ {
       MemObjArgs.push_back(arguments::mem_obj_arg{hMem, Index, Flags});
     }
 
+    void setWorkGroupMemory(size_t MemSize) {
+      assert(WorkGroupMemory == 0 &&
+             "Work Group Memory size can only be set once");
+      // Ensure first offset is MaxAlignment aligned
+      WorkGroupMemory = alignMemoryAllocation(MaxAlignment, MemSize);
+
+      // Adjust local accessor setting
+      // the dynamic memory will start at offset 0 (allows us to keep accessing
+      // local memory as a GV) and accessors will use the rest of the range
+      for (size_t i = 0; i < OffsetPerIndex.size(); i++) {
+        // if offset is 0, it is not a local accessor argument.
+        if (!OffsetPerIndex[i])
+          continue;
+        assert(ParamSizes[i] == sizeof(size_t) && "Offset should be a size_t");
+        *reinterpret_cast<size_t *>(Indices[i]) += WorkGroupMemory;
+      }
+    }
+
     void setImplicitOffset(size_t Size, std::uint32_t *ImplicitOffset) {
       assert(Size == sizeof(std::uint32_t) * 3);
       std::memcpy(ImplicitOffsetArgs, ImplicitOffset, Size);
@@ -147,13 +173,15 @@ struct ur_kernel_handle_t_ {
 
     void clearLocalSize() {
       std::fill(std::begin(OffsetPerIndex), std::end(OffsetPerIndex), 0);
+      WorkGroupMemory = 0;
     }
 
     const args_index_t &getIndices() const noexcept { return Indices; }
 
     uint32_t getLocalSize() const {
       return std::accumulate(std::begin(OffsetPerIndex),
-                             std::end(OffsetPerIndex), 0);
+                             std::end(OffsetPerIndex), 0) +
+             WorkGroupMemory;
     }
   } Args;
 
@@ -238,6 +266,7 @@ struct ur_kernel_handle_t_ {
     return Args.getIndices();
   }
 
+  void setWorkGroupMemory(size_t MemSize) { Args.setWorkGroupMemory(MemSize); }
   uint32_t getLocalSize() const noexcept { return Args.getLocalSize(); }
 
   void clearLocalSize() { Args.clearLocalSize(); }
diff --git a/source/adapters/level_zero/queue.cpp b/source/adapters/level_zero/queue.cpp
index c4598f3472..95c8d026a7 100644
--- a/source/adapters/level_zero/queue.cpp
+++ b/source/adapters/level_zero/queue.cpp
@@ -902,14 +902,15 @@ ur_result_t urQueueFlush(
 
 ur_result_t urEnqueueKernelLaunchCustomExp(
     ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim,
-    const size_t *pGlobalWorkSize, const size_t *pLocalWorkSize,
-    uint32_t numPropsInLaunchPropList,
+    const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize,
+    const size_t *pLocalWorkSize, uint32_t numPropsInLaunchPropList,
     const ur_exp_launch_property_t *launchPropList,
     uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList,
     ur_event_handle_t *phEvent) {
   std::ignore = hQueue;
   std::ignore = hKernel;
   std::ignore = workDim;
+  std::ignore = pGlobalWorkOffset;
   std::ignore = pGlobalWorkSize;
   std::ignore = pLocalWorkSize;
   std::ignore = numPropsInLaunchPropList;
diff --git a/source/adapters/level_zero/ur_interface_loader.hpp b/source/adapters/level_zero/ur_interface_loader.hpp
index 1215d6449e..0832303b50 100644
--- a/source/adapters/level_zero/ur_interface_loader.hpp
+++ b/source/adapters/level_zero/ur_interface_loader.hpp
@@ -694,8 +694,8 @@ ur_result_t urEnqueueTimestampRecordingExp(
     const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent);
 ur_result_t urEnqueueKernelLaunchCustomExp(
     ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim,
-    const size_t *pGlobalWorkSize, const size_t *pLocalWorkSize,
-    uint32_t numPropsInLaunchPropList,
+    const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize,
+    const size_t *pLocalWorkSize, uint32_t numPropsInLaunchPropList,
     const ur_exp_launch_property_t *launchPropList,
     uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList,
     ur_event_handle_t *phEvent);
diff --git a/source/adapters/level_zero/v2/queue_api.cpp b/source/adapters/level_zero/v2/queue_api.cpp
index b7b45625a2..e4659b5f2c 100644
--- a/source/adapters/level_zero/v2/queue_api.cpp
+++ b/source/adapters/level_zero/v2/queue_api.cpp
@@ -391,13 +391,13 @@ ur_result_t urEnqueueTimestampRecordingExp(
 }
 ur_result_t urEnqueueKernelLaunchCustomExp(
     ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim,
-    const size_t *pGlobalWorkSize, const size_t *pLocalWorkSize,
-    uint32_t numPropsInLaunchPropList,
+    const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize,
+    const size_t *pLocalWorkSize, uint32_t numPropsInLaunchPropList,
     const ur_exp_launch_property_t *launchPropList,
     uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList,
     ur_event_handle_t *phEvent) try {
   return hQueue->enqueueKernelLaunchCustomExp(
-      hKernel, workDim, pGlobalWorkSize, pLocalWorkSize,
+      hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, pLocalWorkSize,
       numPropsInLaunchPropList, launchPropList, numEventsInWaitList,
       phEventWaitList, phEvent);
 } catch (...) {
diff --git a/source/adapters/level_zero/v2/queue_api.hpp b/source/adapters/level_zero/v2/queue_api.hpp
index 7cb039ccdd..c59f084fc4 100644
--- a/source/adapters/level_zero/v2/queue_api.hpp
+++ b/source/adapters/level_zero/v2/queue_api.hpp
@@ -144,9 +144,9 @@ struct ur_queue_handle_t_ {
                                                    const ur_event_handle_t *,
                                                    ur_event_handle_t *) = 0;
   virtual ur_result_t enqueueKernelLaunchCustomExp(
-      ur_kernel_handle_t, uint32_t, const size_t *, const size_t *, uint32_t,
-      const ur_exp_launch_property_t *, uint32_t, const ur_event_handle_t *,
-      ur_event_handle_t *) = 0;
+      ur_kernel_handle_t, uint32_t, const size_t *, const size_t *,
+      const size_t *, uint32_t, const ur_exp_launch_property_t *, uint32_t,
+      const ur_event_handle_t *, ur_event_handle_t *) = 0;
   virtual ur_result_t
   enqueueEventsWaitWithBarrierExt(const ur_exp_enqueue_ext_properties_t *,
                                   uint32_t, const ur_event_handle_t *,
diff --git a/source/adapters/level_zero/v2/queue_immediate_in_order.cpp b/source/adapters/level_zero/v2/queue_immediate_in_order.cpp
index a77903525d..89fe57eb8d 100644
--- a/source/adapters/level_zero/v2/queue_immediate_in_order.cpp
+++ b/source/adapters/level_zero/v2/queue_immediate_in_order.cpp
@@ -1067,13 +1067,15 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueTimestampRecordingExp(
 }
 
 ur_result_t ur_queue_immediate_in_order_t::enqueueKernelLaunchCustomExp(
-    ur_kernel_handle_t hKernel, uint32_t workDim, const size_t *pGlobalWorkSize,
+    ur_kernel_handle_t hKernel, uint32_t workDim,
+    const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize,
     const size_t *pLocalWorkSize, uint32_t numPropsInLaunchPropList,
     const ur_exp_launch_property_t *launchPropList,
     uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList,
     ur_event_handle_t *phEvent) {
   std::ignore = hKernel;
   std::ignore = workDim;
+  std::ignore = pGlobalWorkOffset;
   std::ignore = pGlobalWorkSize;
   std::ignore = pLocalWorkSize;
   std::ignore = numPropsInLaunchPropList;
diff --git a/source/adapters/level_zero/v2/queue_immediate_in_order.hpp b/source/adapters/level_zero/v2/queue_immediate_in_order.hpp
index 33e060ded3..bdd3009d63 100644
--- a/source/adapters/level_zero/v2/queue_immediate_in_order.hpp
+++ b/source/adapters/level_zero/v2/queue_immediate_in_order.hpp
@@ -263,8 +263,8 @@ struct ur_queue_immediate_in_order_t : _ur_object, public ur_queue_handle_t_ {
                                ur_event_handle_t *phEvent) override;
   ur_result_t enqueueKernelLaunchCustomExp(
       ur_kernel_handle_t hKernel, uint32_t workDim,
-      const size_t *pGlobalWorkSize, const size_t *pLocalWorkSize,
-      uint32_t numPropsInLaunchPropList,
+      const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize,
+      const size_t *pLocalWorkSize, uint32_t numPropsInLaunchPropList,
       const ur_exp_launch_property_t *launchPropList,
       uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList,
       ur_event_handle_t *phEvent) override;
diff --git a/source/adapters/mock/ur_mockddi.cpp b/source/adapters/mock/ur_mockddi.cpp
index 42c342444d..c8ce408756 100644
--- a/source/adapters/mock/ur_mockddi.cpp
+++ b/source/adapters/mock/ur_mockddi.cpp
@@ -10126,6 +10126,9 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
     uint32_t
         workDim, ///< [in] number of dimensions, from 1 to 3, to specify the global and
                  ///< work-group work-items
+    const size_t *
+        pGlobalWorkOffset, ///< [in] pointer to an array of workDim unsigned values that specify the
+    ///< offset used to calculate the global ID of a work-item
     const size_t *
         pGlobalWorkSize, ///< [in] pointer to an array of workDim unsigned values that specify the
     ///< number of global work-items in workDim that will execute the kernel
@@ -10153,11 +10156,17 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
     ur_result_t result = UR_RESULT_SUCCESS;
 
     ur_enqueue_kernel_launch_custom_exp_params_t params = {
-        &hQueue,          &hKernel,
-        &workDim,         &pGlobalWorkSize,
-        &pLocalWorkSize,  &numPropsInLaunchPropList,
-        &launchPropList,  &numEventsInWaitList,
-        &phEventWaitList, &phEvent};
+        &hQueue,
+        &hKernel,
+        &workDim,
+        &pGlobalWorkOffset,
+        &pGlobalWorkSize,
+        &pLocalWorkSize,
+        &numPropsInLaunchPropList,
+        &launchPropList,
+        &numEventsInWaitList,
+        &phEventWaitList,
+        &phEvent};
 
     auto beforeCallback = reinterpret_cast<ur_mock_callback_t>(
         mock::getCallbacks().get_before_callback(
@@ -10176,6 +10185,10 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
         result = replaceCallback(&params);
     } else {
 
+        // optional output handle
+        if (phEvent) {
+            *phEvent = mock::createDummyHandle<ur_event_handle_t>();
+        }
         result = UR_RESULT_SUCCESS;
     }
 
diff --git a/source/loader/layers/tracing/ur_trcddi.cpp b/source/loader/layers/tracing/ur_trcddi.cpp
index 64489c39ac..afd1411ae8 100644
--- a/source/loader/layers/tracing/ur_trcddi.cpp
+++ b/source/loader/layers/tracing/ur_trcddi.cpp
@@ -8698,6 +8698,9 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
     uint32_t
         workDim, ///< [in] number of dimensions, from 1 to 3, to specify the global and
                  ///< work-group work-items
+    const size_t *
+        pGlobalWorkOffset, ///< [in] pointer to an array of workDim unsigned values that specify the
+    ///< offset used to calculate the global ID of a work-item
     const size_t *
         pGlobalWorkSize, ///< [in] pointer to an array of workDim unsigned values that specify the
     ///< number of global work-items in workDim that will execute the kernel
@@ -8730,11 +8733,17 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
     }
 
     ur_enqueue_kernel_launch_custom_exp_params_t params = {
-        &hQueue,          &hKernel,
-        &workDim,         &pGlobalWorkSize,
-        &pLocalWorkSize,  &numPropsInLaunchPropList,
-        &launchPropList,  &numEventsInWaitList,
-        &phEventWaitList, &phEvent};
+        &hQueue,
+        &hKernel,
+        &workDim,
+        &pGlobalWorkOffset,
+        &pGlobalWorkSize,
+        &pLocalWorkSize,
+        &numPropsInLaunchPropList,
+        &launchPropList,
+        &numEventsInWaitList,
+        &phEventWaitList,
+        &phEvent};
     uint64_t instance =
         getContext()->notify_begin(UR_FUNCTION_ENQUEUE_KERNEL_LAUNCH_CUSTOM_EXP,
                                    "urEnqueueKernelLaunchCustomExp", &params);
@@ -8743,9 +8752,9 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
     logger.info("   ---> urEnqueueKernelLaunchCustomExp\n");
 
     ur_result_t result = pfnKernelLaunchCustomExp(
-        hQueue, hKernel, workDim, pGlobalWorkSize, pLocalWorkSize,
-        numPropsInLaunchPropList, launchPropList, numEventsInWaitList,
-        phEventWaitList, phEvent);
+        hQueue, hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize,
+        pLocalWorkSize, numPropsInLaunchPropList, launchPropList,
+        numEventsInWaitList, phEventWaitList, phEvent);
 
     getContext()->notify_end(UR_FUNCTION_ENQUEUE_KERNEL_LAUNCH_CUSTOM_EXP,
                              "urEnqueueKernelLaunchCustomExp", &params, &result,
diff --git a/source/loader/layers/validation/ur_valddi.cpp b/source/loader/layers/validation/ur_valddi.cpp
index 195c1d3c69..053d276f8c 100644
--- a/source/loader/layers/validation/ur_valddi.cpp
+++ b/source/loader/layers/validation/ur_valddi.cpp
@@ -9726,6 +9726,9 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
     uint32_t
         workDim, ///< [in] number of dimensions, from 1 to 3, to specify the global and
                  ///< work-group work-items
+    const size_t *
+        pGlobalWorkOffset, ///< [in] pointer to an array of workDim unsigned values that specify the
+    ///< offset used to calculate the global ID of a work-item
     const size_t *
         pGlobalWorkSize, ///< [in] pointer to an array of workDim unsigned values that specify the
     ///< number of global work-items in workDim that will execute the kernel
@@ -9766,6 +9769,10 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
             return UR_RESULT_ERROR_INVALID_NULL_HANDLE;
         }
 
+        if (NULL == pGlobalWorkOffset) {
+            return UR_RESULT_ERROR_INVALID_NULL_POINTER;
+        }
+
         if (NULL == pGlobalWorkSize) {
             return UR_RESULT_ERROR_INVALID_NULL_POINTER;
         }
@@ -9794,9 +9801,9 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
     }
 
     ur_result_t result = pfnKernelLaunchCustomExp(
-        hQueue, hKernel, workDim, pGlobalWorkSize, pLocalWorkSize,
-        numPropsInLaunchPropList, launchPropList, numEventsInWaitList,
-        phEventWaitList, phEvent);
+        hQueue, hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize,
+        pLocalWorkSize, numPropsInLaunchPropList, launchPropList,
+        numEventsInWaitList, phEventWaitList, phEvent);
 
     return result;
 }
diff --git a/source/loader/ur_ldrddi.cpp b/source/loader/ur_ldrddi.cpp
index 86a6ad95a0..602b8f1a82 100644
--- a/source/loader/ur_ldrddi.cpp
+++ b/source/loader/ur_ldrddi.cpp
@@ -8866,6 +8866,9 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
     uint32_t
         workDim, ///< [in] number of dimensions, from 1 to 3, to specify the global and
                  ///< work-group work-items
+    const size_t *
+        pGlobalWorkOffset, ///< [in] pointer to an array of workDim unsigned values that specify the
+    ///< offset used to calculate the global ID of a work-item
     const size_t *
         pGlobalWorkSize, ///< [in] pointer to an array of workDim unsigned values that specify the
     ///< number of global work-items in workDim that will execute the kernel
@@ -8908,11 +8911,35 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
     // convert loader handle to platform handle
     hKernel = reinterpret_cast<ur_kernel_object_t *>(hKernel)->handle;
 
+    // convert loader handles to platform handles
+    auto phEventWaitListLocal =
+        std::vector<ur_event_handle_t>(numEventsInWaitList);
+    for (size_t i = 0; i < numEventsInWaitList; ++i) {
+        phEventWaitListLocal[i] =
+            reinterpret_cast<ur_event_object_t *>(phEventWaitList[i])->handle;
+    }
+
     // forward to device-platform
-    result = pfnKernelLaunchCustomExp(hQueue, hKernel, workDim, pGlobalWorkSize,
-                                      pLocalWorkSize, numPropsInLaunchPropList,
-                                      launchPropList, numEventsInWaitList,
-                                      phEventWaitList, phEvent);
+    result = pfnKernelLaunchCustomExp(
+        hQueue, hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize,
+        pLocalWorkSize, numPropsInLaunchPropList, launchPropList,
+        numEventsInWaitList, phEventWaitListLocal.data(), phEvent);
+
+    // In the event of ERROR_ADAPTER_SPECIFIC we should still attempt to wrap any output handles below.
+    if (UR_RESULT_SUCCESS != result &&
+        UR_RESULT_ERROR_ADAPTER_SPECIFIC != result) {
+        return result;
+    }
+    try {
+        // convert platform handle to loader handle
+        if (nullptr != phEvent) {
+            *phEvent = reinterpret_cast<ur_event_handle_t>(
+                context->factories.ur_event_factory.getInstance(*phEvent,
+                                                                dditable));
+        }
+    } catch (std::bad_alloc &) {
+        result = UR_RESULT_ERROR_OUT_OF_HOST_MEMORY;
+    }
 
     return result;
 }
diff --git a/source/loader/ur_libapi.cpp b/source/loader/ur_libapi.cpp
index 8dca26d4ba..9fe178a998 100644
--- a/source/loader/ur_libapi.cpp
+++ b/source/loader/ur_libapi.cpp
@@ -8992,6 +8992,7 @@ ur_result_t UR_APICALL urEnqueueTimestampRecordingExp(
 ///         + NULL == hQueue
 ///         + NULL == hKernel
 ///     - ::UR_RESULT_ERROR_INVALID_NULL_POINTER
+///         + `NULL == pGlobalWorkOffset`
 ///         + `NULL == pGlobalWorkSize`
 ///         + `NULL == launchPropList`
 ///         + NULL == pGlobalWorkSize
@@ -9020,6 +9021,9 @@ ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
     uint32_t
         workDim, ///< [in] number of dimensions, from 1 to 3, to specify the global and
                  ///< work-group work-items
+    const size_t *
+        pGlobalWorkOffset, ///< [in] pointer to an array of workDim unsigned values that specify the
+    ///< offset used to calculate the global ID of a work-item
     const size_t *
         pGlobalWorkSize, ///< [in] pointer to an array of workDim unsigned values that specify the
     ///< number of global work-items in workDim that will execute the kernel
@@ -9050,10 +9054,10 @@ ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
         return UR_RESULT_ERROR_UNINITIALIZED;
     }
 
-    return pfnKernelLaunchCustomExp(hQueue, hKernel, workDim, pGlobalWorkSize,
-                                    pLocalWorkSize, numPropsInLaunchPropList,
-                                    launchPropList, numEventsInWaitList,
-                                    phEventWaitList, phEvent);
+    return pfnKernelLaunchCustomExp(
+        hQueue, hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize,
+        pLocalWorkSize, numPropsInLaunchPropList, launchPropList,
+        numEventsInWaitList, phEventWaitList, phEvent);
 } catch (...) {
     return exceptionToResult(std::current_exception());
 }
diff --git a/source/ur_api.cpp b/source/ur_api.cpp
index 22c76f122e..9b2c177a85 100644
--- a/source/ur_api.cpp
+++ b/source/ur_api.cpp
@@ -7625,6 +7625,7 @@ ur_result_t UR_APICALL urEnqueueTimestampRecordingExp(
 ///         + NULL == hQueue
 ///         + NULL == hKernel
 ///     - ::UR_RESULT_ERROR_INVALID_NULL_POINTER
+///         + `NULL == pGlobalWorkOffset`
 ///         + `NULL == pGlobalWorkSize`
 ///         + `NULL == launchPropList`
 ///         + NULL == pGlobalWorkSize
@@ -7653,6 +7654,9 @@ ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
     uint32_t
         workDim, ///< [in] number of dimensions, from 1 to 3, to specify the global and
                  ///< work-group work-items
+    const size_t *
+        pGlobalWorkOffset, ///< [in] pointer to an array of workDim unsigned values that specify the
+    ///< offset used to calculate the global ID of a work-item
     const size_t *
         pGlobalWorkSize, ///< [in] pointer to an array of workDim unsigned values that specify the
     ///< number of global work-items in workDim that will execute the kernel
diff --git a/test/conformance/exp_launch_properties/launch_properties.cpp b/test/conformance/exp_launch_properties/launch_properties.cpp
index a54a44ecaf..23ba56ff4b 100644
--- a/test/conformance/exp_launch_properties/launch_properties.cpp
+++ b/test/conformance/exp_launch_properties/launch_properties.cpp
@@ -95,8 +95,8 @@ TEST_P(urEnqueueKernelLaunchCustomTest, Success) {
     AddPodArg(val);
 
     ASSERT_SUCCESS(urEnqueueKernelLaunchCustomExp(
-        queue, kernel, n_dimensions, &global_size, nullptr, 1, &props[0], 0,
-        nullptr, nullptr));
+        queue, kernel, n_dimensions, &global_offset, &global_size, nullptr, 1,
+        &props[0], 0, nullptr, nullptr));
     ASSERT_SUCCESS(urQueueFinish(queue));
     ValidateBuffer(buffer, sizeof(val) * global_size, val);
 }

From c1cb2fcbd98d6e45dba3edf0b92a74b126a7fb30 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C5=81ukasz=20Stolarczuk?= <lukasz.stolarczuk@intel.com>
Date: Mon, 25 Nov 2024 12:24:44 +0100
Subject: [PATCH 06/36] [comon] Bump UMF version

In comparison to the previous version it contains only a few
extra UMF PRs. This version is not taken from UMF's main branch,
but rather from special 'umf-fixes-nov-pulldown'.
---
 source/common/CMakeLists.txt | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/source/common/CMakeLists.txt b/source/common/CMakeLists.txt
index df3fb95262..e24d987567 100644
--- a/source/common/CMakeLists.txt
+++ b/source/common/CMakeLists.txt
@@ -32,8 +32,11 @@ if (NOT DEFINED UMF_REPO)
 endif()
 
 if (NOT DEFINED UMF_TAG)
-    # main 28.10.2024: Merge pull request #832 ...
-    set(UMF_TAG 43e9af0f50b70ccb989f786243881035dd829203)
+    # special branch with cherry-picks for incoming pulldown
+    # contains UMF PRs: #866, #924, and #930
+    # branch was based on commit: 3bae087c9a8c0cbed5bde40f0d5a2
+    # umf-fixes-nov-pulldown: 25.11.2024: Disable libudev in hwloc builds
+    set(UMF_TAG a7b6152b7b095c88ddf34bc7d442eb4c2b3f74d6)
 endif()
 
 message(STATUS "Will fetch Unified Memory Framework from ${UMF_REPO}")

From 9950396a45ba15beb548f9aecae77aa8faa89783 Mon Sep 17 00:00:00 2001
From: Igor Chorazewicz <igor.chorazewicz@intel.com>
Date: Wed, 20 Nov 2024 21:06:51 +0000
Subject: [PATCH 07/36] [Benchmarks] support CPU counter measurements

for api_overhead_benchmark_ur
---
 scripts/benchmarks/benches/base.py      |  3 --
 scripts/benchmarks/benches/compute.py   | 42 ++++++++++++++++---------
 scripts/benchmarks/benches/llamacpp.py  |  5 +--
 scripts/benchmarks/benches/result.py    |  2 +-
 scripts/benchmarks/benches/syclbench.py |  6 ++--
 scripts/benchmarks/benches/test.py      |  5 +--
 scripts/benchmarks/benches/velocity.py  | 35 ++++++---------------
 scripts/benchmarks/main.py              |  3 +-
 8 files changed, 43 insertions(+), 58 deletions(-)

diff --git a/scripts/benchmarks/benches/base.py b/scripts/benchmarks/benches/base.py
index 84e1b8287c..feeaa568b6 100644
--- a/scripts/benchmarks/benches/base.py
+++ b/scripts/benchmarks/benches/base.py
@@ -57,9 +57,6 @@ def download(self, name, url, file, untar = False):
     def name(self):
         raise NotImplementedError()
 
-    def unit(self):
-        raise NotImplementedError()
-
     def lower_is_better(self):
         return True
 
diff --git a/scripts/benchmarks/benches/compute.py b/scripts/benchmarks/benches/compute.py
index 57bed7624a..74562b550c 100644
--- a/scripts/benchmarks/benches/compute.py
+++ b/scripts/benchmarks/benches/compute.py
@@ -19,7 +19,7 @@ def setup(self):
         if options.sycl is None:
             return
 
-        repo_path = git_clone(self.directory, "compute-benchmarks-repo", "https://github.com/intel/compute-benchmarks.git", "c80ddec9f0b4905bcbeb0f264f710093dc70340d")
+        repo_path = git_clone(self.directory, "compute-benchmarks-repo", "https://github.com/intel/compute-benchmarks.git", "df38bc342641d7e83fbb4fe764a23d21d734e07b")
         build_path = create_build_path(self.directory, 'compute-benchmarks-build')
 
         configure_command = [
@@ -77,6 +77,13 @@ def benchmarks(self) -> list[Benchmark]:
 
         return benches
 
+def parse_unit_type(compute_unit):
+    if "[count]" in compute_unit:
+        return "instr"
+    elif "[us]" in compute_unit:
+        return "μs"
+    return "unknown"
+
 class ComputeBenchmark(Benchmark):
     def __init__(self, bench, name, test):
         self.bench = bench
@@ -90,9 +97,6 @@ def bin_args(self) -> list[str]:
     def extra_env_vars(self) -> dict:
         return {}
 
-    def unit(self):
-        return "μs"
-
     def setup(self):
         self.benchmark_bin = os.path.join(self.bench.directory, 'compute-benchmarks-build', 'bin', self.bench_name)
 
@@ -108,22 +112,32 @@ def run(self, env_vars) -> list[Result]:
         env_vars.update(self.extra_env_vars())
 
         result = self.run_bench(command, env_vars)
-        (label, mean) = self.parse_output(result)
-        return [ Result(label=self.name(), value=mean, command=command, env=env_vars, stdout=result) ]
+        parsed_results = self.parse_output(result)
+        ret = []
+        for label, mean, unit in parsed_results:
+            extra_label = " CPU count" if parse_unit_type(unit) == "CPU count" else ""
+            ret.append(Result(label=self.name() + extra_label, value=mean, command=command, env=env_vars, stdout=result, unit=parse_unit_type(unit)))
+        return ret
 
     def parse_output(self, output):
         csv_file = io.StringIO(output)
         reader = csv.reader(csv_file)
         next(reader, None)
-        data_row = next(reader, None)
-        if data_row is None:
+        results = []
+        while True:
+            data_row = next(reader, None)
+            if data_row is None:
+                break
+            try:
+                label = data_row[0]
+                mean = float(data_row[1])
+                unit = data_row[7]
+                results.append((label, mean, unit))
+            except (ValueError, IndexError) as e:
+                raise ValueError(f"Error parsing output: {e}")
+        if len(results) == 0:
             raise ValueError("Benchmark output does not contain data.")
-        try:
-            label = data_row[0]
-            mean = float(data_row[1])
-            return (label, mean)
-        except (ValueError, IndexError) as e:
-            raise ValueError(f"Error parsing output: {e}")
+        return results
 
     def teardown(self):
         return
diff --git a/scripts/benchmarks/benches/llamacpp.py b/scripts/benchmarks/benches/llamacpp.py
index 3ff7963bd1..50dd8d04c6 100644
--- a/scripts/benchmarks/benches/llamacpp.py
+++ b/scripts/benchmarks/benches/llamacpp.py
@@ -138,9 +138,6 @@ def __init__(self, bench):
         self.bench = bench
         super().__init__(bench.directory)
 
-    def unit(self):
-        return "token/s"
-
     def setup(self):
         self.benchmark_bin = os.path.join(self.bench.build_path, 'bin', 'llama-bench')
 
@@ -171,7 +168,7 @@ def run(self, env_vars) -> list[Result]:
         for r in parsed:
             (extra_label, mean) = r
             label = f"{self.name()} {extra_label}"
-            results.append(Result(label=label, value=mean, command=command, env=env_vars, stdout=result))
+            results.append(Result(label=label, value=mean, command=command, env=env_vars, stdout=result, unit="token/s"))
         return results
 
     def parse_output(self, output):
diff --git a/scripts/benchmarks/benches/result.py b/scripts/benchmarks/benches/result.py
index 07ee70148a..7d40040607 100644
--- a/scripts/benchmarks/benches/result.py
+++ b/scripts/benchmarks/benches/result.py
@@ -17,8 +17,8 @@ class Result:
     env: str
     stdout: str
     passed: bool = True
-    # values should not be set by the benchmark
     unit: str = ""
+    # values should not be set by the benchmark
     name: str = ""
     lower_is_better: bool = True
     git_hash: str = ''
diff --git a/scripts/benchmarks/benches/syclbench.py b/scripts/benchmarks/benches/syclbench.py
index fbfd009935..588f3ce998 100644
--- a/scripts/benchmarks/benches/syclbench.py
+++ b/scripts/benchmarks/benches/syclbench.py
@@ -99,9 +99,6 @@ def bin_args(self) -> list[str]:
     def extra_env_vars(self) -> dict:
         return {}
 
-    def unit(self):
-        return "ms"
-
     def setup(self):
         self.benchmark_bin = os.path.join(self.directory, 'sycl-bench-build', self.bench_name)
 
@@ -134,7 +131,8 @@ def run(self, env_vars) -> list[Result]:
                             passed=(row[1]=="PASS"),
                             command=command,
                             env=env_vars,
-                            stdout=row))
+                            stdout=row,
+                            unit="ms"))
         self.done = True
         return res_list
 
diff --git a/scripts/benchmarks/benches/test.py b/scripts/benchmarks/benches/test.py
index 88bc29a649..802688f032 100644
--- a/scripts/benchmarks/benches/test.py
+++ b/scripts/benchmarks/benches/test.py
@@ -49,9 +49,6 @@ def __init__(self, name, value, diff):
     def name(self):
         return self.bname
 
-    def unit(self):
-        return "ms"
-
     def lower_is_better(self):
         return True
 
@@ -61,7 +58,7 @@ def setup(self):
     def run(self, env_vars) -> list[Result]:
         random_value = self.value + random.uniform(-1 * (self.diff), self.diff)
         return [
-            Result(label=self.name(), value=random_value, command="", env={"A": "B"}, stdout="no output")
+            Result(label=self.name(), value=random_value, command="", env={"A": "B"}, stdout="no output", unit="ms")
         ]
 
     def teardown(self):
diff --git a/scripts/benchmarks/benches/velocity.py b/scripts/benchmarks/benches/velocity.py
index 856fd993db..605cf03fd4 100644
--- a/scripts/benchmarks/benches/velocity.py
+++ b/scripts/benchmarks/benches/velocity.py
@@ -39,11 +39,12 @@ def benchmarks(self) -> list[Benchmark]:
         ]
 
 class VelocityBase(Benchmark):
-    def __init__(self, name: str, bin_name: str, vb: VelocityBench):
+    def __init__(self, name: str, bin_name: str, vb: VelocityBench, unit: str):
         super().__init__(vb.directory)
         self.vb = vb
         self.bench_name = name
         self.bin_name = bin_name
+        self.unit = unit
         self.code_path = os.path.join(self.vb.repo_path, self.bench_name, 'SYCL')
 
     def download_deps(self):
@@ -83,21 +84,18 @@ def run(self, env_vars) -> list[Result]:
 
         result = self.run_bench(command, env_vars)
 
-        return [ Result(label=self.name(), value=self.parse_output(result), command=command, env=env_vars, stdout=result) ]
+        return [ Result(label=self.name(), value=self.parse_output(result), command=command, env=env_vars, stdout=result, unit=self.unit) ]
 
     def teardown(self):
         return
 
 class Hashtable(VelocityBase):
     def __init__(self, vb: VelocityBench):
-        super().__init__("hashtable", "hashtable_sycl", vb)
+        super().__init__("hashtable", "hashtable_sycl", vb, "M keys/sec")
 
     def name(self):
         return "Velocity-Bench Hashtable"
 
-    def unit(self):
-        return "M keys/sec"
-
     def bin_args(self) -> list[str]:
         return ["--no-verify"]
 
@@ -114,15 +112,12 @@ def parse_output(self, stdout: str) -> float:
 
 class Bitcracker(VelocityBase):
     def __init__(self, vb: VelocityBench):
-        super().__init__("bitcracker", "bitcracker", vb)
+        super().__init__("bitcracker", "bitcracker", vb, "s")
         self.data_path = os.path.join(vb.repo_path, "bitcracker", "hash_pass")
 
     def name(self):
         return "Velocity-Bench Bitcracker"
 
-    def unit(self):
-        return "s"
-
     def bin_args(self) -> list[str]:
         return ["-f", f"{self.data_path}/img_win8_user_hash.txt",
                 "-d", f"{self.data_path}/user_passwords_60000.txt",
@@ -137,7 +132,7 @@ def parse_output(self, stdout: str) -> float:
 
 class SobelFilter(VelocityBase):
     def __init__(self, vb: VelocityBench):
-        super().__init__("sobel_filter", "sobel_filter", vb)
+        super().__init__("sobel_filter", "sobel_filter", vb, "ms")
 
     def download_deps(self):
         self.download("sobel_filter", "https://github.com/oneapi-src/Velocity-Bench/raw/main/sobel_filter/res/sobel_filter_data.tgz?download=", "sobel_filter_data.tgz", untar=True)
@@ -146,9 +141,6 @@ def download_deps(self):
     def name(self):
         return "Velocity-Bench Sobel Filter"
 
-    def unit(self):
-        return "ms"
-
     def bin_args(self) -> list[str]:
         return ["-i", f"{self.data_path}/sobel_filter_data/silverfalls_32Kx32K.png",
                 "-n", "5"]
@@ -166,7 +158,7 @@ def parse_output(self, stdout: str) -> float:
 
 class QuickSilver(VelocityBase):
     def __init__(self, vb: VelocityBench):
-        super().__init__("QuickSilver", "qs", vb)
+        super().__init__("QuickSilver", "qs", vb, "MMS/CTT")
         self.data_path = os.path.join(vb.repo_path, "QuickSilver", "Examples", "AllScattering")
 
     def run(self, env_vars) -> list[Result]:
@@ -179,9 +171,6 @@ def run(self, env_vars) -> list[Result]:
     def name(self):
         return "Velocity-Bench QuickSilver"
 
-    def unit(self):
-        return "MMS/CTT"
-
     def lower_is_better(self):
         return False
 
@@ -200,7 +189,7 @@ def parse_output(self, stdout: str) -> float:
 
 class Easywave(VelocityBase):
     def __init__(self, vb: VelocityBench):
-        super().__init__("easywave", "easyWave_sycl", vb)
+        super().__init__("easywave", "easyWave_sycl", vb, "ms")
 
     def download_deps(self):
         self.download("easywave", "https://git.gfz-potsdam.de/id2/geoperil/easyWave/-/raw/master/data/examples.tar.gz", "examples.tar.gz", untar=True)
@@ -208,9 +197,6 @@ def download_deps(self):
     def name(self):
         return "Velocity-Bench Easywave"
 
-    def unit(self):
-        return "ms"
-
     def bin_args(self) -> list[str]:
         return ["-grid", f"{self.data_path}/examples/e2Asean.grd",
                 "-source", f"{self.data_path}/examples/BengkuluSept2007.flt",
@@ -245,7 +231,7 @@ def parse_output(self, stdout: str) -> float:
 
 class CudaSift(VelocityBase):
     def __init__(self, vb: VelocityBench):
-        super().__init__("cudaSift", "cudaSift", vb)
+        super().__init__("cudaSift", "cudaSift", vb, "ms")
 
     def download_deps(self):
         images = os.path.join(self.vb.repo_path, self.bench_name, 'inputData')
@@ -256,9 +242,6 @@ def download_deps(self):
     def name(self):
         return "Velocity-Bench CudaSift"
 
-    def unit(self):
-        return "ms"
-
     def parse_output(self, stdout: str) -> float:
         match = re.search(r'Avg workload time = (\d+\.\d+) ms', stdout)
         if match:
diff --git a/scripts/benchmarks/main.py b/scripts/benchmarks/main.py
index 9dd77f14b2..c83825c9e5 100755
--- a/scripts/benchmarks/main.py
+++ b/scripts/benchmarks/main.py
@@ -73,7 +73,7 @@ def main(directory, additional_env_vars, save_name, compare_names, filter):
                 if bench_results is not None:
                     for bench_result in bench_results:
                         if bench_result.passed:
-                            print(f"complete ({bench_result.label}: {bench_result.value:.3f} {benchmark.unit()}).")
+                            print(f"complete ({bench_result.label}: {bench_result.value:.3f} {bench_result.unit}).")
                         else:
                             print(f"complete ({bench_result.label}: verification FAILED)")
                         iteration_results.append(bench_result)
@@ -91,7 +91,6 @@ def main(directory, additional_env_vars, save_name, compare_names, filter):
                     median_index = len(label_results) // 2
                     median_result = label_results[median_index]
 
-                    median_result.unit = benchmark.unit()
                     median_result.name = label
                     median_result.lower_is_better = benchmark.lower_is_better()
 

From 4a323ed755f0d1c4bad80a59f627da269ff9a8a2 Mon Sep 17 00:00:00 2001
From: Igor Chorazewicz <igor.chorazewicz@intel.com>
Date: Fri, 22 Nov 2024 18:03:23 +0000
Subject: [PATCH 08/36] [Benchmarks] add missing option to StreamMemory
 benchmark

---
 scripts/benchmarks/benches/compute.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/scripts/benchmarks/benches/compute.py b/scripts/benchmarks/benches/compute.py
index 74562b550c..f872399e9e 100644
--- a/scripts/benchmarks/benches/compute.py
+++ b/scripts/benchmarks/benches/compute.py
@@ -263,6 +263,7 @@ def bin_args(self) -> list[str]:
             f"--memoryPlacement={self.placement}",
             "--useEvents=0",
             "--contents=Zeros",
+            "--multiplier=1",
         ]
 
 class VectorSum(ComputeBenchmark):

From 31210f0431ca397d879395bad00292490f7f7f95 Mon Sep 17 00:00:00 2001
From: Igor Chorazewicz <igor.chorazewicz@intel.com>
Date: Mon, 25 Nov 2024 22:42:54 +0000
Subject: [PATCH 09/36] [Benchmarks] Add MemcpyExecute scenario with high ops
 count

to measure API overhead
---
 scripts/benchmarks/benches/compute.py | 25 ++++++++++++++-----------
 1 file changed, 14 insertions(+), 11 deletions(-)

diff --git a/scripts/benchmarks/benches/compute.py b/scripts/benchmarks/benches/compute.py
index 57bed7624a..8916f52f53 100644
--- a/scripts/benchmarks/benches/compute.py
+++ b/scripts/benchmarks/benches/compute.py
@@ -59,14 +59,16 @@ def benchmarks(self) -> list[Benchmark]:
             ExecImmediateCopyQueue(self, 0, 1, 'Device', 'Device', 1024),
             ExecImmediateCopyQueue(self, 1, 1, 'Device', 'Host', 1024),
             VectorSum(self),
-            MemcpyExecute(self, 400, 1, 102400, 10, 1, 1),
-            MemcpyExecute(self, 100, 8, 102400, 10, 1, 1),
-            MemcpyExecute(self, 400, 8, 1024, 1000, 1, 1),
-            MemcpyExecute(self, 10, 16, 1024, 10000, 1, 1),
-            MemcpyExecute(self, 400, 1, 102400, 10, 0, 1),
-            MemcpyExecute(self, 100, 8, 102400, 10, 0, 1),
-            MemcpyExecute(self, 400, 8, 1024, 1000, 0, 1),
-            MemcpyExecute(self, 10, 16, 1024, 10000, 0, 1),
+            MemcpyExecute(self, 400, 1, 102400, 10, 1, 1, 1),
+            MemcpyExecute(self, 100, 8, 102400, 10, 1, 1, 1),
+            MemcpyExecute(self, 400, 8, 1024, 1000, 1, 1, 1),
+            MemcpyExecute(self, 10, 16, 1024, 10000, 1, 1, 1),
+            MemcpyExecute(self, 400, 1, 102400, 10, 0, 1, 1),
+            MemcpyExecute(self, 100, 8, 102400, 10, 0, 1, 1),
+            MemcpyExecute(self, 400, 8, 1024, 1000, 0, 1, 1),
+            MemcpyExecute(self, 10, 16, 1024, 10000, 0, 1, 1),
+            MemcpyExecute(self, 4096, 1, 1024, 10, 0, 1, 0),
+            MemcpyExecute(self, 4096, 4, 1024, 10, 0, 1, 0),
         ]
 
         if options.ur is not None:
@@ -267,22 +269,23 @@ def bin_args(self) -> list[str]:
         ]
 
 class MemcpyExecute(ComputeBenchmark):
-    def __init__(self, bench, numOpsPerThread, numThreads, allocSize, iterations, srcUSM, dstUSM):
+    def __init__(self, bench, numOpsPerThread, numThreads, allocSize, iterations, srcUSM, dstUSM, useEvent):
         self.numOpsPerThread = numOpsPerThread
         self.numThreads = numThreads
         self.allocSize = allocSize
         self.iterations = iterations
         self.srcUSM = srcUSM
         self.dstUSM = dstUSM
+        self.useEvents = useEvent
         super().__init__(bench, "multithread_benchmark_ur", "MemcpyExecute")
 
     def name(self):
-        return f"multithread_benchmark_ur MemcpyExecute opsPerThread:{self.numOpsPerThread}, numThreads:{self.numThreads}, allocSize:{self.allocSize} srcUSM:{self.srcUSM} dstUSM:{self.dstUSM}"
+        return f"multithread_benchmark_ur MemcpyExecute opsPerThread:{self.numOpsPerThread}, numThreads:{self.numThreads}, allocSize:{self.allocSize} srcUSM:{self.srcUSM} dstUSM:{self.dstUSM}" + (" without events" if not self.useEvents else "")
 
     def bin_args(self) -> list[str]:
         return [
             "--Ioq=1",
-            "--UseEvents=1",
+            f"--UseEvents={self.useEvents}",
             "--MeasureCompletion=1",
             "--UseQueuePerThread=1",
             f"--AllocSize={self.allocSize}",

From 6e0bdeb93483c165f3f0e926f6b2cd3edc00d0a7 Mon Sep 17 00:00:00 2001
From: Ben Tracy <ben.tracy@codeplay.com>
Date: Wed, 20 Nov 2024 13:27:14 +0000
Subject: [PATCH 10/36] [CMDBUF] Implement kernel binary update for L0 adapter

- Implement binary update in L0 adapter
- Platform and driver query support for new extension functions
- Update command buffer documentation to specify arg requirements for binary update
---
 scripts/core/EXP-COMMAND-BUFFER.rst           |   5 +
 source/adapters/level_zero/command_buffer.cpp | 143 ++++++++++++++----
 source/adapters/level_zero/command_buffer.hpp |  21 ++-
 source/adapters/level_zero/device.cpp         |   4 +
 source/adapters/level_zero/platform.cpp       |  31 ++++
 source/adapters/level_zero/platform.hpp       |   6 +
 6 files changed, 173 insertions(+), 37 deletions(-)

diff --git a/scripts/core/EXP-COMMAND-BUFFER.rst b/scripts/core/EXP-COMMAND-BUFFER.rst
index d6ef76c7bc..2313afd036 100644
--- a/scripts/core/EXP-COMMAND-BUFFER.rst
+++ b/scripts/core/EXP-COMMAND-BUFFER.rst
@@ -256,6 +256,11 @@ ${x}CommandBufferAppendKernelLaunchExp. The command can then be updated
 to use the new kernel handle by passing it to
 ${x}CommandBufferUpdateKernelLaunchExp.
 
+.. important::
+    When updating the kernel handle of a command all required arguments to the
+    new kernel must be provided in the update descriptor. Failure to do so will
+    result in undefined behavior.
+
 .. parsed-literal::
 
     // Create a command-buffer with update enabled.
diff --git a/source/adapters/level_zero/command_buffer.cpp b/source/adapters/level_zero/command_buffer.cpp
index 56c53b5331..eccdc5e4d2 100644
--- a/source/adapters/level_zero/command_buffer.cpp
+++ b/source/adapters/level_zero/command_buffer.cpp
@@ -476,21 +476,14 @@ void ur_exp_command_buffer_handle_t_::cleanupCommandBufferResources() {
 
 ur_exp_command_buffer_command_handle_t_::
     ur_exp_command_buffer_command_handle_t_(
-        ur_exp_command_buffer_handle_t CommandBuffer, uint64_t CommandId,
-        uint32_t WorkDim, bool UserDefinedLocalSize,
-        ur_kernel_handle_t Kernel = nullptr)
-    : CommandBuffer(CommandBuffer), CommandId(CommandId), WorkDim(WorkDim),
-      UserDefinedLocalSize(UserDefinedLocalSize), Kernel(Kernel) {
+        ur_exp_command_buffer_handle_t CommandBuffer, uint64_t CommandId)
+    : CommandBuffer(CommandBuffer), CommandId(CommandId) {
   ur::level_zero::urCommandBufferRetainExp(CommandBuffer);
-  if (Kernel)
-    ur::level_zero::urKernelRetain(Kernel);
 }
 
 ur_exp_command_buffer_command_handle_t_::
     ~ur_exp_command_buffer_command_handle_t_() {
   ur::level_zero::urCommandBufferReleaseExp(CommandBuffer);
-  if (Kernel)
-    ur::level_zero::urKernelRelease(Kernel);
 }
 
 void ur_exp_command_buffer_handle_t_::registerSyncPoint(
@@ -527,6 +520,31 @@ ur_result_t ur_exp_command_buffer_handle_t_::getFenceForQueue(
   return UR_RESULT_SUCCESS;
 }
 
+kernel_command_handle::kernel_command_handle(
+    ur_exp_command_buffer_handle_t CommandBuffer, ur_kernel_handle_t Kernel,
+    uint64_t CommandId, uint32_t WorkDim, bool UserDefinedLocalSize,
+    uint32_t NumKernelAlternatives, ur_kernel_handle_t *KernelAlternatives)
+    : ur_exp_command_buffer_command_handle_t_(CommandBuffer, CommandId),
+      WorkDim(WorkDim), UserDefinedLocalSize(UserDefinedLocalSize),
+      Kernel(Kernel) {
+  // Add the default kernel to the list of valid kernels
+  ur::level_zero::urKernelRetain(Kernel);
+  ValidKernelHandles.insert(Kernel);
+  // Add alternative kernels if provided
+  if (KernelAlternatives) {
+    for (size_t i = 0; i < NumKernelAlternatives; i++) {
+      ur::level_zero::urKernelRetain(KernelAlternatives[i]);
+      ValidKernelHandles.insert(KernelAlternatives[i]);
+    }
+  }
+}
+
+kernel_command_handle::~kernel_command_handle() {
+  for (const ur_kernel_handle_t &KernelHandle : ValidKernelHandles) {
+    ur::level_zero::urKernelRelease(KernelHandle);
+  }
+}
+
 namespace ur::level_zero {
 
 /**
@@ -906,7 +924,8 @@ setKernelPendingArguments(ur_exp_command_buffer_handle_t CommandBuffer,
 ur_result_t
 createCommandHandle(ur_exp_command_buffer_handle_t CommandBuffer,
                     ur_kernel_handle_t Kernel, uint32_t WorkDim,
-                    const size_t *LocalWorkSize,
+                    const size_t *LocalWorkSize, uint32_t NumKernelAlternatives,
+                    ur_kernel_handle_t *KernelAlternatives,
                     ur_exp_command_buffer_command_handle_t &Command) {
 
   assert(CommandBuffer->IsUpdatable);
@@ -923,14 +942,41 @@ createCommandHandle(ur_exp_command_buffer_handle_t CommandBuffer,
                                ZE_MUTABLE_COMMAND_EXP_FLAG_GLOBAL_OFFSET;
 
   auto Platform = CommandBuffer->Context->getPlatform();
-  ZE2UR_CALL(Platform->ZeMutableCmdListExt.zexCommandListGetNextCommandIdExp,
-             (CommandBuffer->ZeComputeCommandListTranslated,
-              &ZeMutableCommandDesc, &CommandId));
+  if (NumKernelAlternatives > 0) {
+    ZeMutableCommandDesc.flags |=
+        ZE_MUTABLE_COMMAND_EXP_FLAG_KERNEL_INSTRUCTION;
+
+    std::vector<ze_kernel_handle_t> TranslatedKernelHandles(
+        NumKernelAlternatives + 1, nullptr);
+
+    // Translate main kernel first
+    ZE2UR_CALL(zelLoaderTranslateHandle,
+               (ZEL_HANDLE_KERNEL, Kernel->ZeKernel,
+                (void **)&TranslatedKernelHandles[0]));
+
+    for (size_t i = 0; i < NumKernelAlternatives; i++) {
+      ZE2UR_CALL(zelLoaderTranslateHandle,
+                 (ZEL_HANDLE_KERNEL, KernelAlternatives[i]->ZeKernel,
+                  (void **)&TranslatedKernelHandles[i + 1]));
+    }
+
+    ZE2UR_CALL(Platform->ZeMutableCmdListExt
+                   .zexCommandListGetNextCommandIdWithKernelsExp,
+               (CommandBuffer->ZeComputeCommandListTranslated,
+                &ZeMutableCommandDesc, NumKernelAlternatives + 1,
+                TranslatedKernelHandles.data(), &CommandId));
+
+  } else {
+    ZE2UR_CALL(Platform->ZeMutableCmdListExt.zexCommandListGetNextCommandIdExp,
+               (CommandBuffer->ZeComputeCommandListTranslated,
+                &ZeMutableCommandDesc, &CommandId));
+  }
   DEBUG_LOG(CommandId);
 
   try {
-    Command = new ur_exp_command_buffer_command_handle_t_(
-        CommandBuffer, CommandId, WorkDim, LocalWorkSize != nullptr, Kernel);
+    Command = new kernel_command_handle(
+        CommandBuffer, Kernel, CommandId, WorkDim, LocalWorkSize != nullptr,
+        NumKernelAlternatives, KernelAlternatives);
   } catch (const std::bad_alloc &) {
     return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY;
   } catch (...) {
@@ -944,8 +990,7 @@ ur_result_t urCommandBufferAppendKernelLaunchExp(
     ur_exp_command_buffer_handle_t CommandBuffer, ur_kernel_handle_t Kernel,
     uint32_t WorkDim, const size_t *GlobalWorkOffset,
     const size_t *GlobalWorkSize, const size_t *LocalWorkSize,
-    uint32_t /*numKernelAlternatives*/,
-    ur_kernel_handle_t * /*phKernelAlternatives*/,
+    uint32_t NumKernelAlternatives, ur_kernel_handle_t *KernelAlternatives,
     uint32_t NumSyncPointsInWaitList,
     const ur_exp_command_buffer_sync_point_t *SyncPointWaitList,
     uint32_t NumEventsInWaitList, const ur_event_handle_t *EventWaitList,
@@ -960,6 +1005,10 @@ ur_result_t urCommandBufferAppendKernelLaunchExp(
   UR_ASSERT(!(Command && !CommandBuffer->IsUpdatable),
             UR_RESULT_ERROR_INVALID_OPERATION);
 
+  for (uint32_t i = 0; i < NumKernelAlternatives; ++i) {
+    UR_ASSERT(KernelAlternatives[i] != Kernel, UR_RESULT_ERROR_INVALID_VALUE);
+  }
+
   // Lock automatically releases when this goes out of scope.
   std::scoped_lock<ur_shared_mutex, ur_shared_mutex, ur_shared_mutex> Lock(
       Kernel->Mutex, Kernel->Program->Mutex, CommandBuffer->Mutex);
@@ -983,18 +1032,21 @@ ur_result_t urCommandBufferAppendKernelLaunchExp(
   ZE2UR_CALL(zeKernelSetGroupSize, (Kernel->ZeKernel, WG[0], WG[1], WG[2]));
 
   CommandBuffer->KernelsList.push_back(Kernel);
+  for (size_t i = 0; i < NumKernelAlternatives; i++) {
+    CommandBuffer->KernelsList.push_back(KernelAlternatives[i]);
+  }
 
-  // Increment the reference count of the Kernel and indicate that the Kernel
-  // is in use. Once the event has been signaled, the code in
-  // CleanupCompletedEvent(Event) will do a urKernelRelease to update the
-  // reference count on the kernel, using the kernel saved in CommandData.
-  UR_CALL(ur::level_zero::urKernelRetain(Kernel));
+  ur::level_zero::urKernelRetain(Kernel);
+  // Retain alternative kernels if provided
+  for (size_t i = 0; i < NumKernelAlternatives; i++) {
+    ur::level_zero::urKernelRetain(KernelAlternatives[i]);
+  }
 
   if (Command) {
     UR_CALL(createCommandHandle(CommandBuffer, Kernel, WorkDim, LocalWorkSize,
+                                NumKernelAlternatives, KernelAlternatives,
                                 *Command));
   }
-
   std::vector<ze_event_handle_t> ZeEventList;
   ze_event_handle_t ZeLaunchEvent = nullptr;
   UR_CALL(createSyncPointAndGetZeEvents(
@@ -1690,7 +1742,7 @@ ur_result_t urCommandBufferReleaseCommandExp(
  * @return UR_RESULT_SUCCESS or an error code on failure
  */
 ur_result_t validateCommandDesc(
-    ur_exp_command_buffer_command_handle_t Command,
+    kernel_command_handle *Command,
     const ur_exp_command_buffer_update_kernel_launch_desc_t *CommandDesc) {
 
   auto CommandBuffer = Command->CommandBuffer;
@@ -1699,9 +1751,14 @@ ur_result_t validateCommandDesc(
           ->mutableCommandFlags;
   logger::debug("Mutable features supported by device {}", SupportedFeatures);
 
-  // Kernel handle updates are not yet supported.
-  if (CommandDesc->hNewKernel && CommandDesc->hNewKernel != Command->Kernel) {
-    return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
+  UR_ASSERT(
+      !CommandDesc->hNewKernel ||
+          (SupportedFeatures & ZE_MUTABLE_COMMAND_EXP_FLAG_KERNEL_INSTRUCTION),
+      UR_RESULT_ERROR_UNSUPPORTED_FEATURE);
+  // Check if the provided new kernel is in the list of valid alternatives.
+  if (CommandDesc->hNewKernel &&
+      !Command->ValidKernelHandles.count(CommandDesc->hNewKernel)) {
+    return UR_RESULT_ERROR_INVALID_VALUE;
   }
 
   if (CommandDesc->newWorkDim != Command->WorkDim &&
@@ -1754,7 +1811,7 @@ ur_result_t validateCommandDesc(
  * @return UR_RESULT_SUCCESS or an error code on failure
  */
 ur_result_t updateKernelCommand(
-    ur_exp_command_buffer_command_handle_t Command,
+    kernel_command_handle *Command,
     const ur_exp_command_buffer_update_kernel_launch_desc_t *CommandDesc) {
 
   // We need the created descriptors to live till the point when
@@ -1769,12 +1826,29 @@ ur_result_t updateKernelCommand(
 
   const auto CommandBuffer = Command->CommandBuffer;
   const void *NextDesc = nullptr;
+  auto Platform = CommandBuffer->Context->getPlatform();
 
   uint32_t Dim = CommandDesc->newWorkDim;
   size_t *NewGlobalWorkOffset = CommandDesc->pNewGlobalWorkOffset;
   size_t *NewLocalWorkSize = CommandDesc->pNewLocalWorkSize;
   size_t *NewGlobalWorkSize = CommandDesc->pNewGlobalWorkSize;
 
+  // Kernel handle must be updated first for a given CommandId if required
+  ur_kernel_handle_t NewKernel = CommandDesc->hNewKernel;
+  if (NewKernel && Command->Kernel != NewKernel) {
+    ze_kernel_handle_t ZeKernelTranslated = nullptr;
+    ZE2UR_CALL(
+        zelLoaderTranslateHandle,
+        (ZEL_HANDLE_KERNEL, NewKernel->ZeKernel, (void **)&ZeKernelTranslated));
+
+    ZE2UR_CALL(Platform->ZeMutableCmdListExt
+                   .zexCommandListUpdateMutableCommandKernelsExp,
+               (CommandBuffer->ZeComputeCommandListTranslated, 1,
+                &Command->CommandId, &ZeKernelTranslated));
+    // Set current kernel to be the new kernel
+    Command->Kernel = NewKernel;
+  }
+
   // Check if a new global offset is provided.
   if (NewGlobalWorkOffset && Dim > 0) {
     auto MutableGroupOffestDesc =
@@ -1973,7 +2047,6 @@ ur_result_t updateKernelCommand(
   MutableCommandDesc.pNext = NextDesc;
   MutableCommandDesc.flags = 0;
 
-  auto Platform = CommandBuffer->Context->getPlatform();
   ZE2UR_CALL(
       Platform->ZeMutableCmdListExt.zexCommandListUpdateMutableCommandsExp,
       (CommandBuffer->ZeComputeCommandListTranslated, &MutableCommandDesc));
@@ -2009,18 +2082,22 @@ ur_result_t urCommandBufferUpdateKernelLaunchExp(
     const ur_exp_command_buffer_update_kernel_launch_desc_t *CommandDesc) {
   UR_ASSERT(Command->CommandBuffer->IsUpdatable,
             UR_RESULT_ERROR_INVALID_OPERATION);
-  UR_ASSERT(Command->Kernel, UR_RESULT_ERROR_INVALID_NULL_HANDLE);
+
+  auto KernelCommandHandle = static_cast<kernel_command_handle *>(Command);
+
+  UR_ASSERT(KernelCommandHandle->Kernel, UR_RESULT_ERROR_INVALID_NULL_HANDLE);
 
   // Lock command, kernel and command buffer for update.
   std::scoped_lock<ur_shared_mutex, ur_shared_mutex, ur_shared_mutex> Guard(
-      Command->Mutex, Command->CommandBuffer->Mutex, Command->Kernel->Mutex);
+      Command->Mutex, Command->CommandBuffer->Mutex,
+      KernelCommandHandle->Kernel->Mutex);
 
   UR_ASSERT(Command->CommandBuffer->IsFinalized,
             UR_RESULT_ERROR_INVALID_OPERATION);
 
-  UR_CALL(validateCommandDesc(Command, CommandDesc));
+  UR_CALL(validateCommandDesc(KernelCommandHandle, CommandDesc));
   UR_CALL(waitForOngoingExecution(Command->CommandBuffer));
-  UR_CALL(updateKernelCommand(Command, CommandDesc));
+  UR_CALL(updateKernelCommand(KernelCommandHandle, CommandDesc));
 
   ZE2UR_CALL(zeCommandListClose,
              (Command->CommandBuffer->ZeComputeCommandList));
diff --git a/source/adapters/level_zero/command_buffer.hpp b/source/adapters/level_zero/command_buffer.hpp
index 156e0e5c24..d069f301fb 100644
--- a/source/adapters/level_zero/command_buffer.hpp
+++ b/source/adapters/level_zero/command_buffer.hpp
@@ -145,18 +145,31 @@ struct ur_exp_command_buffer_handle_t_ : public _ur_object {
 
 struct ur_exp_command_buffer_command_handle_t_ : public _ur_object {
   ur_exp_command_buffer_command_handle_t_(ur_exp_command_buffer_handle_t,
-                                          uint64_t, uint32_t, bool,
-                                          ur_kernel_handle_t);
+                                          uint64_t);
 
-  ~ur_exp_command_buffer_command_handle_t_();
+  virtual ~ur_exp_command_buffer_command_handle_t_();
 
   // Command-buffer of this command.
   ur_exp_command_buffer_handle_t CommandBuffer;
-
+  // L0 command ID identifying this command
   uint64_t CommandId;
+};
+
+struct kernel_command_handle : public ur_exp_command_buffer_command_handle_t_ {
+  kernel_command_handle(ur_exp_command_buffer_handle_t CommandBuffer,
+                        ur_kernel_handle_t Kernel, uint64_t CommandId,
+                        uint32_t WorkDim, bool UserDefinedLocalSize,
+                        uint32_t NumKernelAlternatives,
+                        ur_kernel_handle_t *KernelAlternatives);
+
+  ~kernel_command_handle();
+
   // Work-dimension the command was originally created with.
   uint32_t WorkDim;
   // Set to true if the user set the local work size on command creation.
   bool UserDefinedLocalSize;
+  // Currently active kernel handle
   ur_kernel_handle_t Kernel;
+  // Storage for valid kernel alternatives for this command.
+  std::unordered_set<ur_kernel_handle_t> ValidKernelHandles;
 };
diff --git a/source/adapters/level_zero/device.cpp b/source/adapters/level_zero/device.cpp
index 865edebc08..3997c837f8 100644
--- a/source/adapters/level_zero/device.cpp
+++ b/source/adapters/level_zero/device.cpp
@@ -1048,6 +1048,10 @@ ur_result_t urDeviceGetInfo(
       UpdateCapabilities |=
           UR_DEVICE_COMMAND_BUFFER_UPDATE_CAPABILITY_FLAG_GLOBAL_WORK_OFFSET;
     }
+    if (supportsFlags(ZE_MUTABLE_COMMAND_EXP_FLAG_KERNEL_INSTRUCTION)) {
+      UpdateCapabilities |=
+          UR_DEVICE_COMMAND_BUFFER_UPDATE_CAPABILITY_FLAG_KERNEL_HANDLE;
+    }
     return ReturnValue(UpdateCapabilities);
   }
   case UR_DEVICE_INFO_COMMAND_BUFFER_EVENT_SUPPORT_EXP:
diff --git a/source/adapters/level_zero/platform.cpp b/source/adapters/level_zero/platform.cpp
index 520b52a1c8..0237b62863 100644
--- a/source/adapters/level_zero/platform.cpp
+++ b/source/adapters/level_zero/platform.cpp
@@ -319,6 +319,22 @@ ur_result_t ur_platform_handle_t_::initialize() {
     ZeMutableCmdListExt.Supported |=
         ZeMutableCmdListExt.zexCommandListUpdateMutableCommandWaitEventsExp !=
         nullptr;
+    ZeMutableCmdListExt.zexCommandListUpdateMutableCommandKernelsExp =
+        (ze_pfnCommandListUpdateMutableCommandKernelsExp_t)
+            ur_loader::LibLoader::getFunctionPtr(
+                GlobalAdapter->processHandle,
+                "zeCommandListUpdateMutableCommandKernelsExp");
+    ZeMutableCmdListExt.Supported |=
+        ZeMutableCmdListExt.zexCommandListUpdateMutableCommandKernelsExp !=
+        nullptr;
+    ZeMutableCmdListExt.zexCommandListGetNextCommandIdWithKernelsExp =
+        (ze_pfnCommandListGetNextCommandIdWithKernelsExp_t)
+            ur_loader::LibLoader::getFunctionPtr(
+                GlobalAdapter->processHandle,
+                "zeCommandListGetNextCommandIdWithKernelsExp");
+    ZeMutableCmdListExt.Supported |=
+        ZeMutableCmdListExt.zexCommandListGetNextCommandIdWithKernelsExp !=
+        nullptr;
   } else {
     ZeMutableCmdListExt.Supported |=
         (ZE_CALL_NOCHECK(
@@ -353,6 +369,21 @@ ur_result_t ur_platform_handle_t_::initialize() {
                   &ZeMutableCmdListExt
                        .zexCommandListUpdateMutableCommandWaitEventsExp))) ==
          0);
+    ZeMutableCmdListExt.Supported &=
+        (ZE_CALL_NOCHECK(
+             zeDriverGetExtensionFunctionAddress,
+             (ZeDriver, "zeCommandListUpdateMutableCommandKernelsExp",
+              reinterpret_cast<void **>(
+                  &ZeMutableCmdListExt
+                       .zexCommandListUpdateMutableCommandKernelsExp))) == 0);
+
+    ZeMutableCmdListExt.Supported &=
+        (ZE_CALL_NOCHECK(
+             zeDriverGetExtensionFunctionAddress,
+             (ZeDriver, "zeCommandListGetNextCommandIdWithKernelsExp",
+              reinterpret_cast<void **>(
+                  &ZeMutableCmdListExt
+                       .zexCommandListGetNextCommandIdWithKernelsExp))) == 0);
   }
   return UR_RESULT_SUCCESS;
 }
diff --git a/source/adapters/level_zero/platform.hpp b/source/adapters/level_zero/platform.hpp
index 413bb5c48a..468c602b10 100644
--- a/source/adapters/level_zero/platform.hpp
+++ b/source/adapters/level_zero/platform.hpp
@@ -107,5 +107,11 @@ struct ur_platform_handle_t_ : public _ur_platform {
     ze_result_t (*zexCommandListUpdateMutableCommandWaitEventsExp)(
         ze_command_list_handle_t, uint64_t, uint32_t,
         ze_event_handle_t *) = nullptr;
+    ze_result_t (*zexCommandListUpdateMutableCommandKernelsExp)(
+        ze_command_list_handle_t, uint32_t, uint64_t *,
+        ze_kernel_handle_t *) = nullptr;
+    ze_result_t (*zexCommandListGetNextCommandIdWithKernelsExp)(
+        ze_command_list_handle_t, const ze_mutable_command_id_exp_desc_t *,
+        uint32_t, ze_kernel_handle_t *, uint64_t *) = nullptr;
   } ZeMutableCmdListExt;
 };

From 3ed2dcfe7d746033c1539db3a22da60407cffb2d Mon Sep 17 00:00:00 2001
From: Ross Brunton <ross@codeplay.com>
Date: Thu, 21 Nov 2024 15:45:10 +0000
Subject: [PATCH 11/36] Disable CFI checks when ASAN is specified

This causes linker errors, and presumably isn't meaningful anyway.

Closes: #2323
---
 .github/workflows/build-fuzz-reusable.yml | 3 ---
 cmake/helpers.cmake                       | 6 ++++++
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/build-fuzz-reusable.yml b/.github/workflows/build-fuzz-reusable.yml
index 0c7da5d478..2cbd1b87ff 100644
--- a/.github/workflows/build-fuzz-reusable.yml
+++ b/.github/workflows/build-fuzz-reusable.yml
@@ -47,8 +47,6 @@ jobs:
         cmake --build build -j $(nproc)
 
     - name: Configure CMake
-      # CFI sanitization (or flto?) seems to cause linking to fail
-      # https://github.com/oneapi-src/unified-runtime/issues/2323
       run: >
         cmake
         -B${{github.workspace}}/build
@@ -60,7 +58,6 @@ jobs:
         -DUR_USE_ASAN=ON
         -DUR_USE_UBSAN=ON
         -DUR_BUILD_ADAPTER_L0=ON
-        -DUR_USE_CFI=OFF
         -DUR_LEVEL_ZERO_LOADER_LIBRARY=${{github.workspace}}/level-zero/build/lib/libze_loader.so
         -DUR_LEVEL_ZERO_INCLUDE_DIR=${{github.workspace}}/level-zero/include/
         -DUR_DPCXX=${{github.workspace}}/dpcpp_compiler/bin/clang++
diff --git a/cmake/helpers.cmake b/cmake/helpers.cmake
index d3c8a1aa85..8cc5353609 100644
--- a/cmake/helpers.cmake
+++ b/cmake/helpers.cmake
@@ -63,6 +63,12 @@ if(CMAKE_SYSTEM_NAME STREQUAL Linux)
     check_cxx_compiler_flag("-fstack-clash-protection" CXX_HAS_FSTACK_CLASH_PROTECTION)
 endif()
 
+if (UR_USE_CFI AND UR_USE_ASAN)
+    message(WARNING "Both UR_USE_CFI and UR_USE_ASAN are ON. "
+        "Due to build errors, this is unsupported; CFI checks will be disabled")
+    set(UR_USE_CFI OFF)
+endif()
+
 if (UR_USE_CFI)
     set(SAVED_CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS})
     set(CMAKE_REQUIRED_FLAGS "-flto -fvisibility=hidden")

From 0eb340c6b4d900919e02ee911f3e069788f21ff8 Mon Sep 17 00:00:00 2001
From: Ross Brunton <ross@codeplay.com>
Date: Tue, 26 Nov 2024 16:25:47 +0000
Subject: [PATCH 12/36] Failing e2e tests no longer kill the whole build

---
 .github/workflows/e2e_core.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/e2e_core.yml b/.github/workflows/e2e_core.yml
index 32b8d58e7a..f12913c648 100644
--- a/.github/workflows/e2e_core.yml
+++ b/.github/workflows/e2e_core.yml
@@ -190,7 +190,7 @@ jobs:
 
     - name: Run e2e tests
       id: tests
-      run: ninja -C build-e2e check-sycl-e2e
+      run: ninja -C build-e2e check-sycl-e2e || echo "e2e tests have failed. Ignoring failure."
 
     # FIXME: Requires pull-request: write permissions but this is only granted
     # on pull requests from forks if using pull_request_target workflow

From 6fca68d996a4476136791a70f6bdb5b07276a6b8 Mon Sep 17 00:00:00 2001
From: Artur Gainullin <artur.gainullin@intel.com>
Date: Mon, 28 Oct 2024 10:40:15 -0700
Subject: [PATCH 13/36] Define exp extension

---
 .../core/EXP-2D-BLOCK-ARRAY-CAPABILITIES.rst  | 62 +++++++++++++++++++
 .../core/exp-2d-block-array-capabilities.yml  | 36 +++++++++++
 2 files changed, 98 insertions(+)
 create mode 100644 scripts/core/EXP-2D-BLOCK-ARRAY-CAPABILITIES.rst
 create mode 100644 scripts/core/exp-2d-block-array-capabilities.yml

diff --git a/scripts/core/EXP-2D-BLOCK-ARRAY-CAPABILITIES.rst b/scripts/core/EXP-2D-BLOCK-ARRAY-CAPABILITIES.rst
new file mode 100644
index 0000000000..765b07cfb3
--- /dev/null
+++ b/scripts/core/EXP-2D-BLOCK-ARRAY-CAPABILITIES.rst
@@ -0,0 +1,62 @@
+<%
+    OneApi=tags['$OneApi']
+    x=tags['$x']
+    X=x.upper()
+%>
+
+.. _experimental-2D-block-array-capabilities:
+
+================================================================================
+2D Block Array Capabilities
+================================================================================
+
+.. warning::
+
+    Experimental features:
+
+    *   May be replaced, updated, or removed at any time.
+    *   Do not require maintaining API/ABI stability of their own additions over
+        time.
+    *   Do not require conformance testing of their own additions.
+
+
+Motivation
+--------------------------------------------------------------------------------
+Some Intel GPU devices support 2D block array operations which may be used to optimize applications on Intel GPUs.
+This extension provides a device descriptor which allows to query the 2D block array capabilities of a device.
+
+API
+--------------------------------------------------------------------------------
+
+Enums
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+* ${x}_device_info_t
+    * ${X}_DEVICE_INFO_2D_BLOCK_ARRAY_CAPABILITIES_EXP
+
+* ${x}_exp_device_2d_block_array_capability_flags_t
+    * ${X}_EXP_DEVICE_2D_BLOCK_ARRAY_CAPABILITY_FLAG_LOAD
+    * ${X}_EXP_DEVICE_2D_BLOCK_ARRAY_CAPABILITY_FLAG_STORE
+
+Changelog
+--------------------------------------------------------------------------------
+
++-----------+------------------------+
+| Revision  | Changes                |
++===========+========================+
+| 1.0       | Initial Draft          |
++-----------+------------------------+
+
+
+Support
+--------------------------------------------------------------------------------
+
+Adapters which support this experimental feature *must* return ${X}_RESULT_SUCCESS from
+the ${x}DeviceGetInfo call with the new ${X}_DEVICE_INFO_2D_BLOCK_ARRAY_CAPABILITIES_EXP
+device descriptor.
+
+
+Contributors
+--------------------------------------------------------------------------------
+
+* Artur Gainullin `artur.gainullin@intel.com <artur.gainullin@intel.com>`_
diff --git a/scripts/core/exp-2d-block-array-capabilities.yml b/scripts/core/exp-2d-block-array-capabilities.yml
new file mode 100644
index 0000000000..ec62c4bb96
--- /dev/null
+++ b/scripts/core/exp-2d-block-array-capabilities.yml
@@ -0,0 +1,36 @@
+#
+# Copyright (C) 2024 Intel Corporation
+#
+# Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions.
+# See LICENSE.TXT
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+# See YaML.md for syntax definition
+#
+--- #--------------------------------------------------------------------------
+type: header
+desc: "Intel $OneApi Unified Runtime Experimental device descriptor for querying Intel device 2D block array capabilities"
+ordinal: "99"
+--- #--------------------------------------------------------------------------
+type: enum
+extend: true
+typed_etors: true
+desc: "Extension enum to $x_device_info_t to query Intel device 2D block array capabilities."
+name: $x_device_info_t
+etors:
+    - name: 2D_BLOCK_ARRAY_CAPABILITIES_EXP
+      value: "0x2022"
+      desc: "[$x_exp_device_2d_block_array_capability_flags_t] return a bit-field of Intel GPU 2D block array capabilities"
+--- #--------------------------------------------------------------------------
+type: enum
+desc: "Intel GPU 2D block array capabilities"
+class: $xDevice
+name: $x_exp_device_2d_block_array_capability_flags_t
+etors:
+    - name: LOAD
+      desc: "Load instructions are supported"
+      value: "$X_BIT(0)"
+    - name: STORE
+      desc: "Store instructions are supported"
+      value: "$X_BIT(1)"
+

From 42c2b660a618a86f2fc5b10096dc1fbb5129468e Mon Sep 17 00:00:00 2001
From: "Gainullin, Artur" <artur.gainullin@intel.com>
Date: Tue, 19 Nov 2024 14:11:06 -0800
Subject: [PATCH 14/36] Genereate sources

---
 include/ur_api.h                              | 25 +++++-
 include/ur_print.h                            |  8 ++
 include/ur_print.hpp                          | 78 +++++++++++++++++++
 source/loader/layers/validation/ur_valddi.cpp |  2 +-
 source/loader/loader.def.in                   |  1 +
 source/loader/loader.map.in                   |  1 +
 source/loader/ur_libapi.cpp                   |  2 +-
 source/loader/ur_print.cpp                    |  8 ++
 source/ur_api.cpp                             |  2 +-
 tools/urinfo/urinfo.hpp                       |  3 +
 10 files changed, 126 insertions(+), 4 deletions(-)

diff --git a/include/ur_api.h b/include/ur_api.h
index 3205fcb207..eb8b07221c 100644
--- a/include/ur_api.h
+++ b/include/ur_api.h
@@ -1705,6 +1705,8 @@ typedef enum ur_device_info_t {
     UR_DEVICE_INFO_ENQUEUE_NATIVE_COMMAND_SUPPORT_EXP = 0x2020,      ///< [::ur_bool_t] returns true if the device supports enqueueing of native
                                                                      ///< work
     UR_DEVICE_INFO_LOW_POWER_EVENTS_EXP = 0x2021,                    ///< [::ur_bool_t] returns true if the device supports low-power events.
+    UR_DEVICE_INFO_2D_BLOCK_ARRAY_CAPABILITIES_EXP = 0x2022,         ///< [::ur_exp_device_2d_block_array_capability_flags_t] return a bit-field
+                                                                     ///< of Intel GPU 2D block array capabilities
     /// @cond
     UR_DEVICE_INFO_FORCE_UINT32 = 0x7fffffff
     /// @endcond
@@ -1730,7 +1732,7 @@ typedef enum ur_device_info_t {
 ///     - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE
 ///         + `NULL == hDevice`
 ///     - ::UR_RESULT_ERROR_INVALID_ENUMERATION
-///         + `::UR_DEVICE_INFO_LOW_POWER_EVENTS_EXP < propName`
+///         + `::UR_DEVICE_INFO_2D_BLOCK_ARRAY_CAPABILITIES_EXP < propName`
 ///     - ::UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION
 ///         + If `propName` is not supported by the adapter.
 ///     - ::UR_RESULT_ERROR_INVALID_SIZE
@@ -7428,6 +7430,27 @@ urEnqueueWriteHostPipe(
                                               ///< an element of the phEventWaitList array.
 );
 
+#if !defined(__GNUC__)
+#pragma endregion
+#endif
+// Intel 'oneAPI' Unified Runtime Experimental device descriptor for querying Intel device 2D block array capabilities
+#if !defined(__GNUC__)
+#pragma region 2d_block_array_capabilities_(experimental)
+#endif
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Intel GPU 2D block array capabilities
+typedef uint32_t ur_exp_device_2d_block_array_capability_flags_t;
+typedef enum ur_exp_device_2d_block_array_capability_flag_t {
+    UR_EXP_DEVICE_2D_BLOCK_ARRAY_CAPABILITY_FLAG_LOAD = UR_BIT(0),  ///< Load instructions are supported
+    UR_EXP_DEVICE_2D_BLOCK_ARRAY_CAPABILITY_FLAG_STORE = UR_BIT(1), ///< Store instructions are supported
+    /// @cond
+    UR_EXP_DEVICE_2D_BLOCK_ARRAY_CAPABILITY_FLAG_FORCE_UINT32 = 0x7fffffff
+    /// @endcond
+
+} ur_exp_device_2d_block_array_capability_flag_t;
+/// @brief Bit Mask for validating ur_exp_device_2d_block_array_capability_flags_t
+#define UR_EXP_DEVICE_2D_BLOCK_ARRAY_CAPABILITY_FLAGS_MASK 0xfffffffc
+
 #if !defined(__GNUC__)
 #pragma endregion
 #endif
diff --git a/include/ur_print.h b/include/ur_print.h
index 93597d232f..c2adb18067 100644
--- a/include/ur_print.h
+++ b/include/ur_print.h
@@ -874,6 +874,14 @@ UR_APIEXPORT ur_result_t UR_APICALL urPrintMapFlags(enum ur_map_flag_t value, ch
 ///         - `buff_size < out_size`
 UR_APIEXPORT ur_result_t UR_APICALL urPrintUsmMigrationFlags(enum ur_usm_migration_flag_t value, char *buffer, const size_t buff_size, size_t *out_size);
 
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Print ur_exp_device_2d_block_array_capability_flag_t enum
+/// @returns
+///     - ::UR_RESULT_SUCCESS
+///     - ::UR_RESULT_ERROR_INVALID_SIZE
+///         - `buff_size < out_size`
+UR_APIEXPORT ur_result_t UR_APICALL urPrintExpDevice_2dBlockArrayCapabilityFlags(enum ur_exp_device_2d_block_array_capability_flag_t value, char *buffer, const size_t buff_size, size_t *out_size);
+
 ///////////////////////////////////////////////////////////////////////////////
 /// @brief Print ur_exp_image_copy_flag_t enum
 /// @returns
diff --git a/include/ur_print.hpp b/include/ur_print.hpp
index 1d28b8eac0..8888a74f91 100644
--- a/include/ur_print.hpp
+++ b/include/ur_print.hpp
@@ -194,6 +194,9 @@ inline ur_result_t printFlag<ur_map_flag_t>(std::ostream &os, uint32_t flag);
 template <>
 inline ur_result_t printFlag<ur_usm_migration_flag_t>(std::ostream &os, uint32_t flag);
 
+template <>
+inline ur_result_t printFlag<ur_exp_device_2d_block_array_capability_flag_t>(std::ostream &os, uint32_t flag);
+
 template <>
 inline ur_result_t printFlag<ur_exp_image_copy_flag_t>(std::ostream &os, uint32_t flag);
 
@@ -328,6 +331,7 @@ inline std::ostream &operator<<(std::ostream &os, [[maybe_unused]] const struct
 inline std::ostream &operator<<(std::ostream &os, enum ur_execution_info_t value);
 inline std::ostream &operator<<(std::ostream &os, enum ur_map_flag_t value);
 inline std::ostream &operator<<(std::ostream &os, enum ur_usm_migration_flag_t value);
+inline std::ostream &operator<<(std::ostream &os, enum ur_exp_device_2d_block_array_capability_flag_t value);
 inline std::ostream &operator<<(std::ostream &os, enum ur_exp_image_copy_flag_t value);
 inline std::ostream &operator<<(std::ostream &os, enum ur_exp_sampler_cubemap_filter_mode_t value);
 inline std::ostream &operator<<(std::ostream &os, enum ur_exp_external_mem_type_t value);
@@ -2665,6 +2669,9 @@ inline std::ostream &operator<<(std::ostream &os, enum ur_device_info_t value) {
     case UR_DEVICE_INFO_LOW_POWER_EVENTS_EXP:
         os << "UR_DEVICE_INFO_LOW_POWER_EVENTS_EXP";
         break;
+    case UR_DEVICE_INFO_2D_BLOCK_ARRAY_CAPABILITIES_EXP:
+        os << "UR_DEVICE_INFO_2D_BLOCK_ARRAY_CAPABILITIES_EXP";
+        break;
     default:
         os << "unknown enumerator";
         break;
@@ -4472,6 +4479,19 @@ inline ur_result_t printTagged(std::ostream &os, const void *ptr, ur_device_info
 
         os << ")";
     } break;
+    case UR_DEVICE_INFO_2D_BLOCK_ARRAY_CAPABILITIES_EXP: {
+        const ur_exp_device_2d_block_array_capability_flags_t *tptr = (const ur_exp_device_2d_block_array_capability_flags_t *)ptr;
+        if (sizeof(ur_exp_device_2d_block_array_capability_flags_t) > size) {
+            os << "invalid size (is: " << size << ", expected: >=" << sizeof(ur_exp_device_2d_block_array_capability_flags_t) << ")";
+            return UR_RESULT_ERROR_INVALID_SIZE;
+        }
+        os << (const void *)(tptr) << " (";
+
+        ur::details::printFlag<ur_exp_device_2d_block_array_capability_flag_t>(os,
+                                                                               *tptr);
+
+        os << ")";
+    } break;
     default:
         os << "unknown enumerator";
         return UR_RESULT_ERROR_INVALID_ENUMERATION;
@@ -9455,6 +9475,64 @@ inline ur_result_t printFlag<ur_usm_migration_flag_t>(std::ostream &os, uint32_t
 }
 } // namespace ur::details
 ///////////////////////////////////////////////////////////////////////////////
+/// @brief Print operator for the ur_exp_device_2d_block_array_capability_flag_t type
+/// @returns
+///     std::ostream &
+inline std::ostream &operator<<(std::ostream &os, enum ur_exp_device_2d_block_array_capability_flag_t value) {
+    switch (value) {
+    case UR_EXP_DEVICE_2D_BLOCK_ARRAY_CAPABILITY_FLAG_LOAD:
+        os << "UR_EXP_DEVICE_2D_BLOCK_ARRAY_CAPABILITY_FLAG_LOAD";
+        break;
+    case UR_EXP_DEVICE_2D_BLOCK_ARRAY_CAPABILITY_FLAG_STORE:
+        os << "UR_EXP_DEVICE_2D_BLOCK_ARRAY_CAPABILITY_FLAG_STORE";
+        break;
+    default:
+        os << "unknown enumerator";
+        break;
+    }
+    return os;
+}
+
+namespace ur::details {
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Print ur_exp_device_2d_block_array_capability_flag_t flag
+template <>
+inline ur_result_t printFlag<ur_exp_device_2d_block_array_capability_flag_t>(std::ostream &os, uint32_t flag) {
+    uint32_t val = flag;
+    bool first = true;
+
+    if ((val & UR_EXP_DEVICE_2D_BLOCK_ARRAY_CAPABILITY_FLAG_LOAD) == (uint32_t)UR_EXP_DEVICE_2D_BLOCK_ARRAY_CAPABILITY_FLAG_LOAD) {
+        val ^= (uint32_t)UR_EXP_DEVICE_2D_BLOCK_ARRAY_CAPABILITY_FLAG_LOAD;
+        if (!first) {
+            os << " | ";
+        } else {
+            first = false;
+        }
+        os << UR_EXP_DEVICE_2D_BLOCK_ARRAY_CAPABILITY_FLAG_LOAD;
+    }
+
+    if ((val & UR_EXP_DEVICE_2D_BLOCK_ARRAY_CAPABILITY_FLAG_STORE) == (uint32_t)UR_EXP_DEVICE_2D_BLOCK_ARRAY_CAPABILITY_FLAG_STORE) {
+        val ^= (uint32_t)UR_EXP_DEVICE_2D_BLOCK_ARRAY_CAPABILITY_FLAG_STORE;
+        if (!first) {
+            os << " | ";
+        } else {
+            first = false;
+        }
+        os << UR_EXP_DEVICE_2D_BLOCK_ARRAY_CAPABILITY_FLAG_STORE;
+    }
+    if (val != 0) {
+        std::bitset<32> bits(val);
+        if (!first) {
+            os << " | ";
+        }
+        os << "unknown bit flags " << bits;
+    } else if (first) {
+        os << "0";
+    }
+    return UR_RESULT_SUCCESS;
+}
+} // namespace ur::details
+///////////////////////////////////////////////////////////////////////////////
 /// @brief Print operator for the ur_exp_image_copy_flag_t type
 /// @returns
 ///     std::ostream &
diff --git a/source/loader/layers/validation/ur_valddi.cpp b/source/loader/layers/validation/ur_valddi.cpp
index 195c1d3c69..b3969de10f 100644
--- a/source/loader/layers/validation/ur_valddi.cpp
+++ b/source/loader/layers/validation/ur_valddi.cpp
@@ -518,7 +518,7 @@ __urdlllocal ur_result_t UR_APICALL urDeviceGetInfo(
             return UR_RESULT_ERROR_INVALID_NULL_POINTER;
         }
 
-        if (UR_DEVICE_INFO_LOW_POWER_EVENTS_EXP < propName) {
+        if (UR_DEVICE_INFO_2D_BLOCK_ARRAY_CAPABILITIES_EXP < propName) {
             return UR_RESULT_ERROR_INVALID_ENUMERATION;
         }
 
diff --git a/source/loader/loader.def.in b/source/loader/loader.def.in
index b5c3bde6ea..a336da153d 100644
--- a/source/loader/loader.def.in
+++ b/source/loader/loader.def.in
@@ -312,6 +312,7 @@ EXPORTS
 	urPrintExpCommandBufferUpdateMemobjArgDesc
 	urPrintExpCommandBufferUpdatePointerArgDesc
 	urPrintExpCommandBufferUpdateValueArgDesc
+	urPrintExpDevice_2dBlockArrayCapabilityFlags
 	urPrintExpEnqueueExtFlags
 	urPrintExpEnqueueExtProperties
 	urPrintExpEnqueueNativeCommandFlags
diff --git a/source/loader/loader.map.in b/source/loader/loader.map.in
index 778a5da065..59a8a8d107 100644
--- a/source/loader/loader.map.in
+++ b/source/loader/loader.map.in
@@ -312,6 +312,7 @@
 		urPrintExpCommandBufferUpdateMemobjArgDesc;
 		urPrintExpCommandBufferUpdatePointerArgDesc;
 		urPrintExpCommandBufferUpdateValueArgDesc;
+		urPrintExpDevice_2dBlockArrayCapabilityFlags;
 		urPrintExpEnqueueExtFlags;
 		urPrintExpEnqueueExtProperties;
 		urPrintExpEnqueueNativeCommandFlags;
diff --git a/source/loader/ur_libapi.cpp b/source/loader/ur_libapi.cpp
index 8dca26d4ba..3340363737 100644
--- a/source/loader/ur_libapi.cpp
+++ b/source/loader/ur_libapi.cpp
@@ -870,7 +870,7 @@ ur_result_t UR_APICALL urDeviceGetSelected(
 ///     - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE
 ///         + `NULL == hDevice`
 ///     - ::UR_RESULT_ERROR_INVALID_ENUMERATION
-///         + `::UR_DEVICE_INFO_LOW_POWER_EVENTS_EXP < propName`
+///         + `::UR_DEVICE_INFO_2D_BLOCK_ARRAY_CAPABILITIES_EXP < propName`
 ///     - ::UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION
 ///         + If `propName` is not supported by the adapter.
 ///     - ::UR_RESULT_ERROR_INVALID_SIZE
diff --git a/source/loader/ur_print.cpp b/source/loader/ur_print.cpp
index d8206edb3f..6b1cbfd5ee 100644
--- a/source/loader/ur_print.cpp
+++ b/source/loader/ur_print.cpp
@@ -879,6 +879,14 @@ ur_result_t urPrintUsmMigrationFlags(enum ur_usm_migration_flag_t value,
     return str_copy(&ss, buffer, buff_size, out_size);
 }
 
+ur_result_t urPrintExpDevice_2dBlockArrayCapabilityFlags(
+    enum ur_exp_device_2d_block_array_capability_flag_t value, char *buffer,
+    const size_t buff_size, size_t *out_size) {
+    std::stringstream ss;
+    ss << value;
+    return str_copy(&ss, buffer, buff_size, out_size);
+}
+
 ur_result_t urPrintExpImageCopyFlags(enum ur_exp_image_copy_flag_t value,
                                      char *buffer, const size_t buff_size,
                                      size_t *out_size) {
diff --git a/source/ur_api.cpp b/source/ur_api.cpp
index 22c76f122e..853d61472e 100644
--- a/source/ur_api.cpp
+++ b/source/ur_api.cpp
@@ -771,7 +771,7 @@ ur_result_t UR_APICALL urDeviceGetSelected(
 ///     - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE
 ///         + `NULL == hDevice`
 ///     - ::UR_RESULT_ERROR_INVALID_ENUMERATION
-///         + `::UR_DEVICE_INFO_LOW_POWER_EVENTS_EXP < propName`
+///         + `::UR_DEVICE_INFO_2D_BLOCK_ARRAY_CAPABILITIES_EXP < propName`
 ///     - ::UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION
 ///         + If `propName` is not supported by the adapter.
 ///     - ::UR_RESULT_ERROR_INVALID_SIZE
diff --git a/tools/urinfo/urinfo.hpp b/tools/urinfo/urinfo.hpp
index ee7fe52834..37c7a80328 100644
--- a/tools/urinfo/urinfo.hpp
+++ b/tools/urinfo/urinfo.hpp
@@ -420,5 +420,8 @@ inline void printDeviceInfos(ur_device_handle_t hDevice,
         hDevice, UR_DEVICE_INFO_ENQUEUE_NATIVE_COMMAND_SUPPORT_EXP);
     std::cout << prefix;
     printDeviceInfo<ur_bool_t>(hDevice, UR_DEVICE_INFO_LOW_POWER_EVENTS_EXP);
+    std::cout << prefix;
+    printDeviceInfo<ur_exp_device_2d_block_array_capability_flags_t>(
+        hDevice, UR_DEVICE_INFO_2D_BLOCK_ARRAY_CAPABILITIES_EXP);
 }
 } // namespace urinfo

From 66025f0d1ed3d0be5d76948a33c37a6cc4b5e282 Mon Sep 17 00:00:00 2001
From: Artur Gainullin <artur.gainullin@intel.com>
Date: Thu, 24 Oct 2024 14:48:12 -0700
Subject: [PATCH 15/36] Add implementation

---
 cmake/FetchLevelZero.cmake                  | 27 +++++++++++++++-
 source/adapters/cuda/device.cpp             |  1 +
 source/adapters/hip/device.cpp              |  1 +
 source/adapters/level_zero/CMakeLists.txt   |  2 ++
 source/adapters/level_zero/common.cpp       |  9 ++++++
 source/adapters/level_zero/device.cpp       | 35 +++++++++++++++++++++
 source/adapters/level_zero/device.hpp       | 10 ++++--
 source/adapters/native_cpu/device.cpp       |  2 ++
 source/adapters/opencl/device.cpp           |  3 +-
 test/adapters/level_zero/CMakeLists.txt     |  1 +
 test/adapters/level_zero/v2/CMakeLists.txt  |  1 +
 test/conformance/device/urDeviceGetInfo.cpp |  7 +++--
 12 files changed, 92 insertions(+), 7 deletions(-)

diff --git a/cmake/FetchLevelZero.cmake b/cmake/FetchLevelZero.cmake
index 6d108c8a6f..3bc745f3d0 100644
--- a/cmake/FetchLevelZero.cmake
+++ b/cmake/FetchLevelZero.cmake
@@ -7,6 +7,8 @@ set(UR_LEVEL_ZERO_LOADER_LIBRARY "" CACHE FILEPATH "Path of the Level Zero Loade
 set(UR_LEVEL_ZERO_INCLUDE_DIR "" CACHE FILEPATH "Directory containing the Level Zero Headers")
 set(UR_LEVEL_ZERO_LOADER_REPO "" CACHE STRING "Github repo to get the Level Zero loader sources from")
 set(UR_LEVEL_ZERO_LOADER_TAG "" CACHE STRING " GIT tag of the Level Loader taken from github repo")
+set(UR_COMPUTE_RUNTIME_REPO "" CACHE STRING "Github repo to get the compute runtime sources from")
+set(UR_COMPUTE_RUNTIME_TAG "" CACHE STRING " GIT tag of the compute runtime taken from github repo")
 
 # Copy Level Zero loader/headers locally to the build to avoid leaking their path.
 set(LEVEL_ZERO_COPY_DIR ${CMAKE_CURRENT_BINARY_DIR}/level_zero_loader)
@@ -87,8 +89,31 @@ target_link_libraries(LevelZeroLoader
     INTERFACE "${LEVEL_ZERO_LIB_NAME}"
 )
 
+file(GLOB LEVEL_ZERO_LOADER_API_HEADERS "${LEVEL_ZERO_INCLUDE_DIR}/*.h")
+file(COPY ${LEVEL_ZERO_LOADER_API_HEADERS} DESTINATION ${LEVEL_ZERO_INCLUDE_DIR}/level_zero)
 add_library(LevelZeroLoader-Headers INTERFACE)
 target_include_directories(LevelZeroLoader-Headers
-    INTERFACE "$<BUILD_INTERFACE:${LEVEL_ZERO_INCLUDE_DIR}>"
+    INTERFACE "$<BUILD_INTERFACE:${LEVEL_ZERO_INCLUDE_DIR};${LEVEL_ZERO_INCLUDE_DIR}/level_zero>"
+              "$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>"
+)
+
+if (UR_COMPUTE_RUNTIME_REPO STREQUAL "")
+set(UR_COMPUTE_RUNTIME_REPO "https://github.com/intel/compute-runtime.git")
+endif()
+if (UR_COMPUTE_RUNTIME_TAG STREQUAL "")
+set(UR_COMPUTE_RUNTIME_TAG 24.39.31294.12)
+endif()
+include(FetchContent)
+# Sparse fetch only the dir with level zero headers to avoid pulling in the entire compute-runtime.
+FetchContentSparse_Declare(compute-runtime-level-zero-headers ${UR_COMPUTE_RUNTIME_REPO} "${UR_COMPUTE_RUNTIME_TAG}" "level_zero/include")
+FetchContent_GetProperties(compute-runtime-level-zero-headers)
+if(NOT compute-runtime-level-zero-headers_POPULATED)
+  FetchContent_Populate(compute-runtime-level-zero-headers)
+endif()
+add_library(ComputeRuntimeLevelZero-Headers INTERFACE)
+set(COMPUTE_RUNTIME_LEVEL_ZERO_INCLUDE "${compute-runtime-level-zero-headers_SOURCE_DIR}/../..")
+message(STATUS "Level Zero Adapter: Using Level Zero headers from ${COMPUTE_RUNTIME_LEVEL_ZERO_INCLUDE}")
+target_include_directories(ComputeRuntimeLevelZero-Headers
+    INTERFACE "$<BUILD_INTERFACE:${COMPUTE_RUNTIME_LEVEL_ZERO_INCLUDE}>"
               "$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>"
 )
diff --git a/source/adapters/cuda/device.cpp b/source/adapters/cuda/device.cpp
index cb6b757dd3..a6bdc64f41 100644
--- a/source/adapters/cuda/device.cpp
+++ b/source/adapters/cuda/device.cpp
@@ -1087,6 +1087,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice,
   case UR_DEVICE_INFO_GPU_SUBSLICES_PER_SLICE:
   case UR_DEVICE_INFO_GPU_EU_COUNT_PER_SUBSLICE:
   case UR_DEVICE_INFO_GPU_HW_THREADS_PER_EU:
+  case UR_DEVICE_INFO_2D_BLOCK_ARRAY_CAPABILITIES_EXP:
     return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION;
 
   case UR_DEVICE_INFO_COMMAND_BUFFER_SUPPORT_EXP:
diff --git a/source/adapters/hip/device.cpp b/source/adapters/hip/device.cpp
index 5271f73709..8601e8b914 100644
--- a/source/adapters/hip/device.cpp
+++ b/source/adapters/hip/device.cpp
@@ -904,6 +904,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice,
   case UR_DEVICE_INFO_BFLOAT16:
   case UR_DEVICE_INFO_IL_VERSION:
   case UR_DEVICE_INFO_ASYNC_BARRIER:
+  case UR_DEVICE_INFO_2D_BLOCK_ARRAY_CAPABILITIES_EXP:
     return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION;
   case UR_DEVICE_INFO_COMMAND_BUFFER_SUPPORT_EXP: {
     int DriverVersion = 0;
diff --git a/source/adapters/level_zero/CMakeLists.txt b/source/adapters/level_zero/CMakeLists.txt
index 6465ebaa51..8306436097 100644
--- a/source/adapters/level_zero/CMakeLists.txt
+++ b/source/adapters/level_zero/CMakeLists.txt
@@ -109,6 +109,7 @@ if(UR_BUILD_ADAPTER_L0)
         ${PROJECT_NAME}::umf
         LevelZeroLoader
         LevelZeroLoader-Headers
+        ComputeRuntimeLevelZero-Headers
     )
 
     target_include_directories(ur_adapter_level_zero PRIVATE
@@ -203,6 +204,7 @@ if(UR_BUILD_ADAPTER_L0_V2)
         ${PROJECT_NAME}::umf
         LevelZeroLoader
         LevelZeroLoader-Headers
+        ComputeRuntimeLevelZero-Headers
     )
 
     target_include_directories(ur_adapter_level_zero_v2 PRIVATE
diff --git a/source/adapters/level_zero/common.cpp b/source/adapters/level_zero/common.cpp
index f5d8b20014..da7f624013 100644
--- a/source/adapters/level_zero/common.cpp
+++ b/source/adapters/level_zero/common.cpp
@@ -11,6 +11,7 @@
 #include "common.hpp"
 #include "logger/ur_logger.hpp"
 #include "usm.hpp"
+#include <level_zero/include/ze_intel_gpu.h>
 
 ur_result_t ze2urResult(ze_result_t ZeResult) {
   if (ZeResult == ZE_RESULT_SUCCESS)
@@ -330,6 +331,14 @@ template <> zes_structure_type_t getZesStructureType<zes_mem_properties_t>() {
   return ZES_STRUCTURE_TYPE_MEM_PROPERTIES;
 }
 
+#ifdef ZE_INTEL_DEVICE_BLOCK_ARRAY_EXP_NAME
+template <>
+ze_structure_type_t
+getZeStructureType<ze_intel_device_block_array_exp_properties_t>() {
+  return ZE_INTEL_DEVICE_BLOCK_ARRAY_EXP_PROPERTIES;
+}
+#endif // ZE_INTEL_DEVICE_BLOCK_ARRAY_EXP_NAME
+
 // Global variables for ZER_EXT_RESULT_ADAPTER_SPECIFIC_ERROR
 thread_local ur_result_t ErrorMessageCode = UR_RESULT_SUCCESS;
 thread_local char ErrorMessage[MaxMessageSize];
diff --git a/source/adapters/level_zero/device.cpp b/source/adapters/level_zero/device.cpp
index 865edebc08..add3448e91 100644
--- a/source/adapters/level_zero/device.cpp
+++ b/source/adapters/level_zero/device.cpp
@@ -1153,6 +1153,30 @@ ur_result_t urDeviceGetInfo(
     return ReturnValue(true);
   case UR_DEVICE_INFO_LOW_POWER_EVENTS_EXP:
     return ReturnValue(false);
+  case UR_DEVICE_INFO_2D_BLOCK_ARRAY_CAPABILITIES_EXP: {
+#ifdef ZE_INTEL_DEVICE_BLOCK_ARRAY_EXP_NAME
+    const auto ZeDeviceBlockArrayFlags =
+        Device->ZeDeviceBlockArrayProperties->flags;
+
+    auto supportsFlags =
+        [&](ze_intel_device_block_array_exp_flags_t RequiredFlags) {
+          return (ZeDeviceBlockArrayFlags & RequiredFlags) == RequiredFlags;
+        };
+
+    ur_exp_device_2d_block_array_capability_flags_t BlockArrayCapabilities = 0;
+    if (supportsFlags(ZE_INTEL_DEVICE_EXP_FLAG_2D_BLOCK_LOAD)) {
+      BlockArrayCapabilities |=
+          UR_EXP_DEVICE_2D_BLOCK_ARRAY_CAPABILITY_FLAG_LOAD;
+    }
+    if (supportsFlags(ZE_INTEL_DEVICE_EXP_FLAG_2D_BLOCK_STORE)) {
+      BlockArrayCapabilities |=
+          UR_EXP_DEVICE_2D_BLOCK_ARRAY_CAPABILITY_FLAG_STORE;
+    }
+    return ReturnValue(BlockArrayCapabilities);
+#else
+    return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION;
+#endif
+  }
   default:
     logger::error("Unsupported ParamName in urGetDeviceInfo");
     logger::error("ParamNameParamName={}(0x{})", ParamName,
@@ -1584,6 +1608,17 @@ ur_result_t ur_device_handle_t_::initialize(int SubSubDeviceOrdinal,
         ZE_CALL_NOCHECK(zeDeviceGetProperties, (ZeDevice, &P));
       };
 
+#ifdef ZE_INTEL_DEVICE_BLOCK_ARRAY_EXP_NAME
+  ZeDeviceBlockArrayProperties.Compute =
+      [ZeDevice](
+          ZeStruct<ze_intel_device_block_array_exp_properties_t> &Properties) {
+        ze_device_properties_t P;
+        P.stype = ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES;
+        P.pNext = &Properties;
+        ZE_CALL_NOCHECK(zeDeviceGetProperties, (ZeDevice, &P));
+      };
+#endif // ZE_INTEL_DEVICE_BLOCK_ARRAY_EXP_NAME
+
   ImmCommandListUsed = this->useImmediateCommandLists();
 
   uint32_t numQueueGroups = 0;
diff --git a/source/adapters/level_zero/device.hpp b/source/adapters/level_zero/device.hpp
index 3d78a99b97..512a5ff714 100644
--- a/source/adapters/level_zero/device.hpp
+++ b/source/adapters/level_zero/device.hpp
@@ -18,14 +18,14 @@
 #include <unordered_map>
 #include <vector>
 
+#include "adapters/level_zero/platform.hpp"
+#include "common.hpp"
+#include <level_zero/include/ze_intel_gpu.h>
 #include <ur/ur.hpp>
 #include <ur_ddi.h>
 #include <ze_api.h>
 #include <zes_api.h>
 
-#include "adapters/level_zero/platform.hpp"
-#include "common.hpp"
-
 enum EventsScope {
   // All events are created host-visible.
   AllHostVisible,
@@ -224,6 +224,10 @@ struct ur_device_handle_t_ : _ur_object {
   ZeCache<struct ze_global_memsize> ZeGlobalMemSize;
   ZeCache<ZeStruct<ze_mutable_command_list_exp_properties_t>>
       ZeDeviceMutableCmdListsProperties;
+#ifdef ZE_INTEL_DEVICE_BLOCK_ARRAY_EXP_NAME
+  ZeCache<ZeStruct<ze_intel_device_block_array_exp_properties_t>>
+      ZeDeviceBlockArrayProperties;
+#endif // ZE_INTEL_DEVICE_BLOCK_ARRAY_EXP_NAME
 
   // Map device bindless image offset to corresponding host image handle.
   std::unordered_map<ur_exp_image_native_handle_t, ze_image_handle_t>
diff --git a/source/adapters/native_cpu/device.cpp b/source/adapters/native_cpu/device.cpp
index b7c454315f..036e002d21 100644
--- a/source/adapters/native_cpu/device.cpp
+++ b/source/adapters/native_cpu/device.cpp
@@ -366,6 +366,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice,
   case UR_DEVICE_INFO_MEMORY_CLOCK_RATE:
   case UR_DEVICE_INFO_MEMORY_BUS_WIDTH:
     return UR_RESULT_ERROR_INVALID_VALUE;
+  case UR_DEVICE_INFO_2D_BLOCK_ARRAY_CAPABILITIES_EXP:
+    return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION;
   case UR_DEVICE_INFO_ATOMIC_FENCE_ORDER_CAPABILITIES: {
     // Currently for Native CPU fences are implemented using OCK
     // builtins, so we have different capabilities than atomic operations
diff --git a/source/adapters/opencl/device.cpp b/source/adapters/opencl/device.cpp
index 70559eb52e..ce3ade309e 100644
--- a/source/adapters/opencl/device.cpp
+++ b/source/adapters/opencl/device.cpp
@@ -1093,7 +1093,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice,
   case UR_DEVICE_INFO_GLOBAL_MEM_FREE:
   case UR_DEVICE_INFO_MEMORY_CLOCK_RATE:
   case UR_DEVICE_INFO_MEMORY_BUS_WIDTH:
-  case UR_DEVICE_INFO_ASYNC_BARRIER: {
+  case UR_DEVICE_INFO_ASYNC_BARRIER:
+  case UR_DEVICE_INFO_2D_BLOCK_ARRAY_CAPABILITIES_EXP: {
     return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION;
   }
 
diff --git a/test/adapters/level_zero/CMakeLists.txt b/test/adapters/level_zero/CMakeLists.txt
index bfb02d37c2..8fe062b38b 100644
--- a/test/adapters/level_zero/CMakeLists.txt
+++ b/test/adapters/level_zero/CMakeLists.txt
@@ -29,6 +29,7 @@ if(UR_BUILD_ADAPTER_L0)
         target_link_libraries(test-adapter-level_zero PRIVATE
             LevelZeroLoader
             LevelZeroLoader-Headers
+            ComputeRuntimeLevelZero-Headers
         )
 
         target_include_directories(test-adapter-level_zero PRIVATE
diff --git a/test/adapters/level_zero/v2/CMakeLists.txt b/test/adapters/level_zero/v2/CMakeLists.txt
index f53cf15256..df6b43c443 100644
--- a/test/adapters/level_zero/v2/CMakeLists.txt
+++ b/test/adapters/level_zero/v2/CMakeLists.txt
@@ -25,6 +25,7 @@ function(add_unittest name)
         ${PROJECT_NAME}::umf
         LevelZeroLoader
         LevelZeroLoader-Headers
+        ComputeRuntimeLevelZero-Headers
     )
 endfunction()
 
diff --git a/test/conformance/device/urDeviceGetInfo.cpp b/test/conformance/device/urDeviceGetInfo.cpp
index e41cff97ed..23a2f7f237 100644
--- a/test/conformance/device/urDeviceGetInfo.cpp
+++ b/test/conformance/device/urDeviceGetInfo.cpp
@@ -115,7 +115,9 @@ static std::unordered_map<ur_device_info_t, size_t> device_info_size_map = {
     {UR_DEVICE_INFO_MAX_REGISTERS_PER_WORK_GROUP, sizeof(uint32_t)},
     {UR_DEVICE_INFO_COMPONENT_DEVICES, sizeof(uint32_t)},
     {UR_DEVICE_INFO_COMPOSITE_DEVICE, sizeof(ur_device_handle_t)},
-    {UR_DEVICE_INFO_USM_POOL_SUPPORT, sizeof(ur_bool_t)}};
+    {UR_DEVICE_INFO_USM_POOL_SUPPORT, sizeof(ur_bool_t)},
+    {UR_DEVICE_INFO_2D_BLOCK_ARRAY_CAPABILITIES_EXP,
+     sizeof(ur_exp_device_2d_block_array_capability_flags_t)}};
 
 struct urDeviceGetInfoTest : uur::urAllDevicesTest,
                              ::testing::WithParamInterface<ur_device_info_t> {
@@ -237,7 +239,8 @@ INSTANTIATE_TEST_SUITE_P(
         UR_DEVICE_INFO_MAX_REGISTERS_PER_WORK_GROUP,           //
         UR_DEVICE_INFO_VIRTUAL_MEMORY_SUPPORT,                 //
         UR_DEVICE_INFO_KERNEL_SET_SPECIALIZATION_CONSTANTS,    //
-        UR_DEVICE_INFO_USM_POOL_SUPPORT                        //
+        UR_DEVICE_INFO_USM_POOL_SUPPORT,                       //
+        UR_DEVICE_INFO_2D_BLOCK_ARRAY_CAPABILITIES_EXP         //
         ),
     [](const ::testing::TestParamInfo<ur_device_info_t> &info) {
         std::stringstream ss;

From a1a3a43ec562470d4c0bd347f9047c2a2320ed23 Mon Sep 17 00:00:00 2001
From: "Gainullin, Artur" <artur.gainullin@intel.com>
Date: Thu, 21 Nov 2024 10:00:15 -0800
Subject: [PATCH 16/36] OpenCL adapter implementation and update other adapters

---
 source/adapters/cuda/device.cpp       |  5 +++--
 source/adapters/hip/device.cpp        |  4 +++-
 source/adapters/native_cpu/device.cpp |  3 ++-
 source/adapters/opencl/device.cpp     | 14 ++++++++++++--
 4 files changed, 20 insertions(+), 6 deletions(-)

diff --git a/source/adapters/cuda/device.cpp b/source/adapters/cuda/device.cpp
index a6bdc64f41..d8916ccedd 100644
--- a/source/adapters/cuda/device.cpp
+++ b/source/adapters/cuda/device.cpp
@@ -1087,9 +1087,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice,
   case UR_DEVICE_INFO_GPU_SUBSLICES_PER_SLICE:
   case UR_DEVICE_INFO_GPU_EU_COUNT_PER_SUBSLICE:
   case UR_DEVICE_INFO_GPU_HW_THREADS_PER_EU:
-  case UR_DEVICE_INFO_2D_BLOCK_ARRAY_CAPABILITIES_EXP:
     return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION;
-
+  case UR_DEVICE_INFO_2D_BLOCK_ARRAY_CAPABILITIES_EXP:
+    return ReturnValue(
+        static_cast<ur_exp_device_2d_block_array_capability_flags_t>(0));
   case UR_DEVICE_INFO_COMMAND_BUFFER_SUPPORT_EXP:
   case UR_DEVICE_INFO_COMMAND_BUFFER_EVENT_SUPPORT_EXP:
     return ReturnValue(true);
diff --git a/source/adapters/hip/device.cpp b/source/adapters/hip/device.cpp
index 8601e8b914..eed6a1c7c5 100644
--- a/source/adapters/hip/device.cpp
+++ b/source/adapters/hip/device.cpp
@@ -904,8 +904,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice,
   case UR_DEVICE_INFO_BFLOAT16:
   case UR_DEVICE_INFO_IL_VERSION:
   case UR_DEVICE_INFO_ASYNC_BARRIER:
-  case UR_DEVICE_INFO_2D_BLOCK_ARRAY_CAPABILITIES_EXP:
     return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION;
+  case UR_DEVICE_INFO_2D_BLOCK_ARRAY_CAPABILITIES_EXP:
+    return ReturnValue(
+        static_cast<ur_exp_device_2d_block_array_capability_flags_t>(0));
   case UR_DEVICE_INFO_COMMAND_BUFFER_SUPPORT_EXP: {
     int DriverVersion = 0;
     UR_CHECK_ERROR(hipDriverGetVersion(&DriverVersion));
diff --git a/source/adapters/native_cpu/device.cpp b/source/adapters/native_cpu/device.cpp
index 036e002d21..69c8bfc784 100644
--- a/source/adapters/native_cpu/device.cpp
+++ b/source/adapters/native_cpu/device.cpp
@@ -367,7 +367,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice,
   case UR_DEVICE_INFO_MEMORY_BUS_WIDTH:
     return UR_RESULT_ERROR_INVALID_VALUE;
   case UR_DEVICE_INFO_2D_BLOCK_ARRAY_CAPABILITIES_EXP:
-    return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION;
+    return ReturnValue(
+        static_cast<ur_exp_device_2d_block_array_capability_flags_t>(0));
   case UR_DEVICE_INFO_ATOMIC_FENCE_ORDER_CAPABILITIES: {
     // Currently for Native CPU fences are implemented using OCK
     // builtins, so we have different capabilities than atomic operations
diff --git a/source/adapters/opencl/device.cpp b/source/adapters/opencl/device.cpp
index ce3ade309e..b33d637a84 100644
--- a/source/adapters/opencl/device.cpp
+++ b/source/adapters/opencl/device.cpp
@@ -1094,10 +1094,20 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice,
   case UR_DEVICE_INFO_MEMORY_CLOCK_RATE:
   case UR_DEVICE_INFO_MEMORY_BUS_WIDTH:
   case UR_DEVICE_INFO_ASYNC_BARRIER:
-  case UR_DEVICE_INFO_2D_BLOCK_ARRAY_CAPABILITIES_EXP: {
     return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION;
+  case UR_DEVICE_INFO_2D_BLOCK_ARRAY_CAPABILITIES_EXP: {
+    bool Is2DBlockIOSupported = false;
+    if (cl_adapter::checkDeviceExtensions(
+            cl_adapter::cast<cl_device_id>(hDevice),
+            {"cl_intel_subgroup_2d_block_io"},
+            Is2DBlockIOSupported) != UR_RESULT_SUCCESS ||
+        !Is2DBlockIOSupported) {
+      return ReturnValue(
+          static_cast<ur_exp_device_2d_block_array_capability_flags_t>(0));
+    }
+    return ReturnValue(UR_EXP_DEVICE_2D_BLOCK_ARRAY_CAPABILITY_FLAG_LOAD |
+                       UR_EXP_DEVICE_2D_BLOCK_ARRAY_CAPABILITY_FLAG_STORE);
   }
-
   case UR_DEVICE_INFO_COMMAND_BUFFER_SUPPORT_EXP: {
     cl_device_id Dev = cl_adapter::cast<cl_device_id>(hDevice);
     size_t ExtSize = 0;

From c79df596fc76d384028fa35d492e4dbba7b2e124 Mon Sep 17 00:00:00 2001
From: "Gainullin, Artur" <artur.gainullin@intel.com>
Date: Tue, 26 Nov 2024 10:08:31 -0800
Subject: [PATCH 17/36] Remove redefinitions in L0 V2

---
 source/adapters/level_zero/CMakeLists.txt            | 2 +-
 source/adapters/level_zero/v2/command_list_cache.cpp | 9 ---------
 2 files changed, 1 insertion(+), 10 deletions(-)

diff --git a/source/adapters/level_zero/CMakeLists.txt b/source/adapters/level_zero/CMakeLists.txt
index 8306436097..05a33c1224 100644
--- a/source/adapters/level_zero/CMakeLists.txt
+++ b/source/adapters/level_zero/CMakeLists.txt
@@ -58,7 +58,7 @@ if(UR_BUILD_ADAPTER_L0)
         # 'utils' target from 'level-zero-loader' includes path which is prefixed
         # in the source directory, this breaks the installation of 'utils' target.
         set_target_properties(utils PROPERTIES INTERFACE_INCLUDE_DIRECTORIES "")
-        install(TARGETS ur_umf LevelZeroLoader LevelZeroLoader-Headers ze_loader utils
+        install(TARGETS ur_umf LevelZeroLoader LevelZeroLoader-Headers ComputeRuntimeLevelZero-Headers ze_loader utils
             EXPORT ${PROJECT_NAME}-targets
             ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
             RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
diff --git a/source/adapters/level_zero/v2/command_list_cache.cpp b/source/adapters/level_zero/v2/command_list_cache.cpp
index 9e585b80af..be4cb813fd 100644
--- a/source/adapters/level_zero/v2/command_list_cache.cpp
+++ b/source/adapters/level_zero/v2/command_list_cache.cpp
@@ -13,15 +13,6 @@
 
 #include "../device.hpp"
 
-typedef struct _zex_intel_queue_copy_operations_offload_hint_exp_desc_t {
-  ze_structure_type_t stype;
-  const void *pNext;
-  ze_bool_t copyOffloadEnabled;
-} zex_intel_queue_copy_operations_offload_hint_exp_desc_t;
-
-#define ZEX_INTEL_STRUCTURE_TYPE_QUEUE_COPY_OPERATIONS_OFFLOAD_HINT_EXP_PROPERTIES \
-  (ze_structure_type_t)0x0003001B
-
 template <>
 ze_structure_type_t
 getZeStructureType<zex_intel_queue_copy_operations_offload_hint_exp_desc_t>() {

From c13547658addf82083251ded01e32c2f91cdb000 Mon Sep 17 00:00:00 2001
From: Igor Chorazewicz <igor.chorazewicz@intel.com>
Date: Tue, 26 Nov 2024 21:19:10 +0000
Subject: [PATCH 18/36] [Doc] fix typo in LEVEL_ZERO.rst

---
 scripts/core/LEVEL_ZERO.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/core/LEVEL_ZERO.rst b/scripts/core/LEVEL_ZERO.rst
index e2ab18a0ea..fa42ecb2e7 100644
--- a/scripts/core/LEVEL_ZERO.rst
+++ b/scripts/core/LEVEL_ZERO.rst
@@ -28,7 +28,7 @@ Environment Variables
 | UR_L0_USE_RELAXED_ALLOCATION_LIMITS         | Controls the use of relaxed allocation limits.               | "0": Relaxed allocation limits are not used.                 | "0"              |
 |                                             |                                                              | "1": Relaxed allocation limits are used.                     |                  |
 +---------------------------------------------+--------------------------------------------------------------+--------------------------------------------------------------+------------------+
-| UR_L0_USE_DRIVER_IN_ORDER_LISTS             | Controls the use of in-order lists from the driver.          | "0": In-order lists from the driver are not used.            | "0"              |
+| UR_L0_USE_DRIVER_INORDER_LISTS              | Controls the use of in-order lists from the driver.          | "0": In-order lists from the driver are not used.            | "0"              |
 |                                             |                                                              | "1": In-order lists from the driver are used.                |                  |
 +---------------------------------------------+--------------------------------------------------------------+--------------------------------------------------------------+------------------+
 | UR_L0_USM_ALLOCATOR_TRACE                   | Enables tracing for the USM allocator.                       | "0": Tracing is disabled.                                    | "0"              |

From 3ad0b4bd8f781edcbbe368790eab028f19b4df3b Mon Sep 17 00:00:00 2001
From: Igor Chorazewicz <igor.chorazewicz@intel.com>
Date: Tue, 26 Nov 2024 21:39:11 +0000
Subject: [PATCH 19/36] [Test] add flaky test to match file for L0

---
 test/conformance/enqueue/enqueue_adapter_level_zero.match | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/test/conformance/enqueue/enqueue_adapter_level_zero.match b/test/conformance/enqueue/enqueue_adapter_level_zero.match
index 9394f1b0a0..7e3405a963 100644
--- a/test/conformance/enqueue/enqueue_adapter_level_zero.match
+++ b/test/conformance/enqueue/enqueue_adapter_level_zero.match
@@ -1,5 +1,7 @@
 {{OPT}}urEnqueueEventsWaitTest.Success/*
 {{OPT}}urEnqueueKernelLaunchTest.InvalidKernelArgs/*
+{{OPT}}urEnqueueKernelLaunchIncrementMultiDeviceMultiThreadTest.Success/UseEventsNoQueuePerThread
+{{OPT}}urEnqueueKernelLaunchIncrementMultiDeviceMultiThreadTest.Success/NoUseEventsNoQueuePerThread
 {{OPT}}urEnqueueKernelLaunchKernelWgSizeTest.Success/*
 {{OPT}}urEnqueueKernelLaunchKernelSubGroupTest.Success/*
 {{OPT}}urEnqueueMemBufferCopyRectTestWithParam.Success/*__copy_2d_3d

From b3ebe18e37df120ff05b9987d92cfe7024730536 Mon Sep 17 00:00:00 2001
From: Piotr Balcer <piotr.balcer@intel.com>
Date: Wed, 27 Nov 2024 12:16:33 +0100
Subject: [PATCH 20/36] add tests for low-power events extension

---
 .../urEnqueueEventsWaitWithBarrier.cpp        | 74 ++++++++++++++-----
 test/conformance/queue/urQueueCreate.cpp      |  4 +-
 .../testing/include/uur/fixtures.h            | 22 ++++++
 3 files changed, 79 insertions(+), 21 deletions(-)

diff --git a/test/conformance/enqueue/urEnqueueEventsWaitWithBarrier.cpp b/test/conformance/enqueue/urEnqueueEventsWaitWithBarrier.cpp
index 8cf6401211..ca464b48dd 100644
--- a/test/conformance/enqueue/urEnqueueEventsWaitWithBarrier.cpp
+++ b/test/conformance/enqueue/urEnqueueEventsWaitWithBarrier.cpp
@@ -4,9 +4,30 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 #include <uur/fixtures.h>
 
-struct urEnqueueEventsWaitWithBarrierTest : uur::urMultiQueueTest {
+enum class BarrierType {
+    Normal,
+    ExtLowPower,
+};
+
+std::ostream &operator<<(std::ostream &os, BarrierType barrierType) {
+    switch (barrierType) {
+    case BarrierType::Normal:
+        os << "Normal";
+        break;
+    case BarrierType::ExtLowPower:
+        os << "ExtLowPower";
+        break;
+    default:
+        os << "Unknown";
+        break;
+    }
+    return os;
+}
+
+struct urEnqueueEventsWaitWithBarrierTest
+    : uur::urMultiQueueTestWithParam<BarrierType> {
     void SetUp() override {
-        UUR_RETURN_ON_FATAL_FAILURE(urMultiQueueTest::SetUp());
+        UUR_RETURN_ON_FATAL_FAILURE(urMultiQueueTestWithParam::SetUp());
         ASSERT_SUCCESS(urMemBufferCreate(context, UR_MEM_FLAG_WRITE_ONLY, size,
                                          nullptr, &src_buffer));
         ASSERT_SUCCESS(urMemBufferCreate(context, UR_MEM_FLAG_READ_ONLY, size,
@@ -24,7 +45,23 @@ struct urEnqueueEventsWaitWithBarrierTest : uur::urMultiQueueTest {
         if (dst_buffer) {
             EXPECT_SUCCESS(urMemRelease(dst_buffer));
         }
-        urMultiQueueTest::TearDown();
+        urMultiQueueTestWithParam::TearDown();
+    }
+
+    ur_result_t EnqueueBarrier(ur_queue_handle_t queue, uint32_t num_events,
+                               const ur_event_handle_t *event_list,
+                               ur_event_handle_t *wait_event) {
+        BarrierType barrier = getParam();
+        if (barrier == BarrierType::ExtLowPower) {
+            struct ur_exp_enqueue_ext_properties_t props = {
+                UR_STRUCTURE_TYPE_EXP_ENQUEUE_EXT_PROPERTIES, nullptr,
+                UR_EXP_ENQUEUE_EXT_FLAG_LOW_POWER_EVENTS};
+            return urEnqueueEventsWaitWithBarrierExt(queue, &props, num_events,
+                                                     event_list, wait_event);
+        }
+
+        return urEnqueueEventsWaitWithBarrier(queue, num_events, event_list,
+                                              wait_event);
     }
 
     const size_t count = 1024;
@@ -34,7 +71,10 @@ struct urEnqueueEventsWaitWithBarrierTest : uur::urMultiQueueTest {
     std::vector<uint32_t> input;
 };
 
-UUR_INSTANTIATE_DEVICE_TEST_SUITE_P(urEnqueueEventsWaitWithBarrierTest);
+UUR_TEST_SUITE_P(urEnqueueEventsWaitWithBarrierTest,
+                 ::testing::Values(BarrierType::Normal,
+                                   BarrierType::ExtLowPower),
+                 uur::deviceTestWithParamPrinter<BarrierType>);
 
 struct urEnqueueEventsWaitWithBarrierOrderingTest : uur::urProgramTest {
     void SetUp() override {
@@ -67,8 +107,7 @@ TEST_P(urEnqueueEventsWaitWithBarrierTest, Success) {
     ur_event_handle_t waitEvent = nullptr;
     ASSERT_SUCCESS(urEnqueueMemBufferCopy(queue1, src_buffer, dst_buffer, 0, 0,
                                           size, 0, nullptr, &event1));
-    EXPECT_SUCCESS(
-        urEnqueueEventsWaitWithBarrier(queue2, 1, &event1, &waitEvent));
+    EXPECT_SUCCESS(EnqueueBarrier(queue2, 1, &event1, &waitEvent));
     EXPECT_SUCCESS(urQueueFlush(queue2));
     EXPECT_SUCCESS(urQueueFlush(queue1));
     EXPECT_SUCCESS(urEventWait(1, &waitEvent));
@@ -86,8 +125,7 @@ TEST_P(urEnqueueEventsWaitWithBarrierTest, Success) {
                                            input.data(), 0, nullptr, nullptr));
     EXPECT_SUCCESS(urEnqueueMemBufferCopy(queue2, src_buffer, dst_buffer, 0, 0,
                                           size, 0, nullptr, &event2));
-    EXPECT_SUCCESS(
-        urEnqueueEventsWaitWithBarrier(queue1, 1, &event2, &waitEvent));
+    EXPECT_SUCCESS(EnqueueBarrier(queue1, 1, &event2, &waitEvent));
     EXPECT_SUCCESS(urQueueFlush(queue2));
     EXPECT_SUCCESS(urQueueFlush(queue1));
     EXPECT_SUCCESS(urEventWait(1, &waitEvent));
@@ -99,27 +137,23 @@ TEST_P(urEnqueueEventsWaitWithBarrierTest, Success) {
 }
 
 TEST_P(urEnqueueEventsWaitWithBarrierTest, InvalidNullHandleQueue) {
-    ASSERT_EQ_RESULT(
-        UR_RESULT_ERROR_INVALID_NULL_HANDLE,
-        urEnqueueEventsWaitWithBarrier(nullptr, 0, nullptr, nullptr));
+    ASSERT_EQ_RESULT(UR_RESULT_ERROR_INVALID_NULL_HANDLE,
+                     EnqueueBarrier(nullptr, 0, nullptr, nullptr));
 }
 
 TEST_P(urEnqueueEventsWaitWithBarrierTest, InvalidNullPtrEventWaitList) {
-    ASSERT_EQ_RESULT(
-        urEnqueueEventsWaitWithBarrier(queue1, 1, nullptr, nullptr),
-        UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST);
+    ASSERT_EQ_RESULT(EnqueueBarrier(queue1, 1, nullptr, nullptr),
+                     UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST);
 
     ur_event_handle_t validEvent;
     ASSERT_SUCCESS(urEnqueueEventsWait(queue1, 0, nullptr, &validEvent));
 
-    ASSERT_EQ_RESULT(
-        urEnqueueEventsWaitWithBarrier(queue1, 0, &validEvent, nullptr),
-        UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST);
+    ASSERT_EQ_RESULT(EnqueueBarrier(queue1, 0, &validEvent, nullptr),
+                     UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST);
 
     ur_event_handle_t inv_evt = nullptr;
-    ASSERT_EQ_RESULT(
-        urEnqueueEventsWaitWithBarrier(queue1, 1, &inv_evt, nullptr),
-        UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST);
+    ASSERT_EQ_RESULT(EnqueueBarrier(queue1, 1, &inv_evt, nullptr),
+                     UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST);
 
     ASSERT_SUCCESS(urEventRelease(validEvent));
 }
diff --git a/test/conformance/queue/urQueueCreate.cpp b/test/conformance/queue/urQueueCreate.cpp
index ad0957d747..8b26784ef5 100644
--- a/test/conformance/queue/urQueueCreate.cpp
+++ b/test/conformance/queue/urQueueCreate.cpp
@@ -2,6 +2,7 @@
 // Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions.
 // See LICENSE.TXT
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#include "ur_api.h"
 #include "uur/raii.h"
 #include <uur/fixtures.h>
 
@@ -34,7 +35,8 @@ UUR_TEST_SUITE_P(urQueueCreateWithParamTest,
                                  UR_QUEUE_FLAG_SUBMISSION_BATCHED,
                                  UR_QUEUE_FLAG_SUBMISSION_IMMEDIATE,
                                  UR_QUEUE_FLAG_USE_DEFAULT_STREAM,
-                                 UR_QUEUE_FLAG_SYNC_WITH_DEFAULT_STREAM),
+                                 UR_QUEUE_FLAG_SYNC_WITH_DEFAULT_STREAM,
+                                 UR_QUEUE_FLAG_LOW_POWER_EVENTS_EXP),
                  uur::deviceTestWithParamPrinter<ur_queue_flag_t>);
 
 TEST_P(urQueueCreateWithParamTest, SuccessWithProperties) {
diff --git a/test/conformance/testing/include/uur/fixtures.h b/test/conformance/testing/include/uur/fixtures.h
index 436e7821a9..1900568292 100644
--- a/test/conformance/testing/include/uur/fixtures.h
+++ b/test/conformance/testing/include/uur/fixtures.h
@@ -519,6 +519,28 @@ struct urMultiQueueTest : urContextTest {
     ur_queue_handle_t queue2 = nullptr;
 };
 
+template <class T>
+struct urMultiQueueTestWithParam : urContextTestWithParam<T> {
+    void SetUp() override {
+        UUR_RETURN_ON_FATAL_FAILURE(urContextTestWithParam<T>::SetUp());
+        ASSERT_SUCCESS(urQueueCreate(this->context, this->device, 0, &queue1));
+        ASSERT_SUCCESS(urQueueCreate(this->context, this->device, 0, &queue2));
+    }
+
+    void TearDown() override {
+        if (queue1 != nullptr) {
+            EXPECT_SUCCESS(urQueueRelease(queue1));
+        }
+        if (queue2 != nullptr) {
+            EXPECT_SUCCESS(urQueueRelease(queue2));
+        }
+        UUR_RETURN_ON_FATAL_FAILURE(urContextTestWithParam<T>::TearDown());
+    }
+
+    ur_queue_handle_t queue1 = nullptr;
+    ur_queue_handle_t queue2 = nullptr;
+};
+
 template <size_t MinDevices = 2>
 struct urMultiDeviceContextTestTemplate : urPlatformTest {
     void SetUp() override {

From d08ee7e948855044d7b7db39fc258c763f3a5543 Mon Sep 17 00:00:00 2001
From: Piotr Balcer <piotr.balcer@intel.com>
Date: Wed, 27 Nov 2024 12:39:30 +0100
Subject: [PATCH 21/36] add missing pfn assignments for
 urEnqueueEventsWaitWithBarrierExt

---
 source/adapters/cuda/ur_interface_loader.cpp       | 1 +
 source/adapters/hip/ur_interface_loader.cpp        | 1 +
 source/adapters/native_cpu/ur_interface_loader.cpp | 1 +
 source/adapters/opencl/ur_interface_loader.cpp     | 1 +
 4 files changed, 4 insertions(+)

diff --git a/source/adapters/cuda/ur_interface_loader.cpp b/source/adapters/cuda/ur_interface_loader.cpp
index a9559eb188..4b13e6669c 100644
--- a/source/adapters/cuda/ur_interface_loader.cpp
+++ b/source/adapters/cuda/ur_interface_loader.cpp
@@ -174,6 +174,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueProcAddrTable(
   pDdiTable->pfnDeviceGlobalVariableWrite = urEnqueueDeviceGlobalVariableWrite;
   pDdiTable->pfnEventsWait = urEnqueueEventsWait;
   pDdiTable->pfnEventsWaitWithBarrier = urEnqueueEventsWaitWithBarrier;
+  pDdiTable->pfnEventsWaitWithBarrierExt = urEnqueueEventsWaitWithBarrierExt;
   pDdiTable->pfnKernelLaunch = urEnqueueKernelLaunch;
   pDdiTable->pfnMemBufferCopy = urEnqueueMemBufferCopy;
   pDdiTable->pfnMemBufferCopyRect = urEnqueueMemBufferCopyRect;
diff --git a/source/adapters/hip/ur_interface_loader.cpp b/source/adapters/hip/ur_interface_loader.cpp
index 1454ddfdf1..f7ec09188f 100644
--- a/source/adapters/hip/ur_interface_loader.cpp
+++ b/source/adapters/hip/ur_interface_loader.cpp
@@ -174,6 +174,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueProcAddrTable(
   pDdiTable->pfnDeviceGlobalVariableWrite = urEnqueueDeviceGlobalVariableWrite;
   pDdiTable->pfnEventsWait = urEnqueueEventsWait;
   pDdiTable->pfnEventsWaitWithBarrier = urEnqueueEventsWaitWithBarrier;
+  pDdiTable->pfnEventsWaitWithBarrierExt = urEnqueueEventsWaitWithBarrierExt;
   pDdiTable->pfnKernelLaunch = urEnqueueKernelLaunch;
   pDdiTable->pfnMemBufferCopy = urEnqueueMemBufferCopy;
   pDdiTable->pfnMemBufferCopyRect = urEnqueueMemBufferCopyRect;
diff --git a/source/adapters/native_cpu/ur_interface_loader.cpp b/source/adapters/native_cpu/ur_interface_loader.cpp
index 94c6c4a03e..9717f020c3 100644
--- a/source/adapters/native_cpu/ur_interface_loader.cpp
+++ b/source/adapters/native_cpu/ur_interface_loader.cpp
@@ -172,6 +172,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueProcAddrTable(
   pDdiTable->pfnDeviceGlobalVariableWrite = urEnqueueDeviceGlobalVariableWrite;
   pDdiTable->pfnEventsWait = urEnqueueEventsWait;
   pDdiTable->pfnEventsWaitWithBarrier = urEnqueueEventsWaitWithBarrier;
+  pDdiTable->pfnEventsWaitWithBarrierExt = urEnqueueEventsWaitWithBarrierExt;
   pDdiTable->pfnKernelLaunch = urEnqueueKernelLaunch;
   pDdiTable->pfnMemBufferCopy = urEnqueueMemBufferCopy;
   pDdiTable->pfnMemBufferCopyRect = urEnqueueMemBufferCopyRect;
diff --git a/source/adapters/opencl/ur_interface_loader.cpp b/source/adapters/opencl/ur_interface_loader.cpp
index cba90ee152..46d2bf6cdd 100644
--- a/source/adapters/opencl/ur_interface_loader.cpp
+++ b/source/adapters/opencl/ur_interface_loader.cpp
@@ -174,6 +174,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueProcAddrTable(
   pDdiTable->pfnDeviceGlobalVariableWrite = urEnqueueDeviceGlobalVariableWrite;
   pDdiTable->pfnEventsWait = urEnqueueEventsWait;
   pDdiTable->pfnEventsWaitWithBarrier = urEnqueueEventsWaitWithBarrier;
+  pDdiTable->pfnEventsWaitWithBarrierExt = urEnqueueEventsWaitWithBarrierExt;
   pDdiTable->pfnKernelLaunch = urEnqueueKernelLaunch;
   pDdiTable->pfnMemBufferCopy = urEnqueueMemBufferCopy;
   pDdiTable->pfnMemBufferCopyRect = urEnqueueMemBufferCopyRect;

From 46832dfddb5786c00e29287731d1a5e977e24c75 Mon Sep 17 00:00:00 2001
From: Krzysztof Swiecicki <krzysztof.swiecicki@intel.com>
Date: Thu, 28 Nov 2024 11:43:50 +0100
Subject: [PATCH 22/36] [L0] Default initialize the adapter results variables

The optional value for ZeResult can be checked to confirm whether
initialization already happened. The rest of the result variables don't
have this purpose and don't need to be wrapped with std::optional<>.
---
 source/adapters/level_zero/adapter.cpp | 18 ++++++++++--------
 source/adapters/level_zero/adapter.hpp |  6 +++---
 2 files changed, 13 insertions(+), 11 deletions(-)

diff --git a/source/adapters/level_zero/adapter.cpp b/source/adapters/level_zero/adapter.cpp
index 8995a5e25b..f2ac67ddff 100644
--- a/source/adapters/level_zero/adapter.cpp
+++ b/source/adapters/level_zero/adapter.cpp
@@ -256,6 +256,9 @@ Behavior Summary:
 */
 ur_adapter_handle_t_::ur_adapter_handle_t_()
     : logger(logger::get_logger("level_zero")) {
+  ZeInitDriversResult = ZE_RESULT_ERROR_UNINITIALIZED;
+  ZeInitResult = ZE_RESULT_ERROR_UNINITIALIZED;
+  ZesResult = ZE_RESULT_ERROR_UNINITIALIZED;
 
   if (UrL0Debug & UR_L0_DEBUG_BASIC) {
     logger.setLegacySink(std::make_unique<ur_legacy_sink>());
@@ -331,9 +334,8 @@ ur_adapter_handle_t_::ur_adapter_handle_t_()
       logger::debug("\nzeInit with flags value of {}\n",
                     static_cast<int>(L0InitFlags));
       GlobalAdapter->ZeInitResult = ZE_CALL_NOCHECK(zeInit, (L0InitFlags));
-      if (*GlobalAdapter->ZeInitResult != ZE_RESULT_SUCCESS) {
-        logger::debug("\nzeInit failed with {}\n",
-                      *GlobalAdapter->ZeInitResult);
+      if (GlobalAdapter->ZeInitResult != ZE_RESULT_SUCCESS) {
+        logger::debug("\nzeInit failed with {}\n", GlobalAdapter->ZeInitResult);
       }
 
       bool useInitDrivers = false;
@@ -376,17 +378,17 @@ ur_adapter_handle_t_::ur_adapter_handle_t_()
               ZE_CALL_NOCHECK(GlobalAdapter->initDriversFunctionPtr,
                               (&GlobalAdapter->ZeInitDriversCount, nullptr,
                                &GlobalAdapter->InitDriversDesc));
-          if (*GlobalAdapter->ZeInitDriversResult == ZE_RESULT_SUCCESS) {
+          if (GlobalAdapter->ZeInitDriversResult == ZE_RESULT_SUCCESS) {
             GlobalAdapter->InitDriversSupported = true;
           } else {
             logger::debug("\nzeInitDrivers failed with {}\n",
-                          *GlobalAdapter->ZeInitDriversResult);
+                          GlobalAdapter->ZeInitDriversResult);
           }
         }
       }
 
-      if (*GlobalAdapter->ZeInitResult == ZE_RESULT_SUCCESS ||
-          *GlobalAdapter->ZeInitDriversResult == ZE_RESULT_SUCCESS) {
+      if (GlobalAdapter->ZeInitResult == ZE_RESULT_SUCCESS ||
+          GlobalAdapter->ZeInitDriversResult == ZE_RESULT_SUCCESS) {
         GlobalAdapter->ZeResult = ZE_RESULT_SUCCESS;
       } else {
         GlobalAdapter->ZeResult = ZE_RESULT_ERROR_UNINITIALIZED;
@@ -450,7 +452,7 @@ ur_adapter_handle_t_::ur_adapter_handle_t_()
       GlobalAdapter->ZesResult = ZE_RESULT_ERROR_UNINITIALIZED;
     }
 
-    ur_result_t err = initPlatforms(platforms, *GlobalAdapter->ZesResult);
+    ur_result_t err = initPlatforms(platforms, GlobalAdapter->ZesResult);
     if (err == UR_RESULT_SUCCESS) {
       result = std::move(platforms);
     } else {
diff --git a/source/adapters/level_zero/adapter.hpp b/source/adapters/level_zero/adapter.hpp
index 277d2334b9..c41f671d9b 100644
--- a/source/adapters/level_zero/adapter.hpp
+++ b/source/adapters/level_zero/adapter.hpp
@@ -39,10 +39,10 @@ struct ur_adapter_handle_t_ {
   uint32_t ZeInitDriversCount = 0;
   bool InitDriversSupported = false;
 
-  std::optional<ze_result_t> ZeInitDriversResult;
-  std::optional<ze_result_t> ZeInitResult;
+  ze_result_t ZeInitDriversResult;
+  ze_result_t ZeInitResult;
+  ze_result_t ZesResult;
   std::optional<ze_result_t> ZeResult;
-  std::optional<ze_result_t> ZesResult;
   ZeCache<Result<PlatformVec>> PlatformCache;
   logger::Logger &logger;
   HMODULE processHandle = nullptr;

From 4844a1846dfad5beac4bf50e433e3429396f568c Mon Sep 17 00:00:00 2001
From: Ross Brunton <ross@codeplay.com>
Date: Fri, 29 Nov 2024 11:34:04 +0000
Subject: [PATCH 23/36] Disable CUDA sycl e2e tests

They seem to cause hangs in our CI.

Relates to: #2398
---
 .github/workflows/cmake.yml | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/.github/workflows/cmake.yml b/.github/workflows/cmake.yml
index 2912475272..0a4ae99a58 100644
--- a/.github/workflows/cmake.yml
+++ b/.github/workflows/cmake.yml
@@ -221,13 +221,14 @@ jobs:
     needs: [ubuntu-build, opencl]
     uses: ./.github/workflows/e2e_opencl.yml
 
-  e2e-cuda:
-    name: E2E CUDA
-    permissions:
-      contents: read
-      pull-requests: write
-    needs: [ubuntu-build, cuda]
-    uses: ./.github/workflows/e2e_cuda.yml
+  # Causes hangs: https://github.com/oneapi-src/unified-runtime/issues/2398
+  #e2e-cuda:
+  #  name: E2E CUDA
+  #  permissions:
+  #    contents: read
+  #    pull-requests: write
+  #  needs: [ubuntu-build, cuda]
+  #  uses: ./.github/workflows/e2e_cuda.yml
 
   windows-build:
     name: Build - Windows

From 28dc17314d2d670186b1bee62f02be65826f6bc0 Mon Sep 17 00:00:00 2001
From: Callum Fare <callum@codeplay.com>
Date: Fri, 29 Nov 2024 14:12:21 +0000
Subject: [PATCH 24/36] Revert "Add new launch property to support
 work_group_scratch_memory"

This reverts commit 7222f79b160681434250bac74f3aae0e4abc46f0.
---
 include/ur_api.h                              |  15 +--
 include/ur_ddi.h                              |   1 -
 include/ur_print.hpp                          |  16 ---
 scripts/core/exp-launch-properties.yml        |  13 +-
 source/adapters/cuda/enqueue.cpp              | 111 +++++++-----------
 source/adapters/cuda/kernel.hpp               |  45 ++-----
 source/adapters/level_zero/queue.cpp          |   5 +-
 .../level_zero/ur_interface_loader.hpp        |   4 +-
 source/adapters/level_zero/v2/queue_api.cpp   |   6 +-
 source/adapters/level_zero/v2/queue_api.hpp   |   6 +-
 .../v2/queue_immediate_in_order.cpp           |   4 +-
 .../v2/queue_immediate_in_order.hpp           |   4 +-
 source/adapters/mock/ur_mockddi.cpp           |  23 +---
 source/loader/layers/tracing/ur_trcddi.cpp    |  25 ++--
 source/loader/layers/validation/ur_valddi.cpp |  13 +-
 source/loader/ur_ldrddi.cpp                   |  35 +-----
 source/loader/ur_libapi.cpp                   |  12 +-
 source/ur_api.cpp                             |   4 -
 .../launch_properties.cpp                     |   4 +-
 19 files changed, 94 insertions(+), 252 deletions(-)

diff --git a/include/ur_api.h b/include/ur_api.h
index 1de876cb7f..eb8b07221c 100644
--- a/include/ur_api.h
+++ b/include/ur_api.h
@@ -9560,7 +9560,6 @@ typedef enum ur_exp_launch_property_id_t {
     UR_EXP_LAUNCH_PROPERTY_ID_IGNORE = 0,            ///< The property has no effect
     UR_EXP_LAUNCH_PROPERTY_ID_COOPERATIVE = 1,       ///< Whether to launch a cooperative kernel
     UR_EXP_LAUNCH_PROPERTY_ID_CLUSTER_DIMENSION = 2, ///< work-group cluster dimensions
-    UR_EXP_LAUNCH_PROPERTY_ID_WORK_GROUP_MEMORY = 3, ///< Implicit work group memory allocation
     /// @cond
     UR_EXP_LAUNCH_PROPERTY_ID_FORCE_UINT32 = 0x7fffffff
     /// @endcond
@@ -9574,12 +9573,10 @@ typedef enum ur_exp_launch_property_id_t {
 ///   _Analogues_
 ///     - **CUlaunchAttributeValue**
 typedef union ur_exp_launch_property_value_t {
-    uint32_t clusterDim[3];    ///< [in] dimensions of the cluster (units of work-group) (x, y, z). Each
-                               ///< value must be a divisor of the corresponding global work-size
-                               ///< dimension (in units of work-group).
-    int cooperative;           ///< [in] non-zero value indicates a cooperative kernel
-    size_t workgroup_mem_size; ///< [in] non-zero value indicates the amount of work group memory to
-                               ///< allocate in bytes
+    uint32_t clusterDim[3]; ///< [in] dimensions of the cluster (units of work-group) (x, y, z). Each
+                            ///< value must be a divisor of the corresponding global work-size
+                            ///< dimension (in units of work-group).
+    int cooperative;        ///< [in] non-zero value indicates a cooperative kernel
 
 } ur_exp_launch_property_value_t;
 
@@ -9620,7 +9617,6 @@ typedef struct ur_exp_launch_property_t {
 ///         + NULL == hQueue
 ///         + NULL == hKernel
 ///     - ::UR_RESULT_ERROR_INVALID_NULL_POINTER
-///         + `NULL == pGlobalWorkOffset`
 ///         + `NULL == pGlobalWorkSize`
 ///         + `NULL == launchPropList`
 ///         + NULL == pGlobalWorkSize
@@ -9649,8 +9645,6 @@ urEnqueueKernelLaunchCustomExp(
     ur_kernel_handle_t hKernel,                     ///< [in] handle of the kernel object
     uint32_t workDim,                               ///< [in] number of dimensions, from 1 to 3, to specify the global and
                                                     ///< work-group work-items
-    const size_t *pGlobalWorkOffset,                ///< [in] pointer to an array of workDim unsigned values that specify the
-                                                    ///< offset used to calculate the global ID of a work-item
     const size_t *pGlobalWorkSize,                  ///< [in] pointer to an array of workDim unsigned values that specify the
                                                     ///< number of global work-items in workDim that will execute the kernel
                                                     ///< function
@@ -11560,7 +11554,6 @@ typedef struct ur_enqueue_kernel_launch_custom_exp_params_t {
     ur_queue_handle_t *phQueue;
     ur_kernel_handle_t *phKernel;
     uint32_t *pworkDim;
-    const size_t **ppGlobalWorkOffset;
     const size_t **ppGlobalWorkSize;
     const size_t **ppLocalWorkSize;
     uint32_t *pnumPropsInLaunchPropList;
diff --git a/include/ur_ddi.h b/include/ur_ddi.h
index cdf90eda6d..40a6c5c269 100644
--- a/include/ur_ddi.h
+++ b/include/ur_ddi.h
@@ -1467,7 +1467,6 @@ typedef ur_result_t(UR_APICALL *ur_pfnEnqueueKernelLaunchCustomExp_t)(
     uint32_t,
     const size_t *,
     const size_t *,
-    const size_t *,
     uint32_t,
     const ur_exp_launch_property_t *,
     uint32_t,
diff --git a/include/ur_print.hpp b/include/ur_print.hpp
index 190d3f9cd5..8888a74f91 100644
--- a/include/ur_print.hpp
+++ b/include/ur_print.hpp
@@ -10397,9 +10397,6 @@ inline std::ostream &operator<<(std::ostream &os, enum ur_exp_launch_property_id
     case UR_EXP_LAUNCH_PROPERTY_ID_CLUSTER_DIMENSION:
         os << "UR_EXP_LAUNCH_PROPERTY_ID_CLUSTER_DIMENSION";
         break;
-    case UR_EXP_LAUNCH_PROPERTY_ID_WORK_GROUP_MEMORY:
-        os << "UR_EXP_LAUNCH_PROPERTY_ID_WORK_GROUP_MEMORY";
-        break;
     default:
         os << "unknown enumerator";
         break;
@@ -10436,13 +10433,6 @@ inline ur_result_t printUnion(
 
         os << (params.cooperative);
 
-        break;
-    case UR_EXP_LAUNCH_PROPERTY_ID_WORK_GROUP_MEMORY:
-
-        os << ".workgroup_mem_size = ";
-
-        os << (params.workgroup_mem_size);
-
         break;
     default:
         os << "<unknown>";
@@ -15110,12 +15100,6 @@ inline std::ostream &operator<<(std::ostream &os, [[maybe_unused]] const struct
 
     os << *(params->pworkDim);
 
-    os << ", ";
-    os << ".pGlobalWorkOffset = ";
-
-    ur::details::printPtr(os,
-                          *(params->ppGlobalWorkOffset));
-
     os << ", ";
     os << ".pGlobalWorkSize = ";
 
diff --git a/scripts/core/exp-launch-properties.yml b/scripts/core/exp-launch-properties.yml
index ca28421815..9e66e9ea06 100644
--- a/scripts/core/exp-launch-properties.yml
+++ b/scripts/core/exp-launch-properties.yml
@@ -29,8 +29,6 @@ etors:
       desc: "Whether to launch a cooperative kernel"
     - name: CLUSTER_DIMENSION
       desc: "work-group cluster dimensions"
-    - name: WORK_GROUP_MEMORY
-      desc: "Implicit work group memory allocation"
 --- #--------------------------------------------------------------------------
 type: union
 desc: "Specifies a launch property value"
@@ -47,10 +45,6 @@ members:
       name: cooperative
       desc: "[in] non-zero value indicates a cooperative kernel"
       tag: $X_EXP_LAUNCH_PROPERTY_ID_COOPERATIVE
-    - type: size_t
-      name: workgroup_mem_size
-      desc: "[in] non-zero value indicates the amount of work group memory to allocate in bytes"
-      tag: $X_EXP_LAUNCH_PROPERTY_ID_WORK_GROUP_MEMORY
 --- #--------------------------------------------------------------------------
 type: struct
 desc: "Kernel launch property"
@@ -88,9 +82,6 @@ params:
     - type: uint32_t
       name: workDim
       desc: "[in] number of dimensions, from 1 to 3, to specify the global and work-group work-items"
-    - type: "const size_t*"
-      name: pGlobalWorkOffset
-      desc: "[in] pointer to an array of workDim unsigned values that specify the offset used to calculate the global ID of a work-item"
     - type: const size_t*
       name: pGlobalWorkSize
       desc: "[in] pointer to an array of workDim unsigned values that specify the number of global work-items in workDim that will execute the kernel function"
@@ -106,10 +97,10 @@ params:
     - type: uint32_t
       name: numEventsInWaitList
       desc: "[in] size of the event wait list"
-    - type: const $x_event_handle_t*
+    - type: const ur_event_handle_t*
       name: phEventWaitList
       desc: "[in][optional][range(0, numEventsInWaitList)] pointer to a list of events that must be complete before the kernel execution. If nullptr, the numEventsInWaitList must be 0, indicating that no wait event. "
-    - type: $x_event_handle_t*
+    - type: ur_event_handle_t*
       name: phEvent
       desc: "[out][optional] return an event object that identifies this particular kernel execution instance. If phEventWaitList and phEvent are not NULL, phEvent must not refer to an element of the phEventWaitList array."
 returns:
diff --git a/source/adapters/cuda/enqueue.cpp b/source/adapters/cuda/enqueue.cpp
index a3fc58dfb9..fc3d0220e8 100644
--- a/source/adapters/cuda/enqueue.cpp
+++ b/source/adapters/cuda/enqueue.cpp
@@ -422,13 +422,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueEventsWait(
                                         phEventWaitList, phEvent);
 }
 
-static ur_result_t
-enqueueKernelLaunch(ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel,
-                    uint32_t workDim, const size_t *pGlobalWorkOffset,
-                    const size_t *pGlobalWorkSize, const size_t *pLocalWorkSize,
-                    uint32_t numEventsInWaitList,
-                    const ur_event_handle_t *phEventWaitList,
-                    ur_event_handle_t *phEvent, size_t WorkGroupMemory) {
+UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
+    ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim,
+    const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize,
+    const size_t *pLocalWorkSize, uint32_t numEventsInWaitList,
+    const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) {
   // Preconditions
   UR_ASSERT(hQueue->getDevice() == hKernel->getProgram()->getDevice(),
             UR_RESULT_ERROR_INVALID_KERNEL);
@@ -446,9 +444,6 @@ enqueueKernelLaunch(ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel,
   size_t ThreadsPerBlock[3] = {32u, 1u, 1u};
   size_t BlocksPerGrid[3] = {1u, 1u, 1u};
 
-  // Set work group memory so we can compute the whole memory requirement
-  if (WorkGroupMemory)
-    hKernel->setWorkGroupMemory(WorkGroupMemory);
   uint32_t LocalSize = hKernel->getLocalSize();
   CUfunction CuFunc = hKernel->get();
 
@@ -511,17 +506,6 @@ enqueueKernelLaunch(ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel,
   return UR_RESULT_SUCCESS;
 }
 
-UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
-    ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim,
-    const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize,
-    const size_t *pLocalWorkSize, uint32_t numEventsInWaitList,
-    const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) {
-  return enqueueKernelLaunch(hQueue, hKernel, workDim, pGlobalWorkOffset,
-                             pGlobalWorkSize, pLocalWorkSize,
-                             numEventsInWaitList, phEventWaitList, phEvent,
-                             /*WorkGroupMemory=*/0);
-}
-
 UR_APIEXPORT ur_result_t UR_APICALL urEnqueueCooperativeKernelLaunchExp(
     ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim,
     const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize,
@@ -532,9 +516,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueCooperativeKernelLaunchExp(
     coop_prop.id = UR_EXP_LAUNCH_PROPERTY_ID_COOPERATIVE;
     coop_prop.value.cooperative = 1;
     return urEnqueueKernelLaunchCustomExp(
-        hQueue, hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize,
-        pLocalWorkSize, 1, &coop_prop, numEventsInWaitList, phEventWaitList,
-        phEvent);
+        hQueue, hKernel, workDim, pGlobalWorkSize, pLocalWorkSize, 1,
+        &coop_prop, numEventsInWaitList, phEventWaitList, phEvent);
   }
   return urEnqueueKernelLaunch(hQueue, hKernel, workDim, pGlobalWorkOffset,
                                pGlobalWorkSize, pLocalWorkSize,
@@ -543,29 +526,16 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueCooperativeKernelLaunchExp(
 
 UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
     ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim,
-    const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize,
-    const size_t *pLocalWorkSize, uint32_t numPropsInLaunchPropList,
+    const size_t *pGlobalWorkSize, const size_t *pLocalWorkSize,
+    uint32_t numPropsInLaunchPropList,
     const ur_exp_launch_property_t *launchPropList,
     uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList,
     ur_event_handle_t *phEvent) {
 
-  size_t WorkGroupMemory = [&]() -> size_t {
-    const ur_exp_launch_property_t *WorkGroupMemoryProp = std::find_if(
-        launchPropList, launchPropList + numPropsInLaunchPropList,
-        [](const ur_exp_launch_property_t &Prop) {
-          return Prop.id == UR_EXP_LAUNCH_PROPERTY_ID_WORK_GROUP_MEMORY;
-        });
-    if (WorkGroupMemoryProp != launchPropList + numPropsInLaunchPropList)
-      return WorkGroupMemoryProp->value.workgroup_mem_size;
-    return 0;
-  }();
-
-  if (numPropsInLaunchPropList == 0 ||
-      (WorkGroupMemory && numPropsInLaunchPropList == 1)) {
-    return enqueueKernelLaunch(hQueue, hKernel, workDim, pGlobalWorkOffset,
-                               pGlobalWorkSize, pLocalWorkSize,
-                               numEventsInWaitList, phEventWaitList, phEvent,
-                               WorkGroupMemory);
+  if (numPropsInLaunchPropList == 0) {
+    urEnqueueKernelLaunch(hQueue, hKernel, workDim, nullptr, pGlobalWorkSize,
+                          pLocalWorkSize, numEventsInWaitList, phEventWaitList,
+                          phEvent);
   }
 #if CUDA_VERSION >= 11080
   // Preconditions
@@ -578,8 +548,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
     return UR_RESULT_ERROR_INVALID_NULL_POINTER;
   }
 
-  std::vector<CUlaunchAttribute> launch_attribute;
-  launch_attribute.reserve(numPropsInLaunchPropList);
+  std::vector<CUlaunchAttribute> launch_attribute(numPropsInLaunchPropList);
 
   // Early exit for zero size kernel
   if (*pGlobalWorkSize == 0) {
@@ -592,35 +561,40 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
   size_t ThreadsPerBlock[3] = {32u, 1u, 1u};
   size_t BlocksPerGrid[3] = {1u, 1u, 1u};
 
-  // Set work group memory so we can compute the whole memory requirement
-  if (WorkGroupMemory)
-    hKernel->setWorkGroupMemory(WorkGroupMemory);
   uint32_t LocalSize = hKernel->getLocalSize();
   CUfunction CuFunc = hKernel->get();
 
   for (uint32_t i = 0; i < numPropsInLaunchPropList; i++) {
     switch (launchPropList[i].id) {
     case UR_EXP_LAUNCH_PROPERTY_ID_IGNORE: {
-      auto &attr = launch_attribute.emplace_back();
-      attr.id = CU_LAUNCH_ATTRIBUTE_IGNORE;
+      launch_attribute[i].id = CU_LAUNCH_ATTRIBUTE_IGNORE;
       break;
     }
     case UR_EXP_LAUNCH_PROPERTY_ID_CLUSTER_DIMENSION: {
-      auto &attr = launch_attribute.emplace_back();
-      attr.id = CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION;
+
+      launch_attribute[i].id = CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION;
       // Note that cuda orders from right to left wrt SYCL dimensional order.
       if (workDim == 3) {
-        attr.value.clusterDim.x = launchPropList[i].value.clusterDim[2];
-        attr.value.clusterDim.y = launchPropList[i].value.clusterDim[1];
-        attr.value.clusterDim.z = launchPropList[i].value.clusterDim[0];
+        launch_attribute[i].value.clusterDim.x =
+            launchPropList[i].value.clusterDim[2];
+        launch_attribute[i].value.clusterDim.y =
+            launchPropList[i].value.clusterDim[1];
+        launch_attribute[i].value.clusterDim.z =
+            launchPropList[i].value.clusterDim[0];
       } else if (workDim == 2) {
-        attr.value.clusterDim.x = launchPropList[i].value.clusterDim[1];
-        attr.value.clusterDim.y = launchPropList[i].value.clusterDim[0];
-        attr.value.clusterDim.z = launchPropList[i].value.clusterDim[2];
+        launch_attribute[i].value.clusterDim.x =
+            launchPropList[i].value.clusterDim[1];
+        launch_attribute[i].value.clusterDim.y =
+            launchPropList[i].value.clusterDim[0];
+        launch_attribute[i].value.clusterDim.z =
+            launchPropList[i].value.clusterDim[2];
       } else {
-        attr.value.clusterDim.x = launchPropList[i].value.clusterDim[0];
-        attr.value.clusterDim.y = launchPropList[i].value.clusterDim[1];
-        attr.value.clusterDim.z = launchPropList[i].value.clusterDim[2];
+        launch_attribute[i].value.clusterDim.x =
+            launchPropList[i].value.clusterDim[0];
+        launch_attribute[i].value.clusterDim.y =
+            launchPropList[i].value.clusterDim[1];
+        launch_attribute[i].value.clusterDim.z =
+            launchPropList[i].value.clusterDim[2];
       }
 
       UR_CHECK_ERROR(cuFuncSetAttribute(
@@ -629,12 +603,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
       break;
     }
     case UR_EXP_LAUNCH_PROPERTY_ID_COOPERATIVE: {
-      auto &attr = launch_attribute.emplace_back();
-      attr.id = CU_LAUNCH_ATTRIBUTE_COOPERATIVE;
-      attr.value.cooperative = launchPropList[i].value.cooperative;
-      break;
-    }
-    case UR_EXP_LAUNCH_PROPERTY_ID_WORK_GROUP_MEMORY: {
+      launch_attribute[i].id = CU_LAUNCH_ATTRIBUTE_COOPERATIVE;
+      launch_attribute[i].value.cooperative =
+          launchPropList[i].value.cooperative;
       break;
     }
     default: {
@@ -647,8 +618,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
   // using the standard UR_CHECK_ERROR
   if (ur_result_t Ret =
           setKernelParams(hQueue->getContext(), hQueue->Device, workDim,
-                          pGlobalWorkOffset, pGlobalWorkSize, pLocalWorkSize,
-                          hKernel, CuFunc, ThreadsPerBlock, BlocksPerGrid);
+                          nullptr, pGlobalWorkSize, pLocalWorkSize, hKernel,
+                          CuFunc, ThreadsPerBlock, BlocksPerGrid);
       Ret != UR_RESULT_SUCCESS)
     return Ret;
 
@@ -696,7 +667,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
     launch_config.sharedMemBytes = LocalSize;
     launch_config.hStream = CuStream;
     launch_config.attrs = &launch_attribute[0];
-    launch_config.numAttrs = launch_attribute.size();
+    launch_config.numAttrs = numPropsInLaunchPropList;
 
     UR_CHECK_ERROR(cuLaunchKernelEx(&launch_config, CuFunc,
                                     const_cast<void **>(ArgIndices.data()),
diff --git a/source/adapters/cuda/kernel.hpp b/source/adapters/cuda/kernel.hpp
index c3b9fae09a..7ad20a4f0e 100644
--- a/source/adapters/cuda/kernel.hpp
+++ b/source/adapters/cuda/kernel.hpp
@@ -65,8 +65,6 @@ struct ur_kernel_handle_t_ {
     args_size_t ParamSizes;
     args_index_t Indices;
     args_size_t OffsetPerIndex;
-    size_t WorkGroupMemory = 0;
-
     // A struct to keep track of memargs so that we can do dependency analysis
     // at urEnqueueKernelLaunch
     struct mem_obj_arg {
@@ -107,28 +105,22 @@ struct ur_kernel_handle_t_ {
       OffsetPerIndex[Index] = LocalSize;
     }
 
-    // maximum required alignment is the size of the largest vector type
-    static constexpr size_t MaxAlignment = sizeof(double) * 16;
+    void addLocalArg(size_t Index, size_t Size) {
+      size_t LocalOffset = this->getLocalSize();
+
+      // maximum required alignment is the size of the largest vector type
+      const size_t MaxAlignment = sizeof(double) * 16;
 
-    static size_t alignMemoryAllocation(size_t Size, size_t Offset) {
       // for arguments smaller than the maximum alignment simply align to the
       // size of the argument
       const size_t Alignment = std::min(MaxAlignment, Size);
 
       // align the argument
-      size_t AlignedLocalOffset = Offset;
-      size_t Pad = Offset % Alignment;
+      size_t AlignedLocalOffset = LocalOffset;
+      size_t Pad = LocalOffset % Alignment;
       if (Pad != 0) {
         AlignedLocalOffset += Alignment - Pad;
       }
-      return AlignedLocalOffset;
-    }
-
-    void addLocalArg(size_t Index, size_t Size) {
-      size_t LocalOffset = this->getLocalSize();
-
-      // align the argument
-      size_t AlignedLocalOffset = alignMemoryAllocation(Size, LocalOffset);
 
       addArg(Index, sizeof(size_t), (const void *)&(AlignedLocalOffset),
              Size + (AlignedLocalOffset - LocalOffset));
@@ -148,24 +140,6 @@ struct ur_kernel_handle_t_ {
       MemObjArgs.push_back(arguments::mem_obj_arg{hMem, Index, Flags});
     }
 
-    void setWorkGroupMemory(size_t MemSize) {
-      assert(WorkGroupMemory == 0 &&
-             "Work Group Memory size can only be set once");
-      // Ensure first offset is MaxAlignment aligned
-      WorkGroupMemory = alignMemoryAllocation(MaxAlignment, MemSize);
-
-      // Adjust local accessor setting
-      // the dynamic memory will start at offset 0 (allows us to keep accessing
-      // local memory as a GV) and accessors will use the rest of the range
-      for (size_t i = 0; i < OffsetPerIndex.size(); i++) {
-        // if offset is 0, it is not a local accessor argument.
-        if (!OffsetPerIndex[i])
-          continue;
-        assert(ParamSizes[i] == sizeof(size_t) && "Offset should be a size_t");
-        *reinterpret_cast<size_t *>(Indices[i]) += WorkGroupMemory;
-      }
-    }
-
     void setImplicitOffset(size_t Size, std::uint32_t *ImplicitOffset) {
       assert(Size == sizeof(std::uint32_t) * 3);
       std::memcpy(ImplicitOffsetArgs, ImplicitOffset, Size);
@@ -173,15 +147,13 @@ struct ur_kernel_handle_t_ {
 
     void clearLocalSize() {
       std::fill(std::begin(OffsetPerIndex), std::end(OffsetPerIndex), 0);
-      WorkGroupMemory = 0;
     }
 
     const args_index_t &getIndices() const noexcept { return Indices; }
 
     uint32_t getLocalSize() const {
       return std::accumulate(std::begin(OffsetPerIndex),
-                             std::end(OffsetPerIndex), 0) +
-             WorkGroupMemory;
+                             std::end(OffsetPerIndex), 0);
     }
   } Args;
 
@@ -266,7 +238,6 @@ struct ur_kernel_handle_t_ {
     return Args.getIndices();
   }
 
-  void setWorkGroupMemory(size_t MemSize) { Args.setWorkGroupMemory(MemSize); }
   uint32_t getLocalSize() const noexcept { return Args.getLocalSize(); }
 
   void clearLocalSize() { Args.clearLocalSize(); }
diff --git a/source/adapters/level_zero/queue.cpp b/source/adapters/level_zero/queue.cpp
index 95c8d026a7..c4598f3472 100644
--- a/source/adapters/level_zero/queue.cpp
+++ b/source/adapters/level_zero/queue.cpp
@@ -902,15 +902,14 @@ ur_result_t urQueueFlush(
 
 ur_result_t urEnqueueKernelLaunchCustomExp(
     ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim,
-    const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize,
-    const size_t *pLocalWorkSize, uint32_t numPropsInLaunchPropList,
+    const size_t *pGlobalWorkSize, const size_t *pLocalWorkSize,
+    uint32_t numPropsInLaunchPropList,
     const ur_exp_launch_property_t *launchPropList,
     uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList,
     ur_event_handle_t *phEvent) {
   std::ignore = hQueue;
   std::ignore = hKernel;
   std::ignore = workDim;
-  std::ignore = pGlobalWorkOffset;
   std::ignore = pGlobalWorkSize;
   std::ignore = pLocalWorkSize;
   std::ignore = numPropsInLaunchPropList;
diff --git a/source/adapters/level_zero/ur_interface_loader.hpp b/source/adapters/level_zero/ur_interface_loader.hpp
index 0832303b50..1215d6449e 100644
--- a/source/adapters/level_zero/ur_interface_loader.hpp
+++ b/source/adapters/level_zero/ur_interface_loader.hpp
@@ -694,8 +694,8 @@ ur_result_t urEnqueueTimestampRecordingExp(
     const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent);
 ur_result_t urEnqueueKernelLaunchCustomExp(
     ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim,
-    const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize,
-    const size_t *pLocalWorkSize, uint32_t numPropsInLaunchPropList,
+    const size_t *pGlobalWorkSize, const size_t *pLocalWorkSize,
+    uint32_t numPropsInLaunchPropList,
     const ur_exp_launch_property_t *launchPropList,
     uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList,
     ur_event_handle_t *phEvent);
diff --git a/source/adapters/level_zero/v2/queue_api.cpp b/source/adapters/level_zero/v2/queue_api.cpp
index e4659b5f2c..b7b45625a2 100644
--- a/source/adapters/level_zero/v2/queue_api.cpp
+++ b/source/adapters/level_zero/v2/queue_api.cpp
@@ -391,13 +391,13 @@ ur_result_t urEnqueueTimestampRecordingExp(
 }
 ur_result_t urEnqueueKernelLaunchCustomExp(
     ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim,
-    const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize,
-    const size_t *pLocalWorkSize, uint32_t numPropsInLaunchPropList,
+    const size_t *pGlobalWorkSize, const size_t *pLocalWorkSize,
+    uint32_t numPropsInLaunchPropList,
     const ur_exp_launch_property_t *launchPropList,
     uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList,
     ur_event_handle_t *phEvent) try {
   return hQueue->enqueueKernelLaunchCustomExp(
-      hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, pLocalWorkSize,
+      hKernel, workDim, pGlobalWorkSize, pLocalWorkSize,
       numPropsInLaunchPropList, launchPropList, numEventsInWaitList,
       phEventWaitList, phEvent);
 } catch (...) {
diff --git a/source/adapters/level_zero/v2/queue_api.hpp b/source/adapters/level_zero/v2/queue_api.hpp
index c59f084fc4..7cb039ccdd 100644
--- a/source/adapters/level_zero/v2/queue_api.hpp
+++ b/source/adapters/level_zero/v2/queue_api.hpp
@@ -144,9 +144,9 @@ struct ur_queue_handle_t_ {
                                                    const ur_event_handle_t *,
                                                    ur_event_handle_t *) = 0;
   virtual ur_result_t enqueueKernelLaunchCustomExp(
-      ur_kernel_handle_t, uint32_t, const size_t *, const size_t *,
-      const size_t *, uint32_t, const ur_exp_launch_property_t *, uint32_t,
-      const ur_event_handle_t *, ur_event_handle_t *) = 0;
+      ur_kernel_handle_t, uint32_t, const size_t *, const size_t *, uint32_t,
+      const ur_exp_launch_property_t *, uint32_t, const ur_event_handle_t *,
+      ur_event_handle_t *) = 0;
   virtual ur_result_t
   enqueueEventsWaitWithBarrierExt(const ur_exp_enqueue_ext_properties_t *,
                                   uint32_t, const ur_event_handle_t *,
diff --git a/source/adapters/level_zero/v2/queue_immediate_in_order.cpp b/source/adapters/level_zero/v2/queue_immediate_in_order.cpp
index 05e48c8740..519b0ffc1e 100644
--- a/source/adapters/level_zero/v2/queue_immediate_in_order.cpp
+++ b/source/adapters/level_zero/v2/queue_immediate_in_order.cpp
@@ -1069,15 +1069,13 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueTimestampRecordingExp(
 }
 
 ur_result_t ur_queue_immediate_in_order_t::enqueueKernelLaunchCustomExp(
-    ur_kernel_handle_t hKernel, uint32_t workDim,
-    const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize,
+    ur_kernel_handle_t hKernel, uint32_t workDim, const size_t *pGlobalWorkSize,
     const size_t *pLocalWorkSize, uint32_t numPropsInLaunchPropList,
     const ur_exp_launch_property_t *launchPropList,
     uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList,
     ur_event_handle_t *phEvent) {
   std::ignore = hKernel;
   std::ignore = workDim;
-  std::ignore = pGlobalWorkOffset;
   std::ignore = pGlobalWorkSize;
   std::ignore = pLocalWorkSize;
   std::ignore = numPropsInLaunchPropList;
diff --git a/source/adapters/level_zero/v2/queue_immediate_in_order.hpp b/source/adapters/level_zero/v2/queue_immediate_in_order.hpp
index bdd3009d63..33e060ded3 100644
--- a/source/adapters/level_zero/v2/queue_immediate_in_order.hpp
+++ b/source/adapters/level_zero/v2/queue_immediate_in_order.hpp
@@ -263,8 +263,8 @@ struct ur_queue_immediate_in_order_t : _ur_object, public ur_queue_handle_t_ {
                                ur_event_handle_t *phEvent) override;
   ur_result_t enqueueKernelLaunchCustomExp(
       ur_kernel_handle_t hKernel, uint32_t workDim,
-      const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize,
-      const size_t *pLocalWorkSize, uint32_t numPropsInLaunchPropList,
+      const size_t *pGlobalWorkSize, const size_t *pLocalWorkSize,
+      uint32_t numPropsInLaunchPropList,
       const ur_exp_launch_property_t *launchPropList,
       uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList,
       ur_event_handle_t *phEvent) override;
diff --git a/source/adapters/mock/ur_mockddi.cpp b/source/adapters/mock/ur_mockddi.cpp
index c8ce408756..42c342444d 100644
--- a/source/adapters/mock/ur_mockddi.cpp
+++ b/source/adapters/mock/ur_mockddi.cpp
@@ -10126,9 +10126,6 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
     uint32_t
         workDim, ///< [in] number of dimensions, from 1 to 3, to specify the global and
                  ///< work-group work-items
-    const size_t *
-        pGlobalWorkOffset, ///< [in] pointer to an array of workDim unsigned values that specify the
-    ///< offset used to calculate the global ID of a work-item
     const size_t *
         pGlobalWorkSize, ///< [in] pointer to an array of workDim unsigned values that specify the
     ///< number of global work-items in workDim that will execute the kernel
@@ -10156,17 +10153,11 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
     ur_result_t result = UR_RESULT_SUCCESS;
 
     ur_enqueue_kernel_launch_custom_exp_params_t params = {
-        &hQueue,
-        &hKernel,
-        &workDim,
-        &pGlobalWorkOffset,
-        &pGlobalWorkSize,
-        &pLocalWorkSize,
-        &numPropsInLaunchPropList,
-        &launchPropList,
-        &numEventsInWaitList,
-        &phEventWaitList,
-        &phEvent};
+        &hQueue,          &hKernel,
+        &workDim,         &pGlobalWorkSize,
+        &pLocalWorkSize,  &numPropsInLaunchPropList,
+        &launchPropList,  &numEventsInWaitList,
+        &phEventWaitList, &phEvent};
 
     auto beforeCallback = reinterpret_cast<ur_mock_callback_t>(
         mock::getCallbacks().get_before_callback(
@@ -10185,10 +10176,6 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
         result = replaceCallback(&params);
     } else {
 
-        // optional output handle
-        if (phEvent) {
-            *phEvent = mock::createDummyHandle<ur_event_handle_t>();
-        }
         result = UR_RESULT_SUCCESS;
     }
 
diff --git a/source/loader/layers/tracing/ur_trcddi.cpp b/source/loader/layers/tracing/ur_trcddi.cpp
index afd1411ae8..64489c39ac 100644
--- a/source/loader/layers/tracing/ur_trcddi.cpp
+++ b/source/loader/layers/tracing/ur_trcddi.cpp
@@ -8698,9 +8698,6 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
     uint32_t
         workDim, ///< [in] number of dimensions, from 1 to 3, to specify the global and
                  ///< work-group work-items
-    const size_t *
-        pGlobalWorkOffset, ///< [in] pointer to an array of workDim unsigned values that specify the
-    ///< offset used to calculate the global ID of a work-item
     const size_t *
         pGlobalWorkSize, ///< [in] pointer to an array of workDim unsigned values that specify the
     ///< number of global work-items in workDim that will execute the kernel
@@ -8733,17 +8730,11 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
     }
 
     ur_enqueue_kernel_launch_custom_exp_params_t params = {
-        &hQueue,
-        &hKernel,
-        &workDim,
-        &pGlobalWorkOffset,
-        &pGlobalWorkSize,
-        &pLocalWorkSize,
-        &numPropsInLaunchPropList,
-        &launchPropList,
-        &numEventsInWaitList,
-        &phEventWaitList,
-        &phEvent};
+        &hQueue,          &hKernel,
+        &workDim,         &pGlobalWorkSize,
+        &pLocalWorkSize,  &numPropsInLaunchPropList,
+        &launchPropList,  &numEventsInWaitList,
+        &phEventWaitList, &phEvent};
     uint64_t instance =
         getContext()->notify_begin(UR_FUNCTION_ENQUEUE_KERNEL_LAUNCH_CUSTOM_EXP,
                                    "urEnqueueKernelLaunchCustomExp", &params);
@@ -8752,9 +8743,9 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
     logger.info("   ---> urEnqueueKernelLaunchCustomExp\n");
 
     ur_result_t result = pfnKernelLaunchCustomExp(
-        hQueue, hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize,
-        pLocalWorkSize, numPropsInLaunchPropList, launchPropList,
-        numEventsInWaitList, phEventWaitList, phEvent);
+        hQueue, hKernel, workDim, pGlobalWorkSize, pLocalWorkSize,
+        numPropsInLaunchPropList, launchPropList, numEventsInWaitList,
+        phEventWaitList, phEvent);
 
     getContext()->notify_end(UR_FUNCTION_ENQUEUE_KERNEL_LAUNCH_CUSTOM_EXP,
                              "urEnqueueKernelLaunchCustomExp", &params, &result,
diff --git a/source/loader/layers/validation/ur_valddi.cpp b/source/loader/layers/validation/ur_valddi.cpp
index c2dcc7be6f..b3969de10f 100644
--- a/source/loader/layers/validation/ur_valddi.cpp
+++ b/source/loader/layers/validation/ur_valddi.cpp
@@ -9726,9 +9726,6 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
     uint32_t
         workDim, ///< [in] number of dimensions, from 1 to 3, to specify the global and
                  ///< work-group work-items
-    const size_t *
-        pGlobalWorkOffset, ///< [in] pointer to an array of workDim unsigned values that specify the
-    ///< offset used to calculate the global ID of a work-item
     const size_t *
         pGlobalWorkSize, ///< [in] pointer to an array of workDim unsigned values that specify the
     ///< number of global work-items in workDim that will execute the kernel
@@ -9769,10 +9766,6 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
             return UR_RESULT_ERROR_INVALID_NULL_HANDLE;
         }
 
-        if (NULL == pGlobalWorkOffset) {
-            return UR_RESULT_ERROR_INVALID_NULL_POINTER;
-        }
-
         if (NULL == pGlobalWorkSize) {
             return UR_RESULT_ERROR_INVALID_NULL_POINTER;
         }
@@ -9801,9 +9794,9 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
     }
 
     ur_result_t result = pfnKernelLaunchCustomExp(
-        hQueue, hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize,
-        pLocalWorkSize, numPropsInLaunchPropList, launchPropList,
-        numEventsInWaitList, phEventWaitList, phEvent);
+        hQueue, hKernel, workDim, pGlobalWorkSize, pLocalWorkSize,
+        numPropsInLaunchPropList, launchPropList, numEventsInWaitList,
+        phEventWaitList, phEvent);
 
     return result;
 }
diff --git a/source/loader/ur_ldrddi.cpp b/source/loader/ur_ldrddi.cpp
index 602b8f1a82..86a6ad95a0 100644
--- a/source/loader/ur_ldrddi.cpp
+++ b/source/loader/ur_ldrddi.cpp
@@ -8866,9 +8866,6 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
     uint32_t
         workDim, ///< [in] number of dimensions, from 1 to 3, to specify the global and
                  ///< work-group work-items
-    const size_t *
-        pGlobalWorkOffset, ///< [in] pointer to an array of workDim unsigned values that specify the
-    ///< offset used to calculate the global ID of a work-item
     const size_t *
         pGlobalWorkSize, ///< [in] pointer to an array of workDim unsigned values that specify the
     ///< number of global work-items in workDim that will execute the kernel
@@ -8911,35 +8908,11 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
     // convert loader handle to platform handle
     hKernel = reinterpret_cast<ur_kernel_object_t *>(hKernel)->handle;
 
-    // convert loader handles to platform handles
-    auto phEventWaitListLocal =
-        std::vector<ur_event_handle_t>(numEventsInWaitList);
-    for (size_t i = 0; i < numEventsInWaitList; ++i) {
-        phEventWaitListLocal[i] =
-            reinterpret_cast<ur_event_object_t *>(phEventWaitList[i])->handle;
-    }
-
     // forward to device-platform
-    result = pfnKernelLaunchCustomExp(
-        hQueue, hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize,
-        pLocalWorkSize, numPropsInLaunchPropList, launchPropList,
-        numEventsInWaitList, phEventWaitListLocal.data(), phEvent);
-
-    // In the event of ERROR_ADAPTER_SPECIFIC we should still attempt to wrap any output handles below.
-    if (UR_RESULT_SUCCESS != result &&
-        UR_RESULT_ERROR_ADAPTER_SPECIFIC != result) {
-        return result;
-    }
-    try {
-        // convert platform handle to loader handle
-        if (nullptr != phEvent) {
-            *phEvent = reinterpret_cast<ur_event_handle_t>(
-                context->factories.ur_event_factory.getInstance(*phEvent,
-                                                                dditable));
-        }
-    } catch (std::bad_alloc &) {
-        result = UR_RESULT_ERROR_OUT_OF_HOST_MEMORY;
-    }
+    result = pfnKernelLaunchCustomExp(hQueue, hKernel, workDim, pGlobalWorkSize,
+                                      pLocalWorkSize, numPropsInLaunchPropList,
+                                      launchPropList, numEventsInWaitList,
+                                      phEventWaitList, phEvent);
 
     return result;
 }
diff --git a/source/loader/ur_libapi.cpp b/source/loader/ur_libapi.cpp
index b2e26a8b8b..3340363737 100644
--- a/source/loader/ur_libapi.cpp
+++ b/source/loader/ur_libapi.cpp
@@ -8992,7 +8992,6 @@ ur_result_t UR_APICALL urEnqueueTimestampRecordingExp(
 ///         + NULL == hQueue
 ///         + NULL == hKernel
 ///     - ::UR_RESULT_ERROR_INVALID_NULL_POINTER
-///         + `NULL == pGlobalWorkOffset`
 ///         + `NULL == pGlobalWorkSize`
 ///         + `NULL == launchPropList`
 ///         + NULL == pGlobalWorkSize
@@ -9021,9 +9020,6 @@ ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
     uint32_t
         workDim, ///< [in] number of dimensions, from 1 to 3, to specify the global and
                  ///< work-group work-items
-    const size_t *
-        pGlobalWorkOffset, ///< [in] pointer to an array of workDim unsigned values that specify the
-    ///< offset used to calculate the global ID of a work-item
     const size_t *
         pGlobalWorkSize, ///< [in] pointer to an array of workDim unsigned values that specify the
     ///< number of global work-items in workDim that will execute the kernel
@@ -9054,10 +9050,10 @@ ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
         return UR_RESULT_ERROR_UNINITIALIZED;
     }
 
-    return pfnKernelLaunchCustomExp(
-        hQueue, hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize,
-        pLocalWorkSize, numPropsInLaunchPropList, launchPropList,
-        numEventsInWaitList, phEventWaitList, phEvent);
+    return pfnKernelLaunchCustomExp(hQueue, hKernel, workDim, pGlobalWorkSize,
+                                    pLocalWorkSize, numPropsInLaunchPropList,
+                                    launchPropList, numEventsInWaitList,
+                                    phEventWaitList, phEvent);
 } catch (...) {
     return exceptionToResult(std::current_exception());
 }
diff --git a/source/ur_api.cpp b/source/ur_api.cpp
index 0b2e6a0f74..853d61472e 100644
--- a/source/ur_api.cpp
+++ b/source/ur_api.cpp
@@ -7625,7 +7625,6 @@ ur_result_t UR_APICALL urEnqueueTimestampRecordingExp(
 ///         + NULL == hQueue
 ///         + NULL == hKernel
 ///     - ::UR_RESULT_ERROR_INVALID_NULL_POINTER
-///         + `NULL == pGlobalWorkOffset`
 ///         + `NULL == pGlobalWorkSize`
 ///         + `NULL == launchPropList`
 ///         + NULL == pGlobalWorkSize
@@ -7654,9 +7653,6 @@ ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
     uint32_t
         workDim, ///< [in] number of dimensions, from 1 to 3, to specify the global and
                  ///< work-group work-items
-    const size_t *
-        pGlobalWorkOffset, ///< [in] pointer to an array of workDim unsigned values that specify the
-    ///< offset used to calculate the global ID of a work-item
     const size_t *
         pGlobalWorkSize, ///< [in] pointer to an array of workDim unsigned values that specify the
     ///< number of global work-items in workDim that will execute the kernel
diff --git a/test/conformance/exp_launch_properties/launch_properties.cpp b/test/conformance/exp_launch_properties/launch_properties.cpp
index 23ba56ff4b..a54a44ecaf 100644
--- a/test/conformance/exp_launch_properties/launch_properties.cpp
+++ b/test/conformance/exp_launch_properties/launch_properties.cpp
@@ -95,8 +95,8 @@ TEST_P(urEnqueueKernelLaunchCustomTest, Success) {
     AddPodArg(val);
 
     ASSERT_SUCCESS(urEnqueueKernelLaunchCustomExp(
-        queue, kernel, n_dimensions, &global_offset, &global_size, nullptr, 1,
-        &props[0], 0, nullptr, nullptr));
+        queue, kernel, n_dimensions, &global_size, nullptr, 1, &props[0], 0,
+        nullptr, nullptr));
     ASSERT_SUCCESS(urQueueFinish(queue));
     ValidateBuffer(buffer, sizeof(val) * global_size, val);
 }

From 252b3822f0e81ca2fff3b786ee9b8294bd88cad6 Mon Sep 17 00:00:00 2001
From: Piotr Balcer <piotr.balcer@intel.com>
Date: Wed, 27 Nov 2024 15:33:49 +0100
Subject: [PATCH 25/36] fix event caching

L0 requires the adapter to use a variety of different events, depending
on the specific use case and configuration. Events are also unique for
devices.
And, because the adapter wants to avoid unnecessarily allocating new
events from the driver, this necessities an event caching solution that
can separate the different event type and device combinations.

When counter-based events were introduced the event caching was
not properly expanded to take that event type into consideration,
presumably with the assumption that normal and counter based events
will never coexist. Unfortunately that is not true for the current
adapter implementation.

This patch simplifies the event caching logic, ensuring that
each unique event type and device combination has its own
event cache.
---
 source/adapters/level_zero/context.cpp | 27 +++++++---
 source/adapters/level_zero/context.hpp | 72 +++++++++++++-------------
 2 files changed, 56 insertions(+), 43 deletions(-)

diff --git a/source/adapters/level_zero/context.cpp b/source/adapters/level_zero/context.cpp
index 7c1c412ee4..4fd1db0933 100644
--- a/source/adapters/level_zero/context.cpp
+++ b/source/adapters/level_zero/context.cpp
@@ -565,18 +565,26 @@ ur_event_handle_t ur_context_handle_t_::getEventFromContextCache(
     bool HostVisible, bool WithProfiling, ur_device_handle_t Device,
     bool CounterBasedEventEnabled) {
   std::scoped_lock<ur_mutex> Lock(EventCacheMutex);
-  auto Cache = getEventCache(HostVisible, WithProfiling, Device);
-  if (Cache->empty())
+  auto Cache = getEventCache(HostVisible, WithProfiling, Device,
+                             CounterBasedEventEnabled);
+  if (Cache->empty()) {
+    logger::info("Cache empty (Host Visible: {}, Profiling: {}, Counter: {}, "
+                 "Device: {})",
+                 HostVisible, WithProfiling, CounterBasedEventEnabled, Device);
     return nullptr;
+  }
 
   auto It = Cache->begin();
   ur_event_handle_t Event = *It;
-  if (Event->CounterBasedEventsEnabled != CounterBasedEventEnabled) {
-    return nullptr;
-  }
   Cache->erase(It);
   // We have to reset event before using it.
   Event->reset();
+
+  logger::info("Using {} event (Host Visible: {}, Profiling: {}, Counter: {}, "
+               "Device: {}) from cache {}",
+               Event, Event->HostVisibleEvent, Event->isProfilingEnabled(),
+               Event->CounterBasedEventsEnabled, Device, Cache);
+
   return Event;
 }
 
@@ -588,8 +596,13 @@ void ur_context_handle_t_::addEventToContextCache(ur_event_handle_t Event) {
     Device = Event->UrQueue->Device;
   }
 
-  auto Cache = getEventCache(Event->isHostVisible(),
-                             Event->isProfilingEnabled(), Device);
+  auto Cache =
+      getEventCache(Event->isHostVisible(), Event->isProfilingEnabled(), Device,
+                    Event->CounterBasedEventsEnabled);
+  logger::info("Inserting {} event (Host Visible: {}, Profiling: {}, Counter: "
+               "{}, Device: {}) into cache {}",
+               Event, Event->HostVisibleEvent, Event->isProfilingEnabled(),
+               Event->CounterBasedEventsEnabled, Device, Cache);
   Cache->emplace_back(Event);
 }
 
diff --git a/source/adapters/level_zero/context.hpp b/source/adapters/level_zero/context.hpp
index 0d3b2846e2..470c4c4f35 100644
--- a/source/adapters/level_zero/context.hpp
+++ b/source/adapters/level_zero/context.hpp
@@ -169,15 +169,6 @@ struct ur_context_handle_t_ : _ur_object {
   // holding the current pool usage counts.
   ur_mutex ZeEventPoolCacheMutex;
 
-  // Mutex to control operations on event caches.
-  ur_mutex EventCacheMutex;
-
-  // Caches for events.
-  using EventCache = std::vector<std::list<ur_event_handle_t>>;
-  EventCache EventCaches{4};
-  std::vector<std::unordered_map<ur_device_handle_t, size_t>>
-      EventCachesDeviceMap{4};
-
   // Initialize the PI context.
   ur_result_t initialize();
 
@@ -313,36 +304,45 @@ struct ur_context_handle_t_ : _ur_object {
   ze_context_handle_t getZeHandle() const;
 
 private:
+  enum EventFlags {
+    EVENT_FLAG_HOST_VISIBLE = UR_BIT(0),
+    EVENT_FLAG_WITH_PROFILING = UR_BIT(1),
+    EVENT_FLAG_COUNTER = UR_BIT(2),
+    EVENT_FLAG_DEVICE = UR_BIT(3), // if set, subsequent bits are device id
+    MAX_EVENT_FLAG_BITS =
+        4, // this is used as an offset for embedding device id
+  };
+
+  // Mutex to control operations on event caches.
+  ur_mutex EventCacheMutex;
+
+  // Caches for events.
+  using EventCache = std::list<ur_event_handle_t>;
+  std::vector<EventCache> EventCaches;
+
   // Get the cache of events for a provided scope and profiling mode.
-  auto getEventCache(bool HostVisible, bool WithProfiling,
-                     ur_device_handle_t Device) {
+  EventCache *getEventCache(bool HostVisible, bool WithProfiling,
+                            ur_device_handle_t Device, bool Counter) {
+
+    size_t index = 0;
     if (HostVisible) {
-      if (Device) {
-        auto EventCachesMap =
-            WithProfiling ? &EventCachesDeviceMap[0] : &EventCachesDeviceMap[1];
-        if (EventCachesMap->find(Device) == EventCachesMap->end()) {
-          EventCaches.emplace_back();
-          EventCachesMap->insert(
-              std::make_pair(Device, EventCaches.size() - 1));
-        }
-        return &EventCaches[(*EventCachesMap)[Device]];
-      } else {
-        return WithProfiling ? &EventCaches[0] : &EventCaches[1];
-      }
-    } else {
-      if (Device) {
-        auto EventCachesMap =
-            WithProfiling ? &EventCachesDeviceMap[2] : &EventCachesDeviceMap[3];
-        if (EventCachesMap->find(Device) == EventCachesMap->end()) {
-          EventCaches.emplace_back();
-          EventCachesMap->insert(
-              std::make_pair(Device, EventCaches.size() - 1));
-        }
-        return &EventCaches[(*EventCachesMap)[Device]];
-      } else {
-        return WithProfiling ? &EventCaches[2] : &EventCaches[3];
-      }
+      index |= EVENT_FLAG_HOST_VISIBLE;
+    }
+    if (WithProfiling) {
+      index |= EVENT_FLAG_WITH_PROFILING;
     }
+    if (Counter) {
+      index |= EVENT_FLAG_COUNTER;
+    }
+    if (Device) {
+      index |= EVENT_FLAG_DEVICE | (*Device->Id << MAX_EVENT_FLAG_BITS);
+    }
+
+    if (index >= EventCaches.size()) {
+      EventCaches.resize(index + 1);
+    }
+
+    return &EventCaches[index];
   }
 };
 

From 105d4e079b8a41147f5a908c1c3236ba2bfbb875 Mon Sep 17 00:00:00 2001
From: Piotr Balcer <piotr.balcer@intel.com>
Date: Thu, 21 Nov 2024 14:48:13 +0100
Subject: [PATCH 26/36] add 3 more benchmarks

This patch implements support for:
 - dl-cifar
 - dl-mnist
 - svm

These are all workloads from Velocity Bench that require oneMKL.
---
 scripts/benchmarks/benches/base.py     |  18 ++--
 scripts/benchmarks/benches/llamacpp.py |  75 +--------------
 scripts/benchmarks/benches/oneapi.py   |  86 +++++++++++++++++
 scripts/benchmarks/benches/options.py  |   1 +
 scripts/benchmarks/benches/velocity.py | 122 ++++++++++++++++++++++++-
 scripts/benchmarks/main.py             |   1 +
 scripts/benchmarks/utils/utils.py      |   9 +-
 7 files changed, 224 insertions(+), 88 deletions(-)
 create mode 100644 scripts/benchmarks/benches/oneapi.py

diff --git a/scripts/benchmarks/benches/base.py b/scripts/benchmarks/benches/base.py
index feeaa568b6..4356fb0d96 100644
--- a/scripts/benchmarks/benches/base.py
+++ b/scripts/benchmarks/benches/base.py
@@ -40,19 +40,21 @@ def run_bench(self, command, env_vars, ld_library=[]):
             ld_library=ld_library
         ).stdout.decode()
 
-    def create_data_path(self, name):
-        data_path = os.path.join(self.directory, "data", name)
-
-        if options.rebuild and Path(data_path).exists():
-           shutil.rmtree(data_path)
+    def create_data_path(self, name, skip_data_dir = False):
+        if skip_data_dir:
+            data_path = os.path.join(self.directory, name)
+        else:
+            data_path = os.path.join(self.directory, 'data', name)
+            if options.rebuild and Path(data_path).exists():
+                shutil.rmtree(data_path)
 
         Path(data_path).mkdir(parents=True, exist_ok=True)
 
         return data_path
 
-    def download(self, name, url, file, untar = False):
-        self.data_path = self.create_data_path(name)
-        return download(self.data_path, url, file, True)
+    def download(self, name, url, file, untar = False, unzip = False, skip_data_dir = False):
+        self.data_path = self.create_data_path(name, skip_data_dir)
+        return download(self.data_path, url, file, untar, unzip)
 
     def name(self):
         raise NotImplementedError()
diff --git a/scripts/benchmarks/benches/llamacpp.py b/scripts/benchmarks/benches/llamacpp.py
index 50dd8d04c6..4a260a09cc 100644
--- a/scripts/benchmarks/benches/llamacpp.py
+++ b/scripts/benchmarks/benches/llamacpp.py
@@ -6,85 +6,14 @@
 import csv
 import io
 from pathlib import Path
-import re
-import shutil
 from utils.utils import download, git_clone
 from .base import Benchmark, Suite
 from .result import Result
 from utils.utils import run, create_build_path
 from .options import options
+from .oneapi import get_oneapi
 import os
 
-class OneAPI:
-    # random unique number for benchmark oneAPI installation
-    ONEAPI_BENCHMARK_INSTANCE_ID = 98765
-    def __init__(self, directory):
-        self.oneapi_dir = os.path.join(directory, 'oneapi')
-        Path(self.oneapi_dir).mkdir(parents=True, exist_ok=True)
-        # delete if some option is set?
-
-        # can we just hardcode these links?
-        self.install_package('dnnl', 'https://registrationcenter-download.intel.com/akdlm/IRC_NAS/87e117ab-039b-437d-9c80-dcd5c9e675d5/intel-onednn-2025.0.0.862_offline.sh')
-        self.install_package('mkl', 'https://registrationcenter-download.intel.com/akdlm/IRC_NAS/79153e0f-74d7-45af-b8c2-258941adf58a/intel-onemkl-2025.0.0.940_offline.sh')
-        return
-
-    def install_package(self, name, url):
-        package_path = os.path.join(self.oneapi_dir, name)
-        if Path(package_path).exists():
-            print(f"{package_path} exists, skipping installing oneAPI package {name}...")
-            return
-
-        package = download(self.oneapi_dir, url, f'package_{name}.sh')
-        try:
-            print(f"installing f{name}")
-            run(f"sh {package} -a -s --eula accept --install-dir {self.oneapi_dir} --instance f{self.ONEAPI_BENCHMARK_INSTANCE_ID}")
-        except:
-            print("oneAPI installation likely exists already")
-            return
-        print(f"f{name} installation complete")
-
-    def package_dir(self, package, dir):
-        return os.path.join(self.oneapi_dir, package, 'latest', dir)
-
-    def package_cmake(self, package):
-        package_lib = self.package_dir(package, 'lib')
-        return os.path.join(package_lib, 'cmake', package)
-
-    def mkl_lib(self):
-        return self.package_dir('mkl', 'lib')
-
-    def mkl_include(self):
-        return self.package_dir('mkl', 'include')
-
-    def mkl_cmake(self):
-        return self.package_cmake('mkl')
-
-    def dnn_lib(self):
-        return self.package_dir('dnnl', 'lib')
-
-    def dnn_include(self):
-        return self.package_dir('dnnl', 'include')
-
-    def dnn_cmake(self):
-        return self.package_cmake('dnnl')
-
-    def tbb_lib(self):
-        return self.package_dir('tbb', 'lib')
-
-    def tbb_cmake(self):
-        return self.package_cmake('tbb')
-
-    def compiler_lib(self):
-        return self.package_dir('compiler', 'lib')
-
-    def ld_libraries(self):
-        return [
-            self.compiler_lib(),
-            self.mkl_lib(),
-            self.tbb_lib(),
-            self.dnn_lib()
-        ]
-
 class LlamaCppBench(Suite):
     def __init__(self, directory):
         if options.sycl is None:
@@ -103,7 +32,7 @@ def setup(self):
 
         self.model = download(self.models_dir, "https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-gguf/resolve/main/Phi-3-mini-4k-instruct-q4.gguf", "Phi-3-mini-4k-instruct-q4.gguf")
 
-        self.oneapi = OneAPI(self.directory)
+        self.oneapi = get_oneapi()
 
         self.build_path = create_build_path(self.directory, 'llamacpp-build')
 
diff --git a/scripts/benchmarks/benches/oneapi.py b/scripts/benchmarks/benches/oneapi.py
new file mode 100644
index 0000000000..414c4aa64a
--- /dev/null
+++ b/scripts/benchmarks/benches/oneapi.py
@@ -0,0 +1,86 @@
+# Copyright (C) 2024 Intel Corporation
+# Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions.
+# See LICENSE.TXT
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+from pathlib import Path
+from utils.utils import download, run
+from .options import options
+import os
+
+class OneAPI:
+    # random unique number for benchmark oneAPI installation
+    ONEAPI_BENCHMARK_INSTANCE_ID = 98765
+    def __init__(self):
+        self.oneapi_dir = os.path.join(options.workdir, 'oneapi')
+        Path(self.oneapi_dir).mkdir(parents=True, exist_ok=True)
+        # delete if some option is set?
+
+        # can we just hardcode these links?
+        self.install_package('dnnl', 'https://registrationcenter-download.intel.com/akdlm/IRC_NAS/87e117ab-039b-437d-9c80-dcd5c9e675d5/intel-onednn-2025.0.0.862_offline.sh')
+        self.install_package('mkl', 'https://registrationcenter-download.intel.com/akdlm/IRC_NAS/79153e0f-74d7-45af-b8c2-258941adf58a/intel-onemkl-2025.0.0.940_offline.sh')
+        return
+
+    def install_package(self, name, url):
+        package_path = os.path.join(self.oneapi_dir, name)
+        if Path(package_path).exists():
+            print(f"{package_path} exists, skipping installing oneAPI package {name}...")
+            return
+
+        package = download(self.oneapi_dir, url, f'package_{name}.sh')
+        try:
+            print(f"installing f{name}")
+            run(f"sh {package} -a -s --eula accept --install-dir {self.oneapi_dir} --instance f{self.ONEAPI_BENCHMARK_INSTANCE_ID}")
+        except:
+            print("oneAPI installation likely exists already")
+            return
+        print(f"f{name} installation complete")
+
+    def package_dir(self, package, dir):
+        return os.path.join(self.oneapi_dir, package, 'latest', dir)
+
+    def package_cmake(self, package):
+        package_lib = self.package_dir(package, 'lib')
+        return os.path.join(package_lib, 'cmake', package)
+
+    def mkl_lib(self):
+        return self.package_dir('mkl', 'lib')
+
+    def mkl_include(self):
+        return self.package_dir('mkl', 'include')
+
+    def mkl_cmake(self):
+        return self.package_cmake('mkl')
+
+    def dnn_lib(self):
+        return self.package_dir('dnnl', 'lib')
+
+    def dnn_include(self):
+        return self.package_dir('dnnl', 'include')
+
+    def dnn_cmake(self):
+        return self.package_cmake('dnnl')
+
+    def tbb_lib(self):
+        return self.package_dir('tbb', 'lib')
+
+    def tbb_cmake(self):
+        return self.package_cmake('tbb')
+
+    def compiler_lib(self):
+        return self.package_dir('compiler', 'lib')
+
+    def ld_libraries(self):
+        return [
+            self.compiler_lib(),
+            self.mkl_lib(),
+            self.tbb_lib(),
+            self.dnn_lib()
+        ]
+
+oneapi_instance = None
+
+def get_oneapi() -> OneAPI: # oneAPI singleton
+    if not hasattr(get_oneapi, "instance"):
+        get_oneapi.instance = OneAPI()
+    return get_oneapi.instance
diff --git a/scripts/benchmarks/benches/options.py b/scripts/benchmarks/benches/options.py
index 5997cdedb8..03b0db7128 100644
--- a/scripts/benchmarks/benches/options.py
+++ b/scripts/benchmarks/benches/options.py
@@ -8,6 +8,7 @@ class Compare(Enum):
 
 @dataclass
 class Options:
+    workdir: str = None
     sycl: str = None
     ur: str = None
     ur_adapter: str = None
diff --git a/scripts/benchmarks/benches/velocity.py b/scripts/benchmarks/benches/velocity.py
index 605cf03fd4..705421d963 100644
--- a/scripts/benchmarks/benches/velocity.py
+++ b/scripts/benchmarks/benches/velocity.py
@@ -10,6 +10,9 @@
 from .result import Result
 from utils.utils import run, create_build_path
 from .options import options
+from .oneapi import get_oneapi
+import shutil
+
 import os
 
 class VelocityBench(Suite):
@@ -35,7 +38,10 @@ def benchmarks(self) -> list[Benchmark]:
             CudaSift(self),
             Easywave(self),
             QuickSilver(self),
-            SobelFilter(self)
+            SobelFilter(self),
+            DLCifar(self),
+            DLMnist(self),
+            SVM(self)
         ]
 
 class VelocityBase(Benchmark):
@@ -50,6 +56,12 @@ def __init__(self, name: str, bin_name: str, vb: VelocityBench, unit: str):
     def download_deps(self):
         return
 
+    def extra_cmake_args(self) -> list[str]:
+        return []
+
+    def ld_libraries(self) -> list[str]:
+        return []
+
     def setup(self):
         self.download_deps()
         self.benchmark_bin = os.path.join(self.directory, self.bench_name, self.bin_name)
@@ -62,8 +74,10 @@ def setup(self):
             f"-S {self.code_path}",
             f"-DCMAKE_BUILD_TYPE=Release"
         ]
+        configure_command += self.extra_cmake_args()
+
         run(configure_command, {'CC': 'clang', 'CXX':'clang++'}, add_sycl=True)
-        run(f"cmake --build {build_path} -j", add_sycl=True)
+        run(f"cmake --build {build_path} -j", add_sycl=True, ld_library=self.ld_libraries())
 
     def bin_args(self) -> list[str]:
         return []
@@ -82,7 +96,7 @@ def run(self, env_vars) -> list[Result]:
         ]
         command += self.bin_args()
 
-        result = self.run_bench(command, env_vars)
+        result = self.run_bench(command, env_vars, ld_library=self.ld_libraries())
 
         return [ Result(label=self.name(), value=self.parse_output(result), command=command, env=env_vars, stdout=result, unit=self.unit) ]
 
@@ -136,7 +150,6 @@ def __init__(self, vb: VelocityBench):
 
     def download_deps(self):
         self.download("sobel_filter", "https://github.com/oneapi-src/Velocity-Bench/raw/main/sobel_filter/res/sobel_filter_data.tgz?download=", "sobel_filter_data.tgz", untar=True)
-        return
 
     def name(self):
         return "Velocity-Bench Sobel Filter"
@@ -228,7 +241,6 @@ def get_last_elapsed_time(self, log_file_path) -> float:
     def parse_output(self, stdout: str) -> float:
         return self.get_last_elapsed_time(os.path.join(options.benchmark_cwd, "easywave.log"))
 
-
 class CudaSift(VelocityBase):
     def __init__(self, vb: VelocityBench):
         super().__init__("cudaSift", "cudaSift", vb, "ms")
@@ -248,3 +260,103 @@ def parse_output(self, stdout: str) -> float:
             return float(match.group(1))
         else:
             raise ValueError("Failed to parse benchmark output.")
+
+class DLCifar(VelocityBase):
+    def __init__(self, vb: VelocityBench):
+        self.oneapi = get_oneapi()
+        super().__init__("dl-cifar", "dl-cifar_sycl", vb, "s")
+
+    def ld_libraries(self):
+        return self.oneapi.ld_libraries()
+
+    def download_deps(self):
+        # TODO: dl-cifar hardcodes the path to this dataset as "../../datasets/cifar-10-binary"...
+        self.download("datasets", "https://www.cs.toronto.edu/~kriz/cifar-10-binary.tar.gz", "cifar-10-binary.tar.gz", untar=True, skip_data_dir=True)
+        return
+
+    def extra_cmake_args(self):
+        return [
+            f"-DCMAKE_CXX_FLAGS=-O3 -fsycl -ffast-math -I{self.oneapi.dnn_include()} -I{self.oneapi.mkl_include()} -L{self.oneapi.dnn_lib()} -L{self.oneapi.mkl_lib()}"
+        ]
+
+    def name(self):
+        return "Velocity-Bench dl-cifar"
+
+    def parse_output(self, stdout: str) -> float:
+        match = re.search(r'dl-cifar - total time for whole calculation: (\d+\.\d+) s', stdout)
+        if match:
+            return float(match.group(1))
+        else:
+            raise ValueError("Failed to parse benchmark output.")
+
+class DLMnist(VelocityBase):
+    def __init__(self, vb: VelocityBench):
+        self.oneapi = get_oneapi()
+        super().__init__("dl-mnist", "dl-mnist-sycl", vb, "s")
+
+    def ld_libraries(self):
+        return self.oneapi.ld_libraries()
+
+    def download_deps(self):
+        # TODO: dl-mnist hardcodes the path to this dataset as "../../datasets/"...
+        self.download("datasets", "https://raw.githubusercontent.com/fgnt/mnist/master/train-images-idx3-ubyte.gz", "train-images.idx3-ubyte.gz", unzip=True, skip_data_dir=True)
+        self.download("datasets", "https://raw.githubusercontent.com/fgnt/mnist/master/train-labels-idx1-ubyte.gz", "train-labels.idx1-ubyte.gz", unzip=True, skip_data_dir=True)
+        self.download("datasets", "https://raw.githubusercontent.com/fgnt/mnist/master/t10k-images-idx3-ubyte.gz", "t10k-images.idx3-ubyte.gz", unzip=True, skip_data_dir=True)
+        self.download("datasets", "https://raw.githubusercontent.com/fgnt/mnist/master/t10k-labels-idx1-ubyte.gz", "t10k-labels.idx1-ubyte.gz", unzip=True, skip_data_dir=True)
+
+    def extra_cmake_args(self):
+        return [
+            f"-DCMAKE_CXX_FLAGS=-O3 -fsycl -ffast-math -I{self.oneapi.dnn_include()} -I{self.oneapi.mkl_include()} -L{self.oneapi.dnn_lib()} -L{self.oneapi.mkl_lib()}"
+        ]
+
+    def name(self):
+        return "Velocity-Bench dl-mnist"
+
+    def bin_args(self):
+        return [
+            "-conv_algo", "ONEDNN_AUTO"
+        ]
+
+    # TODO: This shouldn't be required.
+    # The application crashes with a segfault without it.
+    def extra_env_vars(self):
+        return {
+            "NEOReadDebugKeys":"1",
+            "DisableScratchPages":"0",
+        }
+
+    def parse_output(self, stdout: str) -> float:
+        match = re.search(r'dl-mnist - total time for whole calculation: (\d+\.\d+) s', stdout)
+        if match:
+            return float(match.group(1))
+        else:
+            raise ValueError("Failed to parse benchmark output.")
+
+class SVM(VelocityBase):
+    def __init__(self, vb: VelocityBench):
+        self.oneapi = get_oneapi()
+        super().__init__("svm", "svm_sycl", vb, "s")
+
+    def ld_libraries(self):
+        return self.oneapi.ld_libraries()
+
+    def extra_cmake_args(self):
+        return [
+            f"-DCMAKE_CXX_FLAGS=-O3 -fsycl -ffast-math -I{self.oneapi.dnn_include()} -I{self.oneapi.mkl_include()} -L{self.oneapi.dnn_lib()} -L{self.oneapi.mkl_lib()}"
+        ]
+
+    def name(self):
+        return "Velocity-Bench svm"
+
+    def bin_args(self):
+        return [
+            f"{self.code_path}/a9a",
+            f"{self.code_path}/a.m",
+        ]
+
+    def parse_output(self, stdout: str) -> float:
+        match = re.search(r'Total      elapsed time : (\d+\.\d+) s', stdout)
+        if match:
+            return float(match.group(1))
+        else:
+            raise ValueError("Failed to parse benchmark output.")
diff --git a/scripts/benchmarks/main.py b/scripts/benchmarks/main.py
index c83825c9e5..bca0f01553 100755
--- a/scripts/benchmarks/main.py
+++ b/scripts/benchmarks/main.py
@@ -179,6 +179,7 @@ def validate_and_parse_env_args(env_args):
     args = parser.parse_args()
     additional_env_vars = validate_and_parse_env_args(args.env)
 
+    options.workdir = args.benchmark_directory
     options.verbose = args.verbose
     options.rebuild = not args.no_rebuild
     options.sycl = args.sycl
diff --git a/scripts/benchmarks/utils/utils.py b/scripts/benchmarks/utils/utils.py
index d077184e5c..0bb954fab2 100644
--- a/scripts/benchmarks/utils/utils.py
+++ b/scripts/benchmarks/utils/utils.py
@@ -3,6 +3,7 @@
 # See LICENSE.TXT
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
+import gzip
 import os
 import shutil
 import subprocess
@@ -58,7 +59,7 @@ def git_clone(dir, name, repo, commit):
     return repo_path
 
 def prepare_bench_cwd(dir):
-    # we need 2 deep to workaround a problem with a fixed relative path in cudaSift
+    # we need 2 deep to workaround a problem with a fixed relative paths in some velocity benchmarks
     options.benchmark_cwd = os.path.join(dir, 'bcwd', 'bcwd')
     if os.path.exists(options.benchmark_cwd):
         shutil.rmtree(options.benchmark_cwd)
@@ -97,7 +98,7 @@ def create_build_path(directory, name):
 
     return build_path
 
-def download(dir, url, file, untar = False):
+def download(dir, url, file, untar = False, unzip = False):
     data_file = os.path.join(dir, file)
     if not Path(data_file).exists():
         print(f"{data_file} does not exist, downloading")
@@ -106,6 +107,10 @@ def download(dir, url, file, untar = False):
             file = tarfile.open(data_file)
             file.extractall(dir)
             file.close()
+        if unzip:
+            [stripped_gz, _] = os.path.splitext(data_file)
+            with gzip.open(data_file, 'rb') as f_in, open(stripped_gz, 'wb') as f_out:
+                shutil.copyfileobj(f_in, f_out)
     else:
         print(f"{data_file} exists, skipping...")
     return data_file

From 008c998d0d03063b53c5d078b4074e12fc7b138f Mon Sep 17 00:00:00 2001
From: Ross Brunton <ross@codeplay.com>
Date: Mon, 2 Dec 2024 12:16:10 +0000
Subject: [PATCH 27/36] Add sanitizer ignorelist for cfi

It seems that, at least if LLVM is to be beleived, this failure is due
to something in libc++. This change adds an ignorelist that ignores said
function.
---
 cmake/helpers.cmake                                | 14 ++++++++++----
 sanitizer-ignorelist.txt                           |  6 ++++++
 .../exp_command_buffer_adapter_cuda.match          | 11 -----------
 .../exp_command_buffer_adapter_hip.match           | 10 ----------
 4 files changed, 16 insertions(+), 25 deletions(-)
 create mode 100644 sanitizer-ignorelist.txt
 delete mode 100644 test/conformance/exp_command_buffer/exp_command_buffer_adapter_cuda.match
 delete mode 100644 test/conformance/exp_command_buffer/exp_command_buffer_adapter_hip.match

diff --git a/cmake/helpers.cmake b/cmake/helpers.cmake
index 8cc5353609..c0fd7ab90c 100644
--- a/cmake/helpers.cmake
+++ b/cmake/helpers.cmake
@@ -79,6 +79,13 @@ else()
     set(CXX_HAS_CFI_SANITIZE OFF)
 endif()
 
+set(CFI_FLAGS "")
+if (CFI_HAS_CFI_SANITIZE)
+    # cfi-icall requires called functions in shared libraries to also be built with cfi-icall, which we can't
+    # guarantee. -fsanitize=cfi depends on -flto
+    set(CFI_FLAGS "-flto -fsanitize=cfi -fno-sanitize=cfi-icall -fsanitize-ignorelist=${CMAKE_SOURCE_DIR}/sanitizer-ignorelist.txt")
+endif()
+
 function(add_ur_target_compile_options name)
     if(NOT MSVC)
         target_compile_definitions(${name} PRIVATE -D_FORTIFY_SOURCE=2)
@@ -95,9 +102,8 @@ function(add_ur_target_compile_options name)
             -fPIC
             -fstack-protector-strong
             -fvisibility=hidden
-            # cfi-icall requires called functions in shared libraries to also be built with cfi-icall, which we can't
-            # guarantee. -fsanitize=cfi depends on -flto
-            $<$<BOOL:${CXX_HAS_CFI_SANITIZE}>:-flto -fsanitize=cfi -fno-sanitize=cfi-icall>
+
+            ${CFI_FLAGS}
             $<$<BOOL:${CXX_HAS_FCF_PROTECTION_FULL}>:-fcf-protection=full>
             $<$<BOOL:${CXX_HAS_FSTACK_CLASH_PROTECTION}>:-fstack-clash-protection>
 
@@ -135,7 +141,7 @@ function(add_ur_target_link_options name)
     if(NOT MSVC)
         if (NOT APPLE)
             target_link_options(${name} PRIVATE
-                $<$<BOOL:${CXX_HAS_CFI_SANITIZE}>:-flto -fsanitize=cfi -fno-sanitize=cfi-icall>
+                ${CFI_FLAGS}
                 "LINKER:-z,relro,-z,now,-z,noexecstack"
             )
             if (UR_DEVELOPER_MODE)
diff --git a/sanitizer-ignorelist.txt b/sanitizer-ignorelist.txt
new file mode 100644
index 0000000000..85e8adc38d
--- /dev/null
+++ b/sanitizer-ignorelist.txt
@@ -0,0 +1,6 @@
+[cfi-unrelated-cast]
+# std::_Sp_counted_ptr_inplace::_Sp_counted_ptr_inplace() (libstdc++).
+# This ctor is used by std::make_shared and needs to cast to uninitialized T*
+# in order to call std::allocator_traits<T>::construct.
+# See: https://github.com/llvm/llvm-project/blob/main/compiler-rt/lib/cfi/cfi_ignorelist.txt
+fun:_ZNSt23_Sp_counted_ptr_inplace*
diff --git a/test/conformance/exp_command_buffer/exp_command_buffer_adapter_cuda.match b/test/conformance/exp_command_buffer/exp_command_buffer_adapter_cuda.match
deleted file mode 100644
index 40182b9125..0000000000
--- a/test/conformance/exp_command_buffer/exp_command_buffer_adapter_cuda.match
+++ /dev/null
@@ -1,11 +0,0 @@
-# Note: This file is only for use with cts_exe.py
-# These cause SIGILL when built with -fsanitize=cfi on Nvidia
-{{OPT}}urCommandBufferKernelHandleUpdateTest.Success/*
-{{OPT}}urCommandBufferKernelHandleUpdateTest.UpdateAgain/*
-{{OPT}}urCommandBufferKernelHandleUpdateTest.RestoreOriginalKernel/*
-{{OPT}}urCommandBufferKernelHandleUpdateTest.KernelAlternativeNotRegistered/*
-{{OPT}}urCommandBufferKernelHandleUpdateTest.RegisterInvalidKernelAlternative/*
-{{OPT}}urCommandBufferValidUpdateParametersTest.UpdateDimensionsWithoutUpdatingKernel/*
-{{OPT}}urCommandBufferValidUpdateParametersTest.UpdateOnlyLocalWorkSize/*
-{{OPT}}urCommandBufferValidUpdateParametersTest.SuccessNullptrHandle/*
-{{OPT}}KernelCommandEventSyncUpdateTest.TwoWaitEvents/*
diff --git a/test/conformance/exp_command_buffer/exp_command_buffer_adapter_hip.match b/test/conformance/exp_command_buffer/exp_command_buffer_adapter_hip.match
deleted file mode 100644
index da8d6dee07..0000000000
--- a/test/conformance/exp_command_buffer/exp_command_buffer_adapter_hip.match
+++ /dev/null
@@ -1,10 +0,0 @@
-# Note: This file is only for use with cts_exe.py
-# These cause SIGILL when built with -fsanitize=cfi on AMD
-{{OPT}}urCommandBufferKernelHandleUpdateTest.Success/*
-{{OPT}}urCommandBufferKernelHandleUpdateTest.UpdateAgain/*
-{{OPT}}urCommandBufferKernelHandleUpdateTest.RestoreOriginalKernel/*
-{{OPT}}urCommandBufferKernelHandleUpdateTest.KernelAlternativeNotRegistered/*
-{{OPT}}urCommandBufferKernelHandleUpdateTest.RegisterInvalidKernelAlternative/*
-{{OPT}}urCommandBufferValidUpdateParametersTest.UpdateDimensionsWithoutUpdatingKernel/*
-{{OPT}}urCommandBufferValidUpdateParametersTest.UpdateOnlyLocalWorkSize/*
-{{OPT}}urCommandBufferValidUpdateParametersTest.SuccessNullptrHandle/*

From 79f3ccfccd80bdfe7b651f4d895277f15adb051c Mon Sep 17 00:00:00 2001
From: Ewan Crawford <ewan@codeplay.com>
Date: Fri, 8 Nov 2024 10:21:34 +0000
Subject: [PATCH 28/36] [CUDA][HIP] Fix for command-buffer local argument
 update

After setting kernel arguments during update, we
need to reset the amount of local memory used.
---
 source/adapters/cuda/command_buffer.cpp       |  22 +-
 .../device_code/saxpy_usm_local_mem.cpp       |  18 +-
 .../update/local_memory_update.cpp            | 505 ++++++++++++++----
 3 files changed, 431 insertions(+), 114 deletions(-)

diff --git a/source/adapters/cuda/command_buffer.cpp b/source/adapters/cuda/command_buffer.cpp
index 527c339783..ec215c8c22 100644
--- a/source/adapters/cuda/command_buffer.cpp
+++ b/source/adapters/cuda/command_buffer.cpp
@@ -1396,14 +1396,22 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferUpdateKernelLaunchExp(
 
   CUDA_KERNEL_NODE_PARAMS &Params = KernelCommandHandle->Params;
 
+  const auto LocalSize = KernelCommandHandle->Kernel->getLocalSize();
+  if (LocalSize != 0) {
+    // Clean the local size, otherwise calling updateKernelArguments() in
+    // future updates with local arguments will incorrectly increase the
+    // size further.
+    KernelCommandHandle->Kernel->clearLocalSize();
+  }
+
   Params.func = CuFunc;
-  Params.gridDimX = BlocksPerGrid[0];
-  Params.gridDimY = BlocksPerGrid[1];
-  Params.gridDimZ = BlocksPerGrid[2];
-  Params.blockDimX = ThreadsPerBlock[0];
-  Params.blockDimY = ThreadsPerBlock[1];
-  Params.blockDimZ = ThreadsPerBlock[2];
-  Params.sharedMemBytes = KernelCommandHandle->Kernel->getLocalSize();
+  Params.gridDimX = static_cast<unsigned int>(BlocksPerGrid[0]);
+  Params.gridDimY = static_cast<unsigned int>(BlocksPerGrid[1]);
+  Params.gridDimZ = static_cast<unsigned int>(BlocksPerGrid[2]);
+  Params.blockDimX = static_cast<unsigned int>(ThreadsPerBlock[0]);
+  Params.blockDimY = static_cast<unsigned int>(ThreadsPerBlock[1]);
+  Params.blockDimZ = static_cast<unsigned int>(ThreadsPerBlock[2]);
+  Params.sharedMemBytes = LocalSize;
   Params.kernelParams =
       const_cast<void **>(KernelCommandHandle->Kernel->getArgIndices().data());
 
diff --git a/test/conformance/device_code/saxpy_usm_local_mem.cpp b/test/conformance/device_code/saxpy_usm_local_mem.cpp
index 7ef17e59b5..c978caa27d 100644
--- a/test/conformance/device_code/saxpy_usm_local_mem.cpp
+++ b/test/conformance/device_code/saxpy_usm_local_mem.cpp
@@ -15,15 +15,27 @@ int main() {
     uint32_t A = 42;
 
     sycl_queue.submit([&](sycl::handler &cgh) {
-        sycl::local_accessor<uint32_t, 1> local_mem(local_size, cgh);
+        sycl::local_accessor<uint32_t, 1> local_mem_A(local_size, cgh);
+        sycl::local_accessor<uint32_t, 1> local_mem_B(1, cgh);
+
         cgh.parallel_for<class saxpy_usm_local_mem>(
             sycl::nd_range<1>{{array_size}, {local_size}},
             [=](sycl::nd_item<1> itemId) {
                 auto i = itemId.get_global_linear_id();
                 auto local_id = itemId.get_local_linear_id();
-                local_mem[local_id] = i;
-                Z[i] = A * X[i] + Y[i] + local_mem[local_id] +
+
+                local_mem_A[local_id] = i;
+                if (i == 0) {
+                    local_mem_B[0] = 0xA;
+                }
+
+                Z[i] = A * X[i] + Y[i] + local_mem_A[local_id] +
                        itemId.get_local_range(0);
+
+                if (i == 0) {
+                    Z[i] += local_mem_B[0];
+                }
+
             });
     });
     return 0;
diff --git a/test/conformance/exp_command_buffer/update/local_memory_update.cpp b/test/conformance/exp_command_buffer/update/local_memory_update.cpp
index c295556fdb..b9a9e1ba01 100644
--- a/test/conformance/exp_command_buffer/update/local_memory_update.cpp
+++ b/test/conformance/exp_command_buffer/update/local_memory_update.cpp
@@ -8,8 +8,7 @@
 #include <cstring>
 
 // Test that updating a command-buffer with a single kernel command
-// taking a local memory argument works correctly.
-
+// taking local memory arguments works correctly.
 struct LocalMemoryUpdateTestBase
     : uur::command_buffer::urUpdatableCommandBufferExpExecutionTest {
     virtual void SetUp() override {
@@ -38,11 +37,11 @@ struct LocalMemoryUpdateTestBase
             std::memcpy(shared_ptr, pattern.data(), allocation_size);
         }
         size_t current_index = 0;
-        // Index 0 is local_mem arg
+        // Index 0 is local_mem_a arg
         ASSERT_SUCCESS(urKernelSetArgLocal(kernel, current_index++,
-                                           local_mem_size, nullptr));
+                                           local_mem_a_size, nullptr));
 
-        //Hip has extr args for local mem at index 1-3
+        // Hip has extra args for local mem at index 1-3
         if (backend == UR_PLATFORM_BACKEND_HIP) {
             ASSERT_SUCCESS(urKernelSetArgValue(kernel, current_index++,
                                                sizeof(local_size), nullptr,
@@ -55,16 +54,31 @@ struct LocalMemoryUpdateTestBase
                                                &local_size));
         }
 
-        // Index 1 is output
+        // Index 1 is local_mem_b arg
+        ASSERT_SUCCESS(urKernelSetArgLocal(kernel, current_index++,
+                                           local_mem_b_size, nullptr));
+        if (backend == UR_PLATFORM_BACKEND_HIP) {
+            ASSERT_SUCCESS(urKernelSetArgValue(kernel, current_index++,
+                                               sizeof(local_size), nullptr,
+                                               &local_size));
+            ASSERT_SUCCESS(urKernelSetArgValue(kernel, current_index++,
+                                               sizeof(local_size), nullptr,
+                                               &local_size));
+            ASSERT_SUCCESS(urKernelSetArgValue(kernel, current_index++,
+                                               sizeof(local_size), nullptr,
+                                               &local_size));
+        }
+
+        // Index 2 is output
         ASSERT_SUCCESS(urKernelSetArgPointer(kernel, current_index++, nullptr,
                                              shared_ptrs[0]));
-        // Index 2 is A
+        // Index 3 is A
         ASSERT_SUCCESS(urKernelSetArgValue(kernel, current_index++, sizeof(A),
                                            nullptr, &A));
-        // Index 3 is X
+        // Index 4 is X
         ASSERT_SUCCESS(urKernelSetArgPointer(kernel, current_index++, nullptr,
                                              shared_ptrs[1]));
-        // Index 4 is Y
+        // Index 5 is Y
         ASSERT_SUCCESS(urKernelSetArgPointer(kernel, current_index++, nullptr,
                                              shared_ptrs[2]));
     }
@@ -73,6 +87,9 @@ struct LocalMemoryUpdateTestBase
                   size_t length, size_t local_size) {
         for (size_t i = 0; i < length; i++) {
             uint32_t result = A * X[i] + Y[i] + i + local_size;
+            if (i == 0) {
+                result += 0xA;
+            }
             ASSERT_EQ(result, output[i]);
         }
     }
@@ -89,7 +106,8 @@ struct LocalMemoryUpdateTestBase
     }
 
     static constexpr size_t local_size = 4;
-    static constexpr size_t local_mem_size = local_size * sizeof(uint32_t);
+    static constexpr size_t local_mem_a_size = local_size * sizeof(uint32_t);
+    static constexpr size_t local_mem_b_size = sizeof(uint32_t);
     static constexpr size_t global_size = 16;
     static constexpr size_t global_offset = 0;
     static constexpr size_t n_dimensions = 1;
@@ -127,7 +145,9 @@ struct LocalMemoryUpdateTest : LocalMemoryUpdateTestBase {
 
 UUR_INSTANTIATE_DEVICE_TEST_SUITE_P(LocalMemoryUpdateTest);
 
-TEST_P(LocalMemoryUpdateTest, UpdateParameters) {
+// Test updating A,X,Y parameters to new values and local memory parameters
+// to original values.
+TEST_P(LocalMemoryUpdateTest, UpdateParametersSameLocalSize) {
     // Run command-buffer prior to update an verify output
     ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0,
                                              nullptr, nullptr));
@@ -139,63 +159,154 @@ TEST_P(LocalMemoryUpdateTest, UpdateParameters) {
     Validate(output, X, Y, A, global_size, local_size);
 
     // Update inputs
-    ur_exp_command_buffer_update_pointer_arg_desc_t new_input_descs[2];
-    ur_exp_command_buffer_update_value_arg_desc_t new_value_descs[2];
+    std::array<ur_exp_command_buffer_update_pointer_arg_desc_t, 2>
+        new_input_descs;
+    std::array<ur_exp_command_buffer_update_value_arg_desc_t, 3>
+        new_value_descs;
 
-    // New local_mem at index 0
+    // New local_mem_a at index 0
     new_value_descs[0] = {
         UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype
         nullptr,                                                    // pNext
         0,                                                          // argIndex
-        local_mem_size,                                             // argSize
+        local_mem_a_size,                                           // argSize
         nullptr, // pProperties
         nullptr, // hArgValue
     };
 
-    // New A at index 2
-    uint32_t new_A = 33;
+    // New local_mem_b at index 1
     new_value_descs[1] = {
         UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype
         nullptr,                                                    // pNext
-        2 + hip_arg_offset,                                         // argIndex
+        1 + hip_arg_offset,                                         // argIndex
+        local_mem_b_size,                                           // argSize
+        nullptr, // pProperties
+        nullptr, // hArgValue
+    };
+
+    // New A at index 3
+    uint32_t new_A = 33;
+    new_value_descs[2] = {
+        UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype
+        nullptr,                                                    // pNext
+        3 + (2 * hip_arg_offset),                                   // argIndex
         sizeof(new_A),                                              // argSize
         nullptr, // pProperties
         &new_A,  // hArgValue
     };
 
-    // New X at index 3
+    // New X at index 4
     new_input_descs[0] = {
         UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_POINTER_ARG_DESC, // stype
         nullptr,                                                      // pNext
-        3 + hip_arg_offset, // argIndex
-        nullptr,            // pProperties
-        &shared_ptrs[3],    // pArgValue
+        4 + (2 * hip_arg_offset), // argIndex
+        nullptr,                  // pProperties
+        &shared_ptrs[3],          // pArgValue
+    };
+
+    // New Y at index 5
+    new_input_descs[1] = {
+        UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_POINTER_ARG_DESC, // stype
+        nullptr,                                                      // pNext
+        5 + (2 * hip_arg_offset), // argIndex
+        nullptr,                  // pProperties
+        &shared_ptrs[4],          // pArgValue
+    };
+
+    // Update kernel inputs
+    ur_exp_command_buffer_update_kernel_launch_desc_t update_desc = {
+        UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_KERNEL_LAUNCH_DESC, // stype
+        nullptr,                                                        // pNext
+        kernel,                 // hNewKernel
+        0,                      // numNewMemObjArgs
+        new_input_descs.size(), // numNewPointerArgs
+        new_value_descs.size(), // numNewValueArgs
+        n_dimensions,           // newWorkDim
+        nullptr,                // pNewMemObjArgList
+        new_input_descs.data(), // pNewPointerArgList
+        new_value_descs.data(), // pNewValueArgList
+        nullptr,                // pNewGlobalWorkOffset
+        nullptr,                // pNewGlobalWorkSize
+        nullptr,                // pNewLocalWorkSize
+    };
+
+    // Update kernel and enqueue command-buffer again
+    ASSERT_SUCCESS(
+        urCommandBufferUpdateKernelLaunchExp(command_handle, &update_desc));
+    ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0,
+                                             nullptr, nullptr));
+    ASSERT_SUCCESS(urQueueFinish(queue));
+
+    // Verify that update occurred correctly
+    uint32_t *new_output = (uint32_t *)shared_ptrs[0];
+    uint32_t *new_X = (uint32_t *)shared_ptrs[3];
+    uint32_t *new_Y = (uint32_t *)shared_ptrs[4];
+    Validate(new_output, new_X, new_Y, new_A, global_size, local_size);
+}
+
+// Test updating A,X,Y parameters to new values and omitting local memory parameters
+// from the update.
+TEST_P(LocalMemoryUpdateTest, DISABLED_UpdateParametersEmptyLocalSize) {
+    // Run command-buffer prior to update an verify output
+    ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0,
+                                             nullptr, nullptr));
+    ASSERT_SUCCESS(urQueueFinish(queue));
+
+    uint32_t *output = (uint32_t *)shared_ptrs[0];
+    uint32_t *X = (uint32_t *)shared_ptrs[1];
+    uint32_t *Y = (uint32_t *)shared_ptrs[2];
+    Validate(output, X, Y, A, global_size, local_size);
+
+    // Update inputs
+    std::array<ur_exp_command_buffer_update_pointer_arg_desc_t, 2>
+        new_input_descs;
+    std::array<ur_exp_command_buffer_update_value_arg_desc_t, 1>
+        new_value_descs;
+
+    // New A at index 3
+    uint32_t new_A = 33;
+    new_value_descs[0] = {
+        UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype
+        nullptr,                                                    // pNext
+        3 + (2 * hip_arg_offset),                                   // argIndex
+        sizeof(new_A),                                              // argSize
+        nullptr, // pProperties
+        &new_A,  // hArgValue
     };
 
-    // New Y at index 4
+    // New X at index 4
     new_input_descs[1] = {
         UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_POINTER_ARG_DESC, // stype
         nullptr,                                                      // pNext
-        4 + hip_arg_offset, // argIndex
-        nullptr,            // pProperties
-        &shared_ptrs[4],    // pArgValue
+        4 + (2 * hip_arg_offset), // argIndex
+        nullptr,                  // pProperties
+        &shared_ptrs[3],          // pArgValue
+    };
+
+    // New Y at index 5
+    new_input_descs[2] = {
+        UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_POINTER_ARG_DESC, // stype
+        nullptr,                                                      // pNext
+        5 + (2 * hip_arg_offset), // argIndex
+        nullptr,                  // pProperties
+        &shared_ptrs[4],          // pArgValue
     };
 
     // Update kernel inputs
     ur_exp_command_buffer_update_kernel_launch_desc_t update_desc = {
         UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_KERNEL_LAUNCH_DESC, // stype
         nullptr,                                                        // pNext
-        kernel,          // hNewKernel
-        0,               // numNewMemObjArgs
-        2,               // numNewPointerArgs
-        2,               // numNewValueArgs
-        n_dimensions,    // newWorkDim
-        nullptr,         // pNewMemObjArgList
-        new_input_descs, // pNewPointerArgList
-        new_value_descs, // pNewValueArgList
-        nullptr,         // pNewGlobalWorkOffset
-        nullptr,         // pNewGlobalWorkSize
-        nullptr,         // pNewLocalWorkSize
+        kernel,                 // hNewKernel
+        0,                      // numNewMemObjArgs
+        new_input_descs.size(), // numNewPointerArgs
+        new_value_descs.size(), // numNewValueArgs
+        n_dimensions,           // newWorkDim
+        nullptr,                // pNewMemObjArgList
+        new_input_descs.data(), // pNewPointerArgList
+        new_value_descs.data(), // pNewValueArgList
+        nullptr,                // pNewGlobalWorkOffset
+        nullptr,                // pNewGlobalWorkSize
+        nullptr,                // pNewLocalWorkSize
     };
 
     // Update kernel and enqueue command-buffer again
@@ -212,7 +323,9 @@ TEST_P(LocalMemoryUpdateTest, UpdateParameters) {
     Validate(new_output, new_X, new_Y, new_A, global_size, local_size);
 }
 
-TEST_P(LocalMemoryUpdateTest, UpdateParametersAndLocalSize) {
+// Test updating A,X,Y parameters to new values and local memory parameters
+// to new values.
+TEST_P(LocalMemoryUpdateTest, UpdateParametersDifferentLocalSize) {
     // Run command-buffer prior to update an verify output
     ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0,
                                              nullptr, nullptr));
@@ -230,7 +343,7 @@ TEST_P(LocalMemoryUpdateTest, UpdateParametersAndLocalSize) {
 
     size_t new_local_size = local_size * 2;
     size_t new_local_mem_size = new_local_size * sizeof(uint32_t);
-    // New local_mem at index 0
+    // New local_mem_a at index 0
     new_value_descs.push_back({
         UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype
         nullptr,                                                    // pNext
@@ -267,33 +380,187 @@ TEST_P(LocalMemoryUpdateTest, UpdateParametersAndLocalSize) {
         });
     }
 
-    // New A at index 2
+    // New local_mem_b at index 1
+    new_value_descs.push_back({
+        UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype
+        nullptr,                                                    // pNext
+        1 + hip_arg_offset,                                         // argIndex
+        new_local_mem_size,                                         // argSize
+        nullptr, // pProperties
+        nullptr, // hArgValue
+    });
+
+    if (backend == UR_PLATFORM_BACKEND_HIP) {
+        new_value_descs.push_back({
+            UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype
+            nullptr,                                                    // pNext
+            5,                      // argIndex
+            sizeof(new_local_size), // argSize
+            nullptr,                // pProperties
+            &new_local_size,        // hArgValue
+        });
+        new_value_descs.push_back({
+            UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype
+            nullptr,                                                    // pNext
+            6,                      // argIndex
+            sizeof(new_local_size), // argSize
+            nullptr,                // pProperties
+            &new_local_size,        // hArgValue
+        });
+        new_value_descs.push_back({
+            UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype
+            nullptr,                                                    // pNext
+            7,                      // argIndex
+            sizeof(new_local_size), // argSize
+            nullptr,                // pProperties
+            &new_local_size,        // hArgValue
+        });
+    }
+
+    // New A at index 3
     uint32_t new_A = 33;
     new_value_descs.push_back({
         UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype
         nullptr,                                                    // pNext
-        2 + hip_arg_offset,                                         // argIndex
+        3 + (2 * hip_arg_offset),                                   // argIndex
         sizeof(new_A),                                              // argSize
         nullptr, // pProperties
         &new_A,  // hArgValue
     });
 
-    // New X at index 3
+    // New X at index 4
     new_input_descs[0] = {
         UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_POINTER_ARG_DESC, // stype
         nullptr,                                                      // pNext
-        3 + hip_arg_offset, // argIndex
-        nullptr,            // pProperties
-        &shared_ptrs[3],    // pArgValue
+        4 + (2 * hip_arg_offset), // argIndex
+        nullptr,                  // pProperties
+        &shared_ptrs[3],          // pArgValue
     };
 
-    // New Y at index 4
+    // New Y at index 5
     new_input_descs[1] = {
         UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_POINTER_ARG_DESC, // stype
         nullptr,                                                      // pNext
-        4 + hip_arg_offset, // argIndex
-        nullptr,            // pProperties
-        &shared_ptrs[4],    // pArgValue
+        5 + (2 * hip_arg_offset), // argIndex
+        nullptr,                  // pProperties
+        &shared_ptrs[4],          // pArgValue
+    };
+
+    // Update kernel inputs
+    ur_exp_command_buffer_update_kernel_launch_desc_t update_desc = {
+        UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_KERNEL_LAUNCH_DESC, // stype
+        nullptr,                                                        // pNext
+        kernel,                                        // hNewKernel
+        0,                                             // numNewMemObjArgs
+        2,                                             // numNewPointerArgs
+        static_cast<uint32_t>(new_value_descs.size()), // numNewValueArgs
+        n_dimensions,                                  // newWorkDim
+        nullptr,                                       // pNewMemObjArgList
+        new_input_descs,                               // pNewPointerArgList
+        new_value_descs.data(),                        // pNewValueArgList
+        nullptr,                                       // pNewGlobalWorkOffset
+        nullptr,                                       // pNewGlobalWorkSize
+        &new_local_size,                               // pNewLocalWorkSize
+    };
+
+    // Update kernel and enqueue command-buffer again
+    ASSERT_SUCCESS(
+        urCommandBufferUpdateKernelLaunchExp(command_handle, &update_desc));
+    ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0,
+                                             nullptr, nullptr));
+    ASSERT_SUCCESS(urQueueFinish(queue));
+
+    // Verify that update occurred correctly
+    uint32_t *new_output = (uint32_t *)shared_ptrs[0];
+    uint32_t *new_X = (uint32_t *)shared_ptrs[3];
+    uint32_t *new_Y = (uint32_t *)shared_ptrs[4];
+    Validate(new_output, new_X, new_Y, new_A, global_size, new_local_size);
+}
+
+// Test updating A,X,Y parameters to new values and only one of the local memory
+// parameters, which is set to a new values.
+TEST_P(LocalMemoryUpdateTest, UpdateParametersPartialLocalSize) {
+    // Run command-buffer prior to update an verify output
+    ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0,
+                                             nullptr, nullptr));
+    ASSERT_SUCCESS(urQueueFinish(queue));
+
+    uint32_t *output = (uint32_t *)shared_ptrs[0];
+    uint32_t *X = (uint32_t *)shared_ptrs[1];
+    uint32_t *Y = (uint32_t *)shared_ptrs[2];
+    Validate(output, X, Y, A, global_size, local_size);
+
+    // Update inputs
+    ur_exp_command_buffer_update_pointer_arg_desc_t new_input_descs[2];
+    std::vector<ur_exp_command_buffer_update_value_arg_desc_t>
+        new_value_descs{};
+
+    size_t new_local_size = local_size * 2;
+    size_t new_local_mem_size = new_local_size * sizeof(uint32_t);
+    // New local_mem_a at index 0
+    new_value_descs.push_back({
+        UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype
+        nullptr,                                                    // pNext
+        0,                                                          // argIndex
+        new_local_mem_size,                                         // argSize
+        nullptr, // pProperties
+        nullptr, // hArgValue
+    });
+
+    if (backend == UR_PLATFORM_BACKEND_HIP) {
+        new_value_descs.push_back({
+            UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype
+            nullptr,                                                    // pNext
+            1,                      // argIndex
+            sizeof(new_local_size), // argSize
+            nullptr,                // pProperties
+            &new_local_size,        // hArgValue
+        });
+        new_value_descs.push_back({
+            UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype
+            nullptr,                                                    // pNext
+            2,                      // argIndex
+            sizeof(new_local_size), // argSize
+            nullptr,                // pProperties
+            &new_local_size,        // hArgValue
+        });
+        new_value_descs.push_back({
+            UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype
+            nullptr,                                                    // pNext
+            3,                      // argIndex
+            sizeof(new_local_size), // argSize
+            nullptr,                // pProperties
+            &new_local_size,        // hArgValue
+        });
+    }
+
+    // New A at index 3
+    uint32_t new_A = 33;
+    new_value_descs.push_back({
+        UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype
+        nullptr,                                                    // pNext
+        3 + (2 * hip_arg_offset),                                   // argIndex
+        sizeof(new_A),                                              // argSize
+        nullptr, // pProperties
+        &new_A,  // hArgValue
+    });
+
+    // New X at index 4
+    new_input_descs[0] = {
+        UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_POINTER_ARG_DESC, // stype
+        nullptr,                                                      // pNext
+        4 + (2 * hip_arg_offset), // argIndex
+        nullptr,                  // pProperties
+        &shared_ptrs[3],          // pArgValue
+    };
+
+    // New Y at index 5
+    new_input_descs[1] = {
+        UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_POINTER_ARG_DESC, // stype
+        nullptr,                                                      // pNext
+        5 + (2 * hip_arg_offset), // argIndex
+        nullptr,                  // pProperties
+        &shared_ptrs[4],          // pArgValue
     };
 
     // Update kernel inputs
@@ -333,10 +600,12 @@ struct LocalMemoryMultiUpdateTest : LocalMemoryUpdateTestBase {
 
         // Append kernel command to command-buffer and close command-buffer
         for (unsigned node = 0; node < nodes; node++) {
-            // We need to set the local memory arg each time because it is
-            // cleared in the kernel handle after being used.
+#if 1 // TODO can we remove?                                                   \
+    // We need to set the local memory arg each time because it is             \
+    // cleared in the kernel handle after being used.
             ASSERT_SUCCESS(
-                urKernelSetArgLocal(kernel, 0, local_mem_size, nullptr));
+                urKernelSetArgLocal(kernel, 0, local_mem_a_size, nullptr));
+#endif
             ASSERT_SUCCESS(urCommandBufferAppendKernelLaunchExp(
                 updatable_cmd_buf_handle, kernel, n_dimensions, &global_offset,
                 &global_size, &local_size, 0, nullptr, 0, nullptr, 0, nullptr,
@@ -363,6 +632,8 @@ struct LocalMemoryMultiUpdateTest : LocalMemoryUpdateTestBase {
 
 UUR_INSTANTIATE_DEVICE_TEST_SUITE_P(LocalMemoryMultiUpdateTest);
 
+// Test updating A,X,Y parameters to new values and local memory parameters
+// to original values.
 TEST_P(LocalMemoryMultiUpdateTest, UpdateParameters) {
     // Run command-buffer prior to update an verify output
     ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0,
@@ -375,63 +646,75 @@ TEST_P(LocalMemoryMultiUpdateTest, UpdateParameters) {
     Validate(output, X, Y, A, global_size, local_size);
 
     // Update inputs
-    ur_exp_command_buffer_update_pointer_arg_desc_t new_input_descs[2];
-    ur_exp_command_buffer_update_value_arg_desc_t new_value_descs[2];
+    std::array<ur_exp_command_buffer_update_pointer_arg_desc_t, 2>
+        new_input_descs;
+    std::array<ur_exp_command_buffer_update_value_arg_desc_t, 3>
+        new_value_descs;
 
-    // New local_mem at index 0
+    // New local_mem_a at index 0
     new_value_descs[0] = {
         UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype
         nullptr,                                                    // pNext
         0,                                                          // argIndex
-        local_mem_size,                                             // argSize
+        local_mem_a_size,                                           // argSize
         nullptr, // pProperties
         nullptr, // hArgValue
     };
 
-    // New A at index 2
-    uint32_t new_A = 33;
+    // New local_mem_b at index 1
     new_value_descs[1] = {
         UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype
         nullptr,                                                    // pNext
-        2 + hip_arg_offset,                                         // argIndex
+        1 + hip_arg_offset,                                         // argIndex
+        local_mem_b_size,                                           // argSize
+        nullptr, // pProperties
+        nullptr, // hArgValue
+    };
+
+    // New A at index 3
+    uint32_t new_A = 33;
+    new_value_descs[2] = {
+        UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype
+        nullptr,                                                    // pNext
+        3 + (2 * hip_arg_offset),                                   // argIndex
         sizeof(new_A),                                              // argSize
         nullptr, // pProperties
         &new_A,  // hArgValue
     };
 
-    // New X at index 3
+    // New X at index 4
     new_input_descs[0] = {
         UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_POINTER_ARG_DESC, // stype
         nullptr,                                                      // pNext
-        3 + hip_arg_offset, // argIndex
-        nullptr,            // pProperties
-        &shared_ptrs[3],    // pArgValue
+        4 + (2 * hip_arg_offset), // argIndex
+        nullptr,                  // pProperties
+        &shared_ptrs[3],          // pArgValue
     };
 
-    // New Y at index 4
+    // New Y at index 5
     new_input_descs[1] = {
         UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_POINTER_ARG_DESC, // stype
         nullptr,                                                      // pNext
-        4 + hip_arg_offset, // argIndex
-        nullptr,            // pProperties
-        &shared_ptrs[4],    // pArgValue
+        5 + (2 * hip_arg_offset), // argIndex
+        nullptr,                  // pProperties
+        &shared_ptrs[4],          // pArgValue
     };
 
     // Update kernel inputs
     ur_exp_command_buffer_update_kernel_launch_desc_t update_desc = {
         UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_KERNEL_LAUNCH_DESC, // stype
         nullptr,                                                        // pNext
-        kernel,          // hNewKernel
-        0,               // numNewMemObjArgs
-        2,               // numNewPointerArgs
-        2,               // numNewValueArgs
-        n_dimensions,    // newWorkDim
-        nullptr,         // pNewMemObjArgList
-        new_input_descs, // pNewPointerArgList
-        new_value_descs, // pNewValueArgList
-        nullptr,         // pNewGlobalWorkOffset
-        nullptr,         // pNewGlobalWorkSize
-        nullptr,         // pNewLocalWorkSize
+        kernel,                 // hNewKernel
+        0,                      // numNewMemObjArgs
+        new_input_descs.size(), // numNewPointerArgs
+        new_value_descs.size(), // numNewValueArgs
+        n_dimensions,           // newWorkDim
+        nullptr,                // pNewMemObjArgList
+        new_input_descs.data(), // pNewPointerArgList
+        new_value_descs.data(), // pNewValueArgList
+        nullptr,                // pNewGlobalWorkOffset
+        nullptr,                // pNewGlobalWorkSize
+        nullptr,                // pNewLocalWorkSize
     };
 
     // Update kernel and enqueue command-buffer again
@@ -450,65 +733,79 @@ TEST_P(LocalMemoryMultiUpdateTest, UpdateParameters) {
     Validate(new_output, new_X, new_Y, new_A, global_size, local_size);
 }
 
+// Test updating A,X,Y parameters to new values and local memory parameters
+// to original values, but without doing a blocking wait.
 TEST_P(LocalMemoryMultiUpdateTest, UpdateWithoutBlocking) {
     // Update inputs
-    ur_exp_command_buffer_update_pointer_arg_desc_t new_input_descs[2];
-    ur_exp_command_buffer_update_value_arg_desc_t new_value_descs[2];
+    std::array<ur_exp_command_buffer_update_pointer_arg_desc_t, 2>
+        new_input_descs;
+    std::array<ur_exp_command_buffer_update_value_arg_desc_t, 3>
+        new_value_descs;
 
-    // New local_mem at index 0
+    // New local_mem_a at index 0
     new_value_descs[0] = {
         UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype
         nullptr,                                                    // pNext
         0,                                                          // argIndex
-        local_mem_size,                                             // argSize
+        local_mem_a_size,                                           // argSize
         nullptr, // pProperties
         nullptr, // hArgValue
     };
 
-    // New A at index 2
-    uint32_t new_A = 33;
+    // New local_mem_a at index 1
     new_value_descs[1] = {
         UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype
         nullptr,                                                    // pNext
-        2 + hip_arg_offset,                                         // argIndex
+        1 + hip_arg_offset,                                         // argIndex
+        local_mem_b_size,                                           // argSize
+        nullptr, // pProperties
+        nullptr, // hArgValue
+    };
+
+    // New A at index 3
+    uint32_t new_A = 33;
+    new_value_descs[2] = {
+        UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype
+        nullptr,                                                    // pNext
+        3 + (2 * hip_arg_offset),                                   // argIndex
         sizeof(new_A),                                              // argSize
         nullptr, // pProperties
         &new_A,  // hArgValue
     };
 
-    // New X at index 3
+    // New X at index 4
     new_input_descs[0] = {
         UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_POINTER_ARG_DESC, // stype
         nullptr,                                                      // pNext
-        3 + hip_arg_offset, // argIndex
-        nullptr,            // pProperties
-        &shared_ptrs[3],    // pArgValue
+        4 + (2 * hip_arg_offset), // argIndex
+        nullptr,                  // pProperties
+        &shared_ptrs[3],          // pArgValue
     };
 
-    // New Y at index 4
+    // New Y at index 5
     new_input_descs[1] = {
         UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_POINTER_ARG_DESC, // stype
         nullptr,                                                      // pNext
-        4 + hip_arg_offset, // argIndex
-        nullptr,            // pProperties
-        &shared_ptrs[4],    // pArgValue
+        5 + (2 * hip_arg_offset), // argIndex
+        nullptr,                  // pProperties
+        &shared_ptrs[4],          // pArgValue
     };
 
     // Update kernel inputs
     ur_exp_command_buffer_update_kernel_launch_desc_t update_desc = {
         UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_KERNEL_LAUNCH_DESC, // stype
         nullptr,                                                        // pNext
-        kernel,          // hNewKernel
-        0,               // numNewMemObjArgs
-        2,               // numNewPointerArgs
-        2,               // numNewValueArgs
-        n_dimensions,    // newWorkDim
-        nullptr,         // pNewMemObjArgList
-        new_input_descs, // pNewPointerArgList
-        new_value_descs, // pNewValueArgList
-        nullptr,         // pNewGlobalWorkOffset
-        nullptr,         // pNewGlobalWorkSize
-        nullptr,         // pNewLocalWorkSize
+        kernel,                 // hNewKernel
+        0,                      // numNewMemObjArgs
+        new_input_descs.size(), // numNewPointerArgs
+        new_value_descs.size(), // numNewValueArgs
+        n_dimensions,           // newWorkDim
+        nullptr,                // pNewMemObjArgList
+        new_input_descs.data(), // pNewPointerArgList
+        new_value_descs.data(), // pNewValueArgList
+        nullptr,                // pNewGlobalWorkOffset
+        nullptr,                // pNewGlobalWorkSize
+        nullptr,                // pNewLocalWorkSize
     };
     // Enqueue without calling urQueueFinish after
     ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0,

From 31dc7900375176ca92f55aefa6e216152ffe30a6 Mon Sep 17 00:00:00 2001
From: Ewan Crawford <ewan@codeplay.com>
Date: Thu, 14 Nov 2024 09:10:33 +0000
Subject: [PATCH 29/36] Improve solution

Iterate on previous solution so that the local argument
offsets at following inidices are updated when an earlier
local argument is updated
---
 source/adapters/cuda/command_buffer.cpp       |  25 +-
 source/adapters/cuda/enqueue.cpp              |   6 -
 source/adapters/cuda/kernel.hpp               |  98 +++++--
 source/adapters/hip/command_buffer.cpp        |   3 -
 source/adapters/hip/enqueue.cpp               |   2 -
 source/adapters/hip/kernel.hpp                | 100 +++++--
 .../device_code/saxpy_usm_local_mem.cpp       |  15 +-
 ...xp_command_buffer_adapter_native_cpu.match |   7 +-
 .../update/local_memory_update.cpp            | 258 ++++++++++++++++--
 9 files changed, 411 insertions(+), 103 deletions(-)

diff --git a/source/adapters/cuda/command_buffer.cpp b/source/adapters/cuda/command_buffer.cpp
index ec215c8c22..4b4b2cffe5 100644
--- a/source/adapters/cuda/command_buffer.cpp
+++ b/source/adapters/cuda/command_buffer.cpp
@@ -522,9 +522,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp(
                                         DepsList.data(), DepsList.size(),
                                         &NodeParams));
 
-    if (LocalSize != 0)
-      hKernel->clearLocalSize();
-
     // Add signal node if external return event is used.
     CUgraphNode SignalNode = nullptr;
     if (phEvent) {
@@ -1396,22 +1393,14 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferUpdateKernelLaunchExp(
 
   CUDA_KERNEL_NODE_PARAMS &Params = KernelCommandHandle->Params;
 
-  const auto LocalSize = KernelCommandHandle->Kernel->getLocalSize();
-  if (LocalSize != 0) {
-    // Clean the local size, otherwise calling updateKernelArguments() in
-    // future updates with local arguments will incorrectly increase the
-    // size further.
-    KernelCommandHandle->Kernel->clearLocalSize();
-  }
-
   Params.func = CuFunc;
-  Params.gridDimX = static_cast<unsigned int>(BlocksPerGrid[0]);
-  Params.gridDimY = static_cast<unsigned int>(BlocksPerGrid[1]);
-  Params.gridDimZ = static_cast<unsigned int>(BlocksPerGrid[2]);
-  Params.blockDimX = static_cast<unsigned int>(ThreadsPerBlock[0]);
-  Params.blockDimY = static_cast<unsigned int>(ThreadsPerBlock[1]);
-  Params.blockDimZ = static_cast<unsigned int>(ThreadsPerBlock[2]);
-  Params.sharedMemBytes = LocalSize;
+  Params.gridDimX = BlocksPerGrid[0];
+  Params.gridDimY = BlocksPerGrid[1];
+  Params.gridDimZ = BlocksPerGrid[2];
+  Params.blockDimX = ThreadsPerBlock[0];
+  Params.blockDimY = ThreadsPerBlock[1];
+  Params.blockDimZ = ThreadsPerBlock[2];
+  Params.sharedMemBytes = KernelCommandHandle->Kernel->getLocalSize();
   Params.kernelParams =
       const_cast<void **>(KernelCommandHandle->Kernel->getArgIndices().data());
 
diff --git a/source/adapters/cuda/enqueue.cpp b/source/adapters/cuda/enqueue.cpp
index fc3d0220e8..54a0f778fb 100644
--- a/source/adapters/cuda/enqueue.cpp
+++ b/source/adapters/cuda/enqueue.cpp
@@ -493,9 +493,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
         ThreadsPerBlock[0], ThreadsPerBlock[1], ThreadsPerBlock[2], LocalSize,
         CuStream, const_cast<void **>(ArgIndices.data()), nullptr));
 
-    if (LocalSize != 0)
-      hKernel->clearLocalSize();
-
     if (phEvent) {
       UR_CHECK_ERROR(RetImplEvent->record());
       *phEvent = RetImplEvent.release();
@@ -673,9 +670,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
                                     const_cast<void **>(ArgIndices.data()),
                                     nullptr));
 
-    if (LocalSize != 0)
-      hKernel->clearLocalSize();
-
     if (phEvent) {
       UR_CHECK_ERROR(RetImplEvent->record());
       *phEvent = RetImplEvent.release();
diff --git a/source/adapters/cuda/kernel.hpp b/source/adapters/cuda/kernel.hpp
index 7ad20a4f0e..31088da7cc 100644
--- a/source/adapters/cuda/kernel.hpp
+++ b/source/adapters/cuda/kernel.hpp
@@ -61,10 +61,22 @@ struct ur_kernel_handle_t_ {
     using args_t = std::array<char, MaxParamBytes>;
     using args_size_t = std::vector<size_t>;
     using args_index_t = std::vector<void *>;
+    /// Storage shared by all args which is mem copied into when adding a new
+    /// argument.
     args_t Storage;
+    /// Aligned size of each parameter, including padding.
     args_size_t ParamSizes;
+    /// Byte offset into /p Storage allocation for each parameter.
     args_index_t Indices;
-    args_size_t OffsetPerIndex;
+    /// Aligned size in bytes for each local memory parameter after padding has
+    /// been added. Zero if the argument at the index isn't a local memory
+    /// argument.
+    args_size_t AlignedLocalMemSize;
+    /// Original size in bytes for each local memory parameter, prior to being
+    /// padded to appropriate alignment. Zero if the argument at the index
+    /// isn't a local memory argument.
+    args_size_t OriginalLocalMemSize;
+
     // A struct to keep track of memargs so that we can do dependency analysis
     // at urEnqueueKernelLaunch
     struct mem_obj_arg {
@@ -93,7 +105,8 @@ struct ur_kernel_handle_t_ {
         Indices.resize(Index + 2, Indices.back());
         // Ensure enough space for the new argument
         ParamSizes.resize(Index + 1);
-        OffsetPerIndex.resize(Index + 1);
+        AlignedLocalMemSize.resize(Index + 1);
+        OriginalLocalMemSize.resize(Index + 1);
       }
       ParamSizes[Index] = Size;
       // calculate the insertion point on the array
@@ -102,28 +115,83 @@ struct ur_kernel_handle_t_ {
       // Update the stored value for the argument
       std::memcpy(&Storage[InsertPos], Arg, Size);
       Indices[Index] = &Storage[InsertPos];
-      OffsetPerIndex[Index] = LocalSize;
+      AlignedLocalMemSize[Index] = LocalSize;
     }
 
-    void addLocalArg(size_t Index, size_t Size) {
-      size_t LocalOffset = this->getLocalSize();
+    /// Returns the padded size and offset of a local memory argument.
+    /// Local memory arguments need to be padded if the alignment for the size
+    /// doesn't match the current offset into the kernel local data.
+    /// @param Index Kernel arg index.
+    /// @param Size User passed size of local parameter.
+    /// @return Tuple of (Aligned size, Aligned offset into local data).
+    std::pair<size_t, size_t> calcAlignedLocalArgument(size_t Index,
+                                                       size_t Size) {
+      // Store the unpadded size of the local argument
+      if (Index + 2 > Indices.size()) {
+        AlignedLocalMemSize.resize(Index + 1);
+        OriginalLocalMemSize.resize(Index + 1);
+      }
+      OriginalLocalMemSize[Index] = Size;
+
+      // Calculate the current starting offset into local data
+      const size_t LocalOffset = std::accumulate(
+          std::begin(AlignedLocalMemSize),
+          std::next(std::begin(AlignedLocalMemSize), Index), size_t{0});
 
-      // maximum required alignment is the size of the largest vector type
+      // Maximum required alignment is the size of the largest vector type
       const size_t MaxAlignment = sizeof(double) * 16;
 
-      // for arguments smaller than the maximum alignment simply align to the
+      // For arguments smaller than the maximum alignment simply align to the
       // size of the argument
       const size_t Alignment = std::min(MaxAlignment, Size);
 
-      // align the argument
+      // Align the argument
       size_t AlignedLocalOffset = LocalOffset;
-      size_t Pad = LocalOffset % Alignment;
+      const size_t Pad = LocalOffset % Alignment;
       if (Pad != 0) {
         AlignedLocalOffset += Alignment - Pad;
       }
 
+      const size_t AlignedLocalSize = Size + (AlignedLocalOffset - LocalOffset);
+      return std::make_pair(AlignedLocalSize, AlignedLocalOffset);
+    }
+
+    void addLocalArg(size_t Index, size_t Size) {
+      // Get the aligned argument size and offset into local data
+      size_t AlignedLocalSize, AlignedLocalOffset;
+      std::tie(AlignedLocalSize, AlignedLocalOffset) =
+          calcAlignedLocalArgument(Index, Size);
+
+      // Store argument details
       addArg(Index, sizeof(size_t), (const void *)&(AlignedLocalOffset),
-             Size + (AlignedLocalOffset - LocalOffset));
+             AlignedLocalSize);
+
+      // For every existing local argument which follows at later argument
+      // indices, updated the offset and pointer into the kernel local memory.
+      // Required as padding will need to be recalculated.
+      const size_t NumArgs = Indices.size() - 1; // Accounts for implicit arg
+      for (auto SuccIndex = Index + 1; SuccIndex < NumArgs; SuccIndex++) {
+        const size_t OriginalLocalSize = OriginalLocalMemSize[SuccIndex];
+        if (OriginalLocalSize == 0) {
+          // Skip if successor argument isn't a local memory arg
+          continue;
+        }
+
+        // Recalculate alignment
+        size_t SuccAlignedLocalSize, SuccAlignedLocalOffset;
+        std::tie(SuccAlignedLocalSize, SuccAlignedLocalOffset) =
+            calcAlignedLocalArgument(SuccIndex, OriginalLocalSize);
+
+        // Store new local memory size
+        AlignedLocalMemSize[SuccIndex] = SuccAlignedLocalSize;
+
+        // Store new offset into local data
+        const size_t InsertPos =
+            std::accumulate(std::begin(ParamSizes),
+                            std::begin(ParamSizes) + SuccIndex, size_t{0});
+        std::memcpy(&Storage[InsertPos], &SuccAlignedLocalOffset,
+                    sizeof(size_t));
+      }
     }
 
     void addMemObjArg(int Index, ur_mem_handle_t hMem, ur_mem_flags_t Flags) {
@@ -145,15 +213,11 @@ struct ur_kernel_handle_t_ {
       std::memcpy(ImplicitOffsetArgs, ImplicitOffset, Size);
     }
 
-    void clearLocalSize() {
-      std::fill(std::begin(OffsetPerIndex), std::end(OffsetPerIndex), 0);
-    }
-
     const args_index_t &getIndices() const noexcept { return Indices; }
 
     uint32_t getLocalSize() const {
-      return std::accumulate(std::begin(OffsetPerIndex),
-                             std::end(OffsetPerIndex), 0);
+      return std::accumulate(std::begin(AlignedLocalMemSize),
+                             std::end(AlignedLocalMemSize), 0);
     }
   } Args;
 
@@ -240,7 +304,5 @@ struct ur_kernel_handle_t_ {
 
   uint32_t getLocalSize() const noexcept { return Args.getLocalSize(); }
 
-  void clearLocalSize() { Args.clearLocalSize(); }
-
   size_t getRegsPerThread() const noexcept { return RegsPerThread; };
 };
diff --git a/source/adapters/hip/command_buffer.cpp b/source/adapters/hip/command_buffer.cpp
index 9fed5db2f8..538c2ff85a 100644
--- a/source/adapters/hip/command_buffer.cpp
+++ b/source/adapters/hip/command_buffer.cpp
@@ -396,9 +396,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp(
                                          DepsList.data(), DepsList.size(),
                                          &NodeParams));
 
-    if (LocalSize != 0)
-      hKernel->clearLocalSize();
-
     // Get sync point and register the node with it.
     auto SyncPoint = hCommandBuffer->addSyncPoint(GraphNode);
     if (pSyncPoint) {
diff --git a/source/adapters/hip/enqueue.cpp b/source/adapters/hip/enqueue.cpp
index 025a3f41f4..b9aa097848 100644
--- a/source/adapters/hip/enqueue.cpp
+++ b/source/adapters/hip/enqueue.cpp
@@ -324,8 +324,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
         ThreadsPerBlock[0], ThreadsPerBlock[1], ThreadsPerBlock[2],
         hKernel->getLocalSize(), HIPStream, ArgIndices.data(), nullptr));
 
-    hKernel->clearLocalSize();
-
     if (phEvent) {
       UR_CHECK_ERROR(RetImplEvent->record());
       *phEvent = RetImplEvent.release();
diff --git a/source/adapters/hip/kernel.hpp b/source/adapters/hip/kernel.hpp
index afea69832b..08441e288f 100644
--- a/source/adapters/hip/kernel.hpp
+++ b/source/adapters/hip/kernel.hpp
@@ -56,10 +56,22 @@ struct ur_kernel_handle_t_ {
     using args_t = std::array<char, MAX_PARAM_BYTES>;
     using args_size_t = std::vector<size_t>;
     using args_index_t = std::vector<void *>;
+    /// Storage shared by all args which is mem copied into when adding a new
+    /// argument.
     args_t Storage;
+    /// Aligned size of each parameter, including padding.
     args_size_t ParamSizes;
+    /// Byte offset into /p Storage allocation for each parameter.
     args_index_t Indices;
-    args_size_t OffsetPerIndex;
+    /// Aligned size in bytes for each local memory parameter after padding has
+    /// been added. Zero if the argument at the index isn't a local memory
+    /// argument.
+    args_size_t AlignedLocalMemSize;
+    /// Original size in bytes for each local memory parameter, prior to being
+    /// padded to appropriate alignment. Zero if the argument at the index
+    /// isn't a local memory argument.
+    args_size_t OriginalLocalMemSize;
+
     // A struct to keep track of memargs so that we can do dependency analysis
     // at urEnqueueKernelLaunch
     struct mem_obj_arg {
@@ -88,7 +100,8 @@ struct ur_kernel_handle_t_ {
         Indices.resize(Index + 2, Indices.back());
         // Ensure enough space for the new argument
         ParamSizes.resize(Index + 1);
-        OffsetPerIndex.resize(Index + 1);
+        AlignedLocalMemSize.resize(Index + 1);
+        OriginalLocalMemSize.resize(Index + 1);
       }
       ParamSizes[Index] = Size;
       // calculate the insertion point on the array
@@ -97,28 +110,83 @@ struct ur_kernel_handle_t_ {
       // Update the stored value for the argument
       std::memcpy(&Storage[InsertPos], Arg, Size);
       Indices[Index] = &Storage[InsertPos];
-      OffsetPerIndex[Index] = LocalSize;
+      AlignedLocalMemSize[Index] = LocalSize;
     }
 
-    void addLocalArg(size_t Index, size_t Size) {
-      size_t LocalOffset = this->getLocalSize();
+    /// Returns the padded size and offset of a local memory argument.
+    /// Local memory arguments need to be padded if the alignment for the size
+    /// doesn't match the current offset into the kernel local data.
+    /// @param Index Kernel arg index.
+    /// @param Size User passed size of local parameter.
+    /// @return Tuple of (Aligned size, Aligned offset into local data).
+    std::pair<size_t, size_t> calcAlignedLocalArgument(size_t Index,
+                                                       size_t Size) {
+      // Store the unpadded size of the local argument
+      if (Index + 2 > Indices.size()) {
+        AlignedLocalMemSize.resize(Index + 1);
+        OriginalLocalMemSize.resize(Index + 1);
+      }
+      OriginalLocalMemSize[Index] = Size;
 
-      // maximum required alignment is the size of the largest vector type
+      // Calculate the current starting offset into local data
+      const size_t LocalOffset = std::accumulate(
+          std::begin(AlignedLocalMemSize),
+          std::next(std::begin(AlignedLocalMemSize), Index), size_t{0});
+
+      // Maximum required alignment is the size of the largest vector type
       const size_t MaxAlignment = sizeof(double) * 16;
 
-      // for arguments smaller than the maximum alignment simply align to the
+      // For arguments smaller than the maximum alignment simply align to the
       // size of the argument
       const size_t Alignment = std::min(MaxAlignment, Size);
 
-      // align the argument
+      // Align the argument
       size_t AlignedLocalOffset = LocalOffset;
-      size_t Pad = LocalOffset % Alignment;
+      const size_t Pad = LocalOffset % Alignment;
       if (Pad != 0) {
         AlignedLocalOffset += Alignment - Pad;
       }
 
-      addArg(Index, sizeof(size_t), (const void *)&AlignedLocalOffset,
-             Size + AlignedLocalOffset - LocalOffset);
+      const size_t AlignedLocalSize = Size + (AlignedLocalOffset - LocalOffset);
+      return std::make_pair(AlignedLocalSize, AlignedLocalOffset);
+    }
+
+    void addLocalArg(size_t Index, size_t Size) {
+      // Get the aligned argument size and offset into local data
+      size_t AlignedLocalSize, AlignedLocalOffset;
+      std::tie(AlignedLocalSize, AlignedLocalOffset) =
+          calcAlignedLocalArgument(Index, Size);
+
+      // Store argument details
+      addArg(Index, sizeof(size_t), (const void *)&(AlignedLocalOffset),
+             AlignedLocalSize);
+
+      // For every existing local argument which follows at later argument
+      // indices, updated the offset and pointer into the kernel local memory.
+      // Required as padding will need to be recalculated.
+      const size_t NumArgs = Indices.size() - 1; // Accounts for implicit arg
+      for (auto SuccIndex = Index + 1; SuccIndex < NumArgs; SuccIndex++) {
+        const size_t OriginalLocalSize = OriginalLocalMemSize[SuccIndex];
+        if (OriginalLocalSize == 0) {
+          // Skip if successor argument isn't a local memory arg
+          continue;
+        }
+
+        // Recalculate alignment
+        size_t SuccAlignedLocalSize, SuccAlignedLocalOffset;
+        std::tie(SuccAlignedLocalSize, SuccAlignedLocalOffset) =
+            calcAlignedLocalArgument(SuccIndex, OriginalLocalSize);
+
+        // Store new local memory size
+        AlignedLocalMemSize[SuccIndex] = SuccAlignedLocalSize;
+
+        // Store new offset into local data
+        const size_t InsertPos =
+            std::accumulate(std::begin(ParamSizes),
+                            std::begin(ParamSizes) + SuccIndex, size_t{0});
+        std::memcpy(&Storage[InsertPos], &SuccAlignedLocalOffset,
+                    sizeof(size_t));
+      }
     }
 
     void addMemObjArg(int Index, ur_mem_handle_t hMem, ur_mem_flags_t Flags) {
@@ -140,15 +208,11 @@ struct ur_kernel_handle_t_ {
       std::memcpy(ImplicitOffsetArgs, ImplicitOffset, Size);
     }
 
-    void clearLocalSize() {
-      std::fill(std::begin(OffsetPerIndex), std::end(OffsetPerIndex), 0);
-    }
-
     const args_index_t &getIndices() const noexcept { return Indices; }
 
     uint32_t getLocalSize() const {
-      return std::accumulate(std::begin(OffsetPerIndex),
-                             std::end(OffsetPerIndex), 0);
+      return std::accumulate(std::begin(AlignedLocalMemSize),
+                             std::end(AlignedLocalMemSize), 0);
     }
   } Args;
 
@@ -220,6 +284,4 @@ struct ur_kernel_handle_t_ {
   }
 
   uint32_t getLocalSize() const noexcept { return Args.getLocalSize(); }
-
-  void clearLocalSize() { Args.clearLocalSize(); }
 };
diff --git a/test/conformance/device_code/saxpy_usm_local_mem.cpp b/test/conformance/device_code/saxpy_usm_local_mem.cpp
index c978caa27d..c2bc3adc5e 100644
--- a/test/conformance/device_code/saxpy_usm_local_mem.cpp
+++ b/test/conformance/device_code/saxpy_usm_local_mem.cpp
@@ -16,7 +16,7 @@ int main() {
 
     sycl_queue.submit([&](sycl::handler &cgh) {
         sycl::local_accessor<uint32_t, 1> local_mem_A(local_size, cgh);
-        sycl::local_accessor<uint32_t, 1> local_mem_B(1, cgh);
+        sycl::local_accessor<uint32_t, 1> local_mem_B(local_size * 2, cgh);
 
         cgh.parallel_for<class saxpy_usm_local_mem>(
             sycl::nd_range<1>{{array_size}, {local_size}},
@@ -25,17 +25,12 @@ int main() {
                 auto local_id = itemId.get_local_linear_id();
 
                 local_mem_A[local_id] = i;
-                if (i == 0) {
-                    local_mem_B[0] = 0xA;
-                }
+                local_mem_B[local_id * 2] = -i;
+                local_mem_B[(local_id * 2) + 1] = itemId.get_local_range(0);
 
                 Z[i] = A * X[i] + Y[i] + local_mem_A[local_id] +
-                       itemId.get_local_range(0);
-
-                if (i == 0) {
-                    Z[i] += local_mem_B[0];
-                }
-
+                       local_mem_B[local_id * 2] +
+                       local_mem_B[(local_id * 2) + 1];
             });
     });
     return 0;
diff --git a/test/conformance/exp_command_buffer/exp_command_buffer_adapter_native_cpu.match b/test/conformance/exp_command_buffer/exp_command_buffer_adapter_native_cpu.match
index c6fe7ad962..e0543e81ed 100644
--- a/test/conformance/exp_command_buffer/exp_command_buffer_adapter_native_cpu.match
+++ b/test/conformance/exp_command_buffer/exp_command_buffer_adapter_native_cpu.match
@@ -37,7 +37,10 @@
 {{OPT}}KernelCommandEventSyncUpdateTest.TwoWaitEvents/*
 {{OPT}}KernelCommandEventSyncUpdateTest.InvalidWaitUpdate/*
 {{OPT}}KernelCommandEventSyncUpdateTest.InvalidSignalUpdate/*
-{{OPT}}LocalMemoryUpdateTest.UpdateParameters/*
-{{OPT}}LocalMemoryUpdateTest.UpdateParametersAndLocalSize/*
+{{OPT}}LocalMemoryUpdateTest.UpdateParametersSameLocalSize/*
+{{OPT}}LocalMemoryUpdateTest.UpdateParametersEmptyLocalSize/*
+{{OPT}}LocalMemoryUpdateTest.UpdateParametersSmallerLocalSize/*
+{{OPT}}LocalMemoryUpdateTest.UpdateParametersLargerLocalSize/*
+{{OPT}}LocalMemoryUpdateTest.UpdateParametersPartialLocalSize/*
 {{OPT}}LocalMemoryMultiUpdateTest.UpdateParameters/*
 {{OPT}}LocalMemoryMultiUpdateTest.UpdateWithoutBlocking/*
diff --git a/test/conformance/exp_command_buffer/update/local_memory_update.cpp b/test/conformance/exp_command_buffer/update/local_memory_update.cpp
index b9a9e1ba01..273f770303 100644
--- a/test/conformance/exp_command_buffer/update/local_memory_update.cpp
+++ b/test/conformance/exp_command_buffer/update/local_memory_update.cpp
@@ -86,10 +86,7 @@ struct LocalMemoryUpdateTestBase
     void Validate(uint32_t *output, uint32_t *X, uint32_t *Y, uint32_t A,
                   size_t length, size_t local_size) {
         for (size_t i = 0; i < length; i++) {
-            uint32_t result = A * X[i] + Y[i] + i + local_size;
-            if (i == 0) {
-                result += 0xA;
-            }
+            uint32_t result = A * X[i] + Y[i] + local_size;
             ASSERT_EQ(result, output[i]);
         }
     }
@@ -107,7 +104,7 @@ struct LocalMemoryUpdateTestBase
 
     static constexpr size_t local_size = 4;
     static constexpr size_t local_mem_a_size = local_size * sizeof(uint32_t);
-    static constexpr size_t local_mem_b_size = sizeof(uint32_t);
+    static constexpr size_t local_mem_b_size = local_mem_a_size * 2;
     static constexpr size_t global_size = 16;
     static constexpr size_t global_offset = 0;
     static constexpr size_t n_dimensions = 1;
@@ -246,7 +243,7 @@ TEST_P(LocalMemoryUpdateTest, UpdateParametersSameLocalSize) {
 
 // Test updating A,X,Y parameters to new values and omitting local memory parameters
 // from the update.
-TEST_P(LocalMemoryUpdateTest, DISABLED_UpdateParametersEmptyLocalSize) {
+TEST_P(LocalMemoryUpdateTest, UpdateParametersEmptyLocalSize) {
     // Run command-buffer prior to update an verify output
     ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0,
                                              nullptr, nullptr));
@@ -275,7 +272,7 @@ TEST_P(LocalMemoryUpdateTest, DISABLED_UpdateParametersEmptyLocalSize) {
     };
 
     // New X at index 4
-    new_input_descs[1] = {
+    new_input_descs[0] = {
         UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_POINTER_ARG_DESC, // stype
         nullptr,                                                      // pNext
         4 + (2 * hip_arg_offset), // argIndex
@@ -284,7 +281,7 @@ TEST_P(LocalMemoryUpdateTest, DISABLED_UpdateParametersEmptyLocalSize) {
     };
 
     // New Y at index 5
-    new_input_descs[2] = {
+    new_input_descs[1] = {
         UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_POINTER_ARG_DESC, // stype
         nullptr,                                                      // pNext
         5 + (2 * hip_arg_offset), // argIndex
@@ -324,8 +321,163 @@ TEST_P(LocalMemoryUpdateTest, DISABLED_UpdateParametersEmptyLocalSize) {
 }
 
 // Test updating A,X,Y parameters to new values and local memory parameters
-// to new values.
-TEST_P(LocalMemoryUpdateTest, UpdateParametersDifferentLocalSize) {
+// to new smaller values.
+TEST_P(LocalMemoryUpdateTest, UpdateParametersSmallerLocalSize) {
+    // Run command-buffer prior to update an verify output
+    ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0,
+                                             nullptr, nullptr));
+    ASSERT_SUCCESS(urQueueFinish(queue));
+
+    uint32_t *output = (uint32_t *)shared_ptrs[0];
+    uint32_t *X = (uint32_t *)shared_ptrs[1];
+    uint32_t *Y = (uint32_t *)shared_ptrs[2];
+    Validate(output, X, Y, A, global_size, local_size);
+
+    // Update inputs
+    ur_exp_command_buffer_update_pointer_arg_desc_t new_input_descs[2];
+    std::vector<ur_exp_command_buffer_update_value_arg_desc_t>
+        new_value_descs{};
+
+    size_t new_local_size = 2;
+    size_t new_local_mem_a_size = new_local_size * sizeof(uint32_t);
+    // New local_mem_a at index 0
+    new_value_descs.push_back({
+        UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype
+        nullptr,                                                    // pNext
+        0,                                                          // argIndex
+        new_local_mem_a_size,                                       // argSize
+        nullptr, // pProperties
+        nullptr, // hArgValue
+    });
+
+    if (backend == UR_PLATFORM_BACKEND_HIP) {
+        new_value_descs.push_back({
+            UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype
+            nullptr,                                                    // pNext
+            1,                      // argIndex
+            sizeof(new_local_size), // argSize
+            nullptr,                // pProperties
+            &new_local_size,        // hArgValue
+        });
+        new_value_descs.push_back({
+            UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype
+            nullptr,                                                    // pNext
+            2,                      // argIndex
+            sizeof(new_local_size), // argSize
+            nullptr,                // pProperties
+            &new_local_size,        // hArgValue
+        });
+        new_value_descs.push_back({
+            UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype
+            nullptr,                                                    // pNext
+            3,                      // argIndex
+            sizeof(new_local_size), // argSize
+            nullptr,                // pProperties
+            &new_local_size,        // hArgValue
+        });
+    }
+
+    // New local_mem_b at index 1
+    size_t new_local_mem_b_size = new_local_size * sizeof(uint32_t) * 2;
+    new_value_descs.push_back({
+        UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype
+        nullptr,                                                    // pNext
+        1 + hip_arg_offset,                                         // argIndex
+        new_local_mem_b_size,                                       // argSize
+        nullptr, // pProperties
+        nullptr, // hArgValue
+    });
+
+    if (backend == UR_PLATFORM_BACKEND_HIP) {
+        new_value_descs.push_back({
+            UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype
+            nullptr,                                                    // pNext
+            5,                      // argIndex
+            sizeof(new_local_size), // argSize
+            nullptr,                // pProperties
+            &new_local_size,        // hArgValue
+        });
+        new_value_descs.push_back({
+            UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype
+            nullptr,                                                    // pNext
+            6,                      // argIndex
+            sizeof(new_local_size), // argSize
+            nullptr,                // pProperties
+            &new_local_size,        // hArgValue
+        });
+        new_value_descs.push_back({
+            UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype
+            nullptr,                                                    // pNext
+            7,                      // argIndex
+            sizeof(new_local_size), // argSize
+            nullptr,                // pProperties
+            &new_local_size,        // hArgValue
+        });
+    }
+
+    // New A at index 3
+    uint32_t new_A = 33;
+    new_value_descs.push_back({
+        UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype
+        nullptr,                                                    // pNext
+        3 + (2 * hip_arg_offset),                                   // argIndex
+        sizeof(new_A),                                              // argSize
+        nullptr, // pProperties
+        &new_A,  // hArgValue
+    });
+
+    // New X at index 4
+    new_input_descs[0] = {
+        UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_POINTER_ARG_DESC, // stype
+        nullptr,                                                      // pNext
+        4 + (2 * hip_arg_offset), // argIndex
+        nullptr,                  // pProperties
+        &shared_ptrs[3],          // pArgValue
+    };
+
+    // New Y at index 5
+    new_input_descs[1] = {
+        UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_POINTER_ARG_DESC, // stype
+        nullptr,                                                      // pNext
+        5 + (2 * hip_arg_offset), // argIndex
+        nullptr,                  // pProperties
+        &shared_ptrs[4],          // pArgValue
+    };
+
+    // Update kernel inputs
+    ur_exp_command_buffer_update_kernel_launch_desc_t update_desc = {
+        UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_KERNEL_LAUNCH_DESC, // stype
+        nullptr,                                                        // pNext
+        kernel,                                        // hNewKernel
+        0,                                             // numNewMemObjArgs
+        2,                                             // numNewPointerArgs
+        static_cast<uint32_t>(new_value_descs.size()), // numNewValueArgs
+        n_dimensions,                                  // newWorkDim
+        nullptr,                                       // pNewMemObjArgList
+        new_input_descs,                               // pNewPointerArgList
+        new_value_descs.data(),                        // pNewValueArgList
+        nullptr,                                       // pNewGlobalWorkOffset
+        nullptr,                                       // pNewGlobalWorkSize
+        &new_local_size,                               // pNewLocalWorkSize
+    };
+
+    // Update kernel and enqueue command-buffer again
+    ASSERT_SUCCESS(
+        urCommandBufferUpdateKernelLaunchExp(command_handle, &update_desc));
+    ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0,
+                                             nullptr, nullptr));
+    ASSERT_SUCCESS(urQueueFinish(queue));
+
+    // Verify that update occurred correctly
+    uint32_t *new_output = (uint32_t *)shared_ptrs[0];
+    uint32_t *new_X = (uint32_t *)shared_ptrs[3];
+    uint32_t *new_Y = (uint32_t *)shared_ptrs[4];
+    Validate(new_output, new_X, new_Y, new_A, global_size, new_local_size);
+}
+
+// Test updating A,X,Y parameters to new values and local memory parameters
+// to new larger values.
+TEST_P(LocalMemoryUpdateTest, UpdateParametersLargerLocalSize) {
     // Run command-buffer prior to update an verify output
     ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0,
                                              nullptr, nullptr));
@@ -341,14 +493,14 @@ TEST_P(LocalMemoryUpdateTest, UpdateParametersDifferentLocalSize) {
     std::vector<ur_exp_command_buffer_update_value_arg_desc_t>
         new_value_descs{};
 
-    size_t new_local_size = local_size * 2;
-    size_t new_local_mem_size = new_local_size * sizeof(uint32_t);
+    size_t new_local_size = local_size * 4;
+    size_t new_local_mem_a_size = new_local_size * sizeof(uint32_t);
     // New local_mem_a at index 0
     new_value_descs.push_back({
         UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype
         nullptr,                                                    // pNext
         0,                                                          // argIndex
-        new_local_mem_size,                                         // argSize
+        new_local_mem_a_size,                                       // argSize
         nullptr, // pProperties
         nullptr, // hArgValue
     });
@@ -381,11 +533,12 @@ TEST_P(LocalMemoryUpdateTest, UpdateParametersDifferentLocalSize) {
     }
 
     // New local_mem_b at index 1
+    size_t new_local_mem_b_size = new_local_size * sizeof(uint32_t) * 2;
     new_value_descs.push_back({
         UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype
         nullptr,                                                    // pNext
         1 + hip_arg_offset,                                         // argIndex
-        new_local_mem_size,                                         // argSize
+        new_local_mem_b_size,                                       // argSize
         nullptr, // pProperties
         nullptr, // hArgValue
     });
@@ -478,7 +631,8 @@ TEST_P(LocalMemoryUpdateTest, UpdateParametersDifferentLocalSize) {
 }
 
 // Test updating A,X,Y parameters to new values and only one of the local memory
-// parameters, which is set to a new values.
+// parameters, which is set to a new values. Then a separate update call for
+// the other local memory argument.
 TEST_P(LocalMemoryUpdateTest, UpdateParametersPartialLocalSize) {
     // Run command-buffer prior to update an verify output
     ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0,
@@ -495,14 +649,14 @@ TEST_P(LocalMemoryUpdateTest, UpdateParametersPartialLocalSize) {
     std::vector<ur_exp_command_buffer_update_value_arg_desc_t>
         new_value_descs{};
 
-    size_t new_local_size = local_size * 2;
-    size_t new_local_mem_size = new_local_size * sizeof(uint32_t);
+    size_t new_local_size = local_size * 4;
+    size_t new_local_mem_a_size = new_local_size * sizeof(uint32_t);
     // New local_mem_a at index 0
     new_value_descs.push_back({
         UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype
         nullptr,                                                    // pNext
         0,                                                          // argIndex
-        new_local_mem_size,                                         // argSize
+        new_local_mem_a_size,                                       // argSize
         nullptr, // pProperties
         nullptr, // hArgValue
     });
@@ -583,6 +737,67 @@ TEST_P(LocalMemoryUpdateTest, UpdateParametersPartialLocalSize) {
     // Update kernel and enqueue command-buffer again
     ASSERT_SUCCESS(
         urCommandBufferUpdateKernelLaunchExp(command_handle, &update_desc));
+
+    std::vector<ur_exp_command_buffer_update_value_arg_desc_t>
+        second_update_value_args{};
+
+    size_t new_local_mem_b_size = new_local_size * sizeof(uint32_t) * 2;
+    // New local_mem_b at index 1
+    second_update_value_args.push_back({
+        UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype
+        nullptr,                                                    // pNext
+        1 + hip_arg_offset,                                         // argIndex
+        new_local_mem_b_size,                                       // argSize
+        nullptr, // pProperties
+        nullptr, // hArgValue
+    });
+
+    if (backend == UR_PLATFORM_BACKEND_HIP) {
+        second_update_value_args.push_back({
+            UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype
+            nullptr,                                                    // pNext
+            5,                      // argIndex
+            sizeof(new_local_size), // argSize
+            nullptr,                // pProperties
+            &new_local_size,        // hArgValue
+        });
+        second_update_value_args.push_back({
+            UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype
+            nullptr,                                                    // pNext
+            6,                      // argIndex
+            sizeof(new_local_size), // argSize
+            nullptr,                // pProperties
+            &new_local_size,        // hArgValue
+        });
+        second_update_value_args.push_back({
+            UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype
+            nullptr,                                                    // pNext
+            7,                      // argIndex
+            sizeof(new_local_size), // argSize
+            nullptr,                // pProperties
+            &new_local_size,        // hArgValue
+        });
+    }
+
+    ur_exp_command_buffer_update_kernel_launch_desc_t second_update_desc = {
+        UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_KERNEL_LAUNCH_DESC, // stype
+        nullptr,                                                        // pNext
+        kernel, // hNewKernel
+        0,      // numNewMemObjArgs
+        0,      // numNewPointerArgs
+        static_cast<uint32_t>(
+            second_update_value_args.size()), // numNewValueArgs
+        n_dimensions,                         // newWorkDim
+        nullptr,                              // pNewMemObjArgList
+        nullptr,                              // pNewPointerArgList
+        second_update_value_args.data(),      // pNewValueArgList
+        nullptr,                              // pNewGlobalWorkOffset
+        nullptr,                              // pNewGlobalWorkSize
+        nullptr,                              // pNewLocalWorkSize
+    };
+    ASSERT_SUCCESS(urCommandBufferUpdateKernelLaunchExp(command_handle,
+                                                        &second_update_desc));
+
     ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0,
                                              nullptr, nullptr));
     ASSERT_SUCCESS(urQueueFinish(queue));
@@ -597,15 +812,8 @@ TEST_P(LocalMemoryUpdateTest, UpdateParametersPartialLocalSize) {
 struct LocalMemoryMultiUpdateTest : LocalMemoryUpdateTestBase {
     void SetUp() override {
         UUR_RETURN_ON_FATAL_FAILURE(LocalMemoryUpdateTestBase::SetUp());
-
         // Append kernel command to command-buffer and close command-buffer
         for (unsigned node = 0; node < nodes; node++) {
-#if 1 // TODO can we remove?                                                   \
-    // We need to set the local memory arg each time because it is             \
-    // cleared in the kernel handle after being used.
-            ASSERT_SUCCESS(
-                urKernelSetArgLocal(kernel, 0, local_mem_a_size, nullptr));
-#endif
             ASSERT_SUCCESS(urCommandBufferAppendKernelLaunchExp(
                 updatable_cmd_buf_handle, kernel, n_dimensions, &global_offset,
                 &global_size, &local_size, 0, nullptr, 0, nullptr, 0, nullptr,

From 0b5bc826d1f128bb3f57892f60ecaddcf4f358ea Mon Sep 17 00:00:00 2001
From: Ewan Crawford <ewan@codeplay.com>
Date: Fri, 15 Nov 2024 11:22:41 +0000
Subject: [PATCH 30/36] Add extra more basic CTS test

---
 ...xp_command_buffer_adapter_native_cpu.match |  1 +
 .../update/local_memory_update.cpp            | 69 +++++++++++++++++++
 2 files changed, 70 insertions(+)

diff --git a/test/conformance/exp_command_buffer/exp_command_buffer_adapter_native_cpu.match b/test/conformance/exp_command_buffer/exp_command_buffer_adapter_native_cpu.match
index e0543e81ed..3588eaea82 100644
--- a/test/conformance/exp_command_buffer/exp_command_buffer_adapter_native_cpu.match
+++ b/test/conformance/exp_command_buffer/exp_command_buffer_adapter_native_cpu.match
@@ -38,6 +38,7 @@
 {{OPT}}KernelCommandEventSyncUpdateTest.InvalidWaitUpdate/*
 {{OPT}}KernelCommandEventSyncUpdateTest.InvalidSignalUpdate/*
 {{OPT}}LocalMemoryUpdateTest.UpdateParametersSameLocalSize/*
+{{OPT}}LocalMemoryUpdateTest.UpdateLocalOnly/*
 {{OPT}}LocalMemoryUpdateTest.UpdateParametersEmptyLocalSize/*
 {{OPT}}LocalMemoryUpdateTest.UpdateParametersSmallerLocalSize/*
 {{OPT}}LocalMemoryUpdateTest.UpdateParametersLargerLocalSize/*
diff --git a/test/conformance/exp_command_buffer/update/local_memory_update.cpp b/test/conformance/exp_command_buffer/update/local_memory_update.cpp
index 273f770303..2d0f24cb4e 100644
--- a/test/conformance/exp_command_buffer/update/local_memory_update.cpp
+++ b/test/conformance/exp_command_buffer/update/local_memory_update.cpp
@@ -16,6 +16,11 @@ struct LocalMemoryUpdateTestBase
         UUR_RETURN_ON_FATAL_FAILURE(
             urUpdatableCommandBufferExpExecutionTest::SetUp());
 
+        if (backend == UR_PLATFORM_BACKEND_LEVEL_ZERO) {
+            GTEST_SKIP()
+                << "Local memory argument update not supported on Level Zero.";
+        }
+
         // HIP has extra args for local memory so we define an offset for arg indices here for updating
         hip_arg_offset = backend == UR_PLATFORM_BACKEND_HIP ? 3 : 0;
         ur_device_usm_access_capability_flags_t shared_usm_flags;
@@ -241,6 +246,70 @@ TEST_P(LocalMemoryUpdateTest, UpdateParametersSameLocalSize) {
     Validate(new_output, new_X, new_Y, new_A, global_size, local_size);
 }
 
+// Test only passing local memory parameters to update with the original values.
+TEST_P(LocalMemoryUpdateTest, UpdateLocalOnly) {
+    // Run command-buffer prior to update an verify output
+    ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0,
+                                             nullptr, nullptr));
+    ASSERT_SUCCESS(urQueueFinish(queue));
+
+    uint32_t *output = (uint32_t *)shared_ptrs[0];
+    uint32_t *X = (uint32_t *)shared_ptrs[1];
+    uint32_t *Y = (uint32_t *)shared_ptrs[2];
+    Validate(output, X, Y, A, global_size, local_size);
+
+    // Update inputs
+    std::array<ur_exp_command_buffer_update_value_arg_desc_t, 2>
+        new_value_descs;
+
+    // New local_mem_a at index 0
+    new_value_descs[0] = {
+        UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype
+        nullptr,                                                    // pNext
+        0,                                                          // argIndex
+        local_mem_a_size,                                           // argSize
+        nullptr, // pProperties
+        nullptr, // hArgValue
+    };
+
+    // New local_mem_b at index 1
+    new_value_descs[1] = {
+        UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype
+        nullptr,                                                    // pNext
+        1 + hip_arg_offset,                                         // argIndex
+        local_mem_b_size,                                           // argSize
+        nullptr, // pProperties
+        nullptr, // hArgValue
+    };
+
+    // Update kernel inputs
+    ur_exp_command_buffer_update_kernel_launch_desc_t update_desc = {
+        UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_KERNEL_LAUNCH_DESC, // stype
+        nullptr,                                                        // pNext
+        kernel,                 // hNewKernel
+        0,                      // numNewMemObjArgs
+        0,                      // numNewPointerArgs
+        new_value_descs.size(), // numNewValueArgs
+        n_dimensions,           // newWorkDim
+        nullptr,                // pNewMemObjArgList
+        nullptr,                // pNewPointerArgList
+        new_value_descs.data(), // pNewValueArgList
+        nullptr,                // pNewGlobalWorkOffset
+        nullptr,                // pNewGlobalWorkSize
+        nullptr,                // pNewLocalWorkSize
+    };
+
+    // Update kernel and enqueue command-buffer again
+    ASSERT_SUCCESS(
+        urCommandBufferUpdateKernelLaunchExp(command_handle, &update_desc));
+    ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0,
+                                             nullptr, nullptr));
+    ASSERT_SUCCESS(urQueueFinish(queue));
+
+    // Verify that update occurred correctly
+    Validate(output, X, Y, A, global_size, local_size);
+}
+
 // Test updating A,X,Y parameters to new values and omitting local memory parameters
 // from the update.
 TEST_P(LocalMemoryUpdateTest, UpdateParametersEmptyLocalSize) {

From a5d2bda6e0936059b5a6246c67540bf1ab2973eb Mon Sep 17 00:00:00 2001
From: Ewan Crawford <ewan.cr@gmail.com>
Date: Tue, 19 Nov 2024 08:56:29 +0000
Subject: [PATCH 31/36] FIx comment typos

Co-authored-by: Ben Tracy <ben.tracy@codeplay.com>
Co-authored-by: aarongreig <aaron.greig@codeplay.com>
---
 source/adapters/cuda/kernel.hpp                       |  2 +-
 source/adapters/hip/kernel.hpp                        |  2 +-
 .../exp_command_buffer/update/local_memory_update.cpp | 11 ++++++-----
 3 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/source/adapters/cuda/kernel.hpp b/source/adapters/cuda/kernel.hpp
index 31088da7cc..f0a554cfce 100644
--- a/source/adapters/cuda/kernel.hpp
+++ b/source/adapters/cuda/kernel.hpp
@@ -167,7 +167,7 @@ struct ur_kernel_handle_t_ {
              AlignedLocalSize);
 
       // For every existing local argument which follows at later argument
-      // indices, updated the offset and pointer into the kernel local memory.
+      // indices, update the offset and pointer into the kernel local memory.
       // Required as padding will need to be recalculated.
       const size_t NumArgs = Indices.size() - 1; // Accounts for implicit arg
       for (auto SuccIndex = Index + 1; SuccIndex < NumArgs; SuccIndex++) {
diff --git a/source/adapters/hip/kernel.hpp b/source/adapters/hip/kernel.hpp
index 08441e288f..141b1433fe 100644
--- a/source/adapters/hip/kernel.hpp
+++ b/source/adapters/hip/kernel.hpp
@@ -162,7 +162,7 @@ struct ur_kernel_handle_t_ {
              AlignedLocalSize);
 
       // For every existing local argument which follows at later argument
-      // indices, updated the offset and pointer into the kernel local memory.
+      // indices, update the offset and pointer into the kernel local memory.
       // Required as padding will need to be recalculated.
       const size_t NumArgs = Indices.size() - 1; // Accounts for implicit arg
       for (auto SuccIndex = Index + 1; SuccIndex < NumArgs; SuccIndex++) {
diff --git a/test/conformance/exp_command_buffer/update/local_memory_update.cpp b/test/conformance/exp_command_buffer/update/local_memory_update.cpp
index 2d0f24cb4e..60c55d4a4c 100644
--- a/test/conformance/exp_command_buffer/update/local_memory_update.cpp
+++ b/test/conformance/exp_command_buffer/update/local_memory_update.cpp
@@ -21,7 +21,8 @@ struct LocalMemoryUpdateTestBase
                 << "Local memory argument update not supported on Level Zero.";
         }
 
-        // HIP has extra args for local memory so we define an offset for arg indices here for updating
+        // HIP has extra args for local memory so we define an offset for arg
+        // indices here for updating
         hip_arg_offset = backend == UR_PLATFORM_BACKEND_HIP ? 3 : 0;
         ur_device_usm_access_capability_flags_t shared_usm_flags;
         ASSERT_SUCCESS(
@@ -313,7 +314,7 @@ TEST_P(LocalMemoryUpdateTest, UpdateLocalOnly) {
 // Test updating A,X,Y parameters to new values and omitting local memory parameters
 // from the update.
 TEST_P(LocalMemoryUpdateTest, UpdateParametersEmptyLocalSize) {
-    // Run command-buffer prior to update an verify output
+    // Run command-buffer prior to update and verify output
     ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0,
                                              nullptr, nullptr));
     ASSERT_SUCCESS(urQueueFinish(queue));
@@ -547,7 +548,7 @@ TEST_P(LocalMemoryUpdateTest, UpdateParametersSmallerLocalSize) {
 // Test updating A,X,Y parameters to new values and local memory parameters
 // to new larger values.
 TEST_P(LocalMemoryUpdateTest, UpdateParametersLargerLocalSize) {
-    // Run command-buffer prior to update an verify output
+    // Run command-buffer prior to update and verify output
     ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0,
                                              nullptr, nullptr));
     ASSERT_SUCCESS(urQueueFinish(queue));
@@ -700,10 +701,10 @@ TEST_P(LocalMemoryUpdateTest, UpdateParametersLargerLocalSize) {
 }
 
 // Test updating A,X,Y parameters to new values and only one of the local memory
-// parameters, which is set to a new values. Then a separate update call for
+// parameters, which is set to a new value. Then a separate update call for
 // the other local memory argument.
 TEST_P(LocalMemoryUpdateTest, UpdateParametersPartialLocalSize) {
-    // Run command-buffer prior to update an verify output
+    // Run command-buffer prior to update and verify output
     ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0,
                                              nullptr, nullptr));
     ASSERT_SUCCESS(urQueueFinish(queue));

From 7d126f30e830fb484ddff78fba8241abf3cc7bb5 Mon Sep 17 00:00:00 2001
From: Ewan Crawford <ewan@codeplay.com>
Date: Tue, 19 Nov 2024 14:02:50 +0000
Subject: [PATCH 32/36] Add non command-buffer test

---
 scripts/core/CUDA.rst                         |  31 +++
 scripts/core/HIP.rst                          |  10 +
 .../kernel/kernel_adapter_native_cpu.match    |   3 +
 .../kernel/urKernelSetArgLocal.cpp            | 200 ++++++++++++++++++
 4 files changed, 244 insertions(+)

diff --git a/scripts/core/CUDA.rst b/scripts/core/CUDA.rst
index 9771693113..d23d217ef4 100644
--- a/scripts/core/CUDA.rst
+++ b/scripts/core/CUDA.rst
@@ -148,6 +148,36 @@ take the extra global offset argument. Use of the global offset is not
 recommended for non SYCL compiler toolchains. This parameter can be ignored if
 the user does not wish to use the global offset.
 
+Local Memory Arguments
+----------------------
+
+In UR local memory is a region of memory shared by all the work-items in
+a work-group. A kernel function signature can include local memory address
+space pointer arguments, which are set by the user with
+``urKernelSetArgLocal`` with the number of bytes of local memory to allocate
+and make available from the pointer argument.
+
+The CUDA adapter implements local memory arguments to a kernel as a single
+``__shared__`` memory allocation, with each local address space pointer argument
+to the kernel converted to a byte offset parameter to the single memory
+allocation. Therefore for ``N`` local arguments that need set on a kernel with
+``urKernelSetArgLocal``, the total aligned size is calculated for the single
+memory allocation by the CUDA adapter and passed as the ``sharedMemBytes``
+argument to ``cuLaunchKernel`` (or variants like ``cuLaunchCooperativeKernel``
+or ``cudaGraphAddKernelNode``).
+
+For each kernel local memory parameter, aligned offsets into the single memory location
+are calculated and passed at runtime via ``kernelParams`` when launching the kernel (or
+adding as a graph node). When a user calls ``urKernelSetArgLocal`` with an
+argument index that has already been set the CUDA adapter recalculates the size of the
+single memory allocation and offsets of any local memory arguments at following indices.
+
+.. warning::
+
+  The CUDA UR adapter implementation of local memory assumes the kernel created
+  has been created by DPC++, instumenting the device code so that local memory
+  arguments are offsets rather than pointers.
+
 Other Notes
 ===========
 
@@ -164,4 +194,5 @@ Contributors
 ------------
 
 * Hugh Delaney `hugh.delaney@codeplay.com <hugh.delaney@codeplay.com>`_
+* Ewan Crawford `ewan@codeplay.com <ewan@codeplay.com>`_
 
diff --git a/scripts/core/HIP.rst b/scripts/core/HIP.rst
index 3ded0138ff..32d387c481 100644
--- a/scripts/core/HIP.rst
+++ b/scripts/core/HIP.rst
@@ -91,6 +91,15 @@ take the extra global offset argument. Use of the global offset is not
 recommended for non SYCL compiler toolchains. This parameter can be ignored if
 the user does not wish to use the global offset.
 
+Local Memory Arguments
+----------------------
+
+.. todo::
+   Copy and update CUDA doc
+
+.. todo::
+   Document what extra args needed on HIP arg with local accessors
+
 Other Notes
 ===========
 
@@ -100,4 +109,5 @@ Contributors
 ------------
 
 * Hugh Delaney `hugh.delaney@codeplay.com <hugh.delaney@codeplay.com>`_
+* Ewan Crawford `ewan@codeplay.com <ewan@codeplay.com>`_
 
diff --git a/test/conformance/kernel/kernel_adapter_native_cpu.match b/test/conformance/kernel/kernel_adapter_native_cpu.match
index 7ca10ec3d2..bd5333c609 100644
--- a/test/conformance/kernel/kernel_adapter_native_cpu.match
+++ b/test/conformance/kernel/kernel_adapter_native_cpu.match
@@ -38,6 +38,9 @@ urKernelRetainTest.InvalidNullHandleKernel/*
 urKernelSetArgLocalTest.Success/*
 urKernelSetArgLocalTest.InvalidNullHandleKernel/*
 urKernelSetArgLocalTest.InvalidKernelArgumentIndex/*
+urKernelSetArgLocalMultiTest.Basic/*
+urKernelSetArgLocalMultiTest.ReLaunch/*
+urKernelSetArgLocalMultiTest.Overwrite/*
 urKernelSetArgMemObjTest.Success/*
 urKernelSetArgMemObjTest.InvalidNullHandleKernel/*
 urKernelSetArgMemObjTest.InvalidKernelArgumentIndex/*
diff --git a/test/conformance/kernel/urKernelSetArgLocal.cpp b/test/conformance/kernel/urKernelSetArgLocal.cpp
index 1d3789bf3a..fcbd5194a7 100644
--- a/test/conformance/kernel/urKernelSetArgLocal.cpp
+++ b/test/conformance/kernel/urKernelSetArgLocal.cpp
@@ -3,6 +3,7 @@
 // See LICENSE.TXT
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
+#include <cstring>
 #include <uur/fixtures.h>
 
 struct urKernelSetArgLocalTest : uur::urKernelTest {
@@ -32,3 +33,202 @@ TEST_P(urKernelSetArgLocalTest, InvalidKernelArgumentIndex) {
                      urKernelSetArgLocal(kernel, num_kernel_args + 1,
                                          local_mem_size, nullptr));
 }
+
+// Test launching kernels with multiple local arguments return the expected
+// outputs
+struct urKernelSetArgLocalMultiTest : uur::urKernelExecutionTest {
+    void SetUp() override {
+        program_name = "saxpy_usm_local_mem";
+        UUR_RETURN_ON_FATAL_FAILURE(urKernelExecutionTest::SetUp());
+
+        ASSERT_SUCCESS(urPlatformGetInfo(platform, UR_PLATFORM_INFO_BACKEND,
+                                         sizeof(backend), &backend, nullptr));
+
+        // HIP has extra args for local memory so we define an offset for arg indices here for updating
+        hip_arg_offset = backend == UR_PLATFORM_BACKEND_HIP ? 3 : 0;
+        ur_device_usm_access_capability_flags_t shared_usm_flags;
+        ASSERT_SUCCESS(
+            uur::GetDeviceUSMSingleSharedSupport(device, shared_usm_flags));
+        if (!(shared_usm_flags & UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ACCESS)) {
+            GTEST_SKIP() << "Shared USM is not supported.";
+        }
+
+        const size_t allocation_size =
+            sizeof(uint32_t) * global_size * local_size;
+        for (auto &shared_ptr : shared_ptrs) {
+            ASSERT_SUCCESS(urUSMSharedAlloc(context, device, nullptr, nullptr,
+                                            allocation_size, &shared_ptr));
+            ASSERT_NE(shared_ptr, nullptr);
+
+            std::vector<uint8_t> pattern(allocation_size);
+            uur::generateMemFillPattern(pattern);
+            std::memcpy(shared_ptr, pattern.data(), allocation_size);
+        }
+        size_t current_index = 0;
+        // Index 0 is local_mem_a arg
+        ASSERT_SUCCESS(urKernelSetArgLocal(kernel, current_index++,
+                                           local_mem_a_size, nullptr));
+
+        // Hip has extra args for local mem at index 1-3
+        if (backend == UR_PLATFORM_BACKEND_HIP) {
+            ASSERT_SUCCESS(urKernelSetArgValue(kernel, current_index++,
+                                               sizeof(local_size), nullptr,
+                                               &local_size));
+            ASSERT_SUCCESS(urKernelSetArgValue(kernel, current_index++,
+                                               sizeof(local_size), nullptr,
+                                               &local_size));
+            ASSERT_SUCCESS(urKernelSetArgValue(kernel, current_index++,
+                                               sizeof(local_size), nullptr,
+                                               &local_size));
+        }
+
+        // Index 1 is local_mem_b arg
+        ASSERT_SUCCESS(urKernelSetArgLocal(kernel, current_index++,
+                                           local_mem_b_size, nullptr));
+        if (backend == UR_PLATFORM_BACKEND_HIP) {
+            ASSERT_SUCCESS(urKernelSetArgValue(kernel, current_index++,
+                                               sizeof(local_size), nullptr,
+                                               &local_size));
+            ASSERT_SUCCESS(urKernelSetArgValue(kernel, current_index++,
+                                               sizeof(local_size), nullptr,
+                                               &local_size));
+            ASSERT_SUCCESS(urKernelSetArgValue(kernel, current_index++,
+                                               sizeof(local_size), nullptr,
+                                               &local_size));
+        }
+
+        // Index 2 is output
+        ASSERT_SUCCESS(urKernelSetArgPointer(kernel, current_index++, nullptr,
+                                             shared_ptrs[0]));
+        // Index 3 is A
+        ASSERT_SUCCESS(urKernelSetArgValue(kernel, current_index++, sizeof(A),
+                                           nullptr, &A));
+        // Index 4 is X
+        ASSERT_SUCCESS(urKernelSetArgPointer(kernel, current_index++, nullptr,
+                                             shared_ptrs[1]));
+        // Index 5 is Y
+        ASSERT_SUCCESS(urKernelSetArgPointer(kernel, current_index++, nullptr,
+                                             shared_ptrs[2]));
+    }
+
+    void Validate(uint32_t *output, uint32_t *X, uint32_t *Y, uint32_t A,
+                  size_t length, size_t local_size) {
+        for (size_t i = 0; i < length; i++) {
+            uint32_t result = A * X[i] + Y[i] + local_size;
+            ASSERT_EQ(result, output[i]);
+        }
+    }
+
+    virtual void TearDown() override {
+        for (auto &shared_ptr : shared_ptrs) {
+            if (shared_ptr) {
+                EXPECT_SUCCESS(urUSMFree(context, shared_ptr));
+            }
+        }
+
+        UUR_RETURN_ON_FATAL_FAILURE(urKernelExecutionTest::TearDown());
+    }
+
+    static constexpr size_t local_size = 4;
+    static constexpr size_t local_mem_a_size = local_size * sizeof(uint32_t);
+    static constexpr size_t local_mem_b_size = local_mem_a_size * 2;
+    static constexpr size_t global_size = 16;
+    static constexpr size_t global_offset = 0;
+    static constexpr size_t n_dimensions = 1;
+    static constexpr uint32_t A = 42;
+    std::array<void *, 5> shared_ptrs = {nullptr, nullptr, nullptr, nullptr,
+                                         nullptr};
+
+    uint32_t hip_arg_offset = 0;
+    ur_platform_backend_t backend{};
+};
+UUR_INSTANTIATE_KERNEL_TEST_SUITE_P(urKernelSetArgLocalMultiTest);
+
+TEST_P(urKernelSetArgLocalMultiTest, Basic) {
+    ASSERT_SUCCESS(urEnqueueKernelLaunch(queue, kernel, n_dimensions,
+                                         &global_offset, &global_size,
+                                         &local_size, 0, nullptr, nullptr));
+    ASSERT_SUCCESS(urQueueFinish(queue));
+
+    uint32_t *output = (uint32_t *)shared_ptrs[0];
+    uint32_t *X = (uint32_t *)shared_ptrs[1];
+    uint32_t *Y = (uint32_t *)shared_ptrs[2];
+    Validate(output, X, Y, A, global_size, local_size);
+}
+
+TEST_P(urKernelSetArgLocalMultiTest, ReLaunch) {
+    ASSERT_SUCCESS(urEnqueueKernelLaunch(queue, kernel, n_dimensions,
+                                         &global_offset, &global_size,
+                                         &local_size, 0, nullptr, nullptr));
+    ASSERT_SUCCESS(urQueueFinish(queue));
+
+    uint32_t *output = (uint32_t *)shared_ptrs[0];
+    uint32_t *X = (uint32_t *)shared_ptrs[1];
+    uint32_t *Y = (uint32_t *)shared_ptrs[2];
+    Validate(output, X, Y, A, global_size, local_size);
+
+    // Relaunch with new arguments
+    ASSERT_SUCCESS(urEnqueueKernelLaunch(queue, kernel, n_dimensions,
+                                         &global_offset, &global_size,
+                                         &local_size, 0, nullptr, nullptr));
+    ASSERT_SUCCESS(urQueueFinish(queue));
+    uint32_t *new_output = (uint32_t *)shared_ptrs[0];
+    uint32_t *new_X = (uint32_t *)shared_ptrs[3];
+    uint32_t *new_Y = (uint32_t *)shared_ptrs[4];
+    Validate(new_output, new_X, new_Y, A, global_size, local_size);
+}
+
+// Overwrite local args to a larger value, then reset back to original
+TEST_P(urKernelSetArgLocalMultiTest, Overwrite) {
+    ASSERT_SUCCESS(urEnqueueKernelLaunch(queue, kernel, n_dimensions,
+                                         &global_offset, &global_size,
+                                         &local_size, 0, nullptr, nullptr));
+    ASSERT_SUCCESS(urQueueFinish(queue));
+
+    uint32_t *output = (uint32_t *)shared_ptrs[0];
+    uint32_t *X = (uint32_t *)shared_ptrs[1];
+    uint32_t *Y = (uint32_t *)shared_ptrs[2];
+    Validate(output, X, Y, A, global_size, local_size);
+
+    size_t new_local_size = 2;
+    size_t new_local_mem_a_size = new_local_size * sizeof(uint32_t);
+    size_t new_local_mem_b_size = new_local_size * sizeof(uint32_t) * 2;
+    size_t current_index = 0;
+    ASSERT_SUCCESS(urKernelSetArgLocal(kernel, current_index++,
+                                       new_local_mem_a_size, nullptr));
+
+    // Hip has extra args for local mem at index 1-3
+    if (backend == UR_PLATFORM_BACKEND_HIP) {
+        ASSERT_SUCCESS(urKernelSetArgValue(kernel, current_index++,
+                                           sizeof(new_local_size), nullptr,
+                                           &new_local_size));
+        ASSERT_SUCCESS(urKernelSetArgValue(kernel, current_index++,
+                                           sizeof(new_local_size), nullptr,
+                                           &new_local_size));
+        ASSERT_SUCCESS(urKernelSetArgValue(kernel, current_index++,
+                                           sizeof(new_local_size), nullptr,
+                                           &new_local_size));
+    }
+
+    // Index 1 is local_mem_b arg
+    ASSERT_SUCCESS(urKernelSetArgLocal(kernel, current_index++,
+                                       new_local_mem_b_size, nullptr));
+    if (backend == UR_PLATFORM_BACKEND_HIP) {
+        ASSERT_SUCCESS(urKernelSetArgValue(kernel, current_index++,
+                                           sizeof(new_local_size), nullptr,
+                                           &new_local_size));
+        ASSERT_SUCCESS(urKernelSetArgValue(kernel, current_index++,
+                                           sizeof(new_local_size), nullptr,
+                                           &new_local_size));
+        ASSERT_SUCCESS(urKernelSetArgValue(kernel, current_index++,
+                                           sizeof(new_local_size), nullptr,
+                                           &new_local_size));
+    }
+
+    ASSERT_SUCCESS(urEnqueueKernelLaunch(queue, kernel, n_dimensions,
+                                         &global_offset, &global_size,
+                                         &new_local_size, 0, nullptr, nullptr));
+    ASSERT_SUCCESS(urQueueFinish(queue));
+
+    Validate(output, X, Y, A, global_size, new_local_size);
+}

From e578228aa259c84f0c909b2993ca6659a0dfcd86 Mon Sep 17 00:00:00 2001
From: Ewan Crawford <ewan@codeplay.com>
Date: Wed, 20 Nov 2024 12:41:46 +0000
Subject: [PATCH 33/36] Document hip extra arg behavior

---
 scripts/core/CUDA.rst                         |  27 +--
 scripts/core/HIP.rst                          |  41 ++++-
 source/adapters/cuda/kernel.hpp               |   6 +-
 source/adapters/hip/kernel.hpp                |   6 +-
 .../update/local_memory_update.cpp            | 169 +++++++++---------
 .../kernel/urKernelSetArgLocal.cpp            |  49 ++---
 6 files changed, 165 insertions(+), 133 deletions(-)

diff --git a/scripts/core/CUDA.rst b/scripts/core/CUDA.rst
index d23d217ef4..08b61bf9dc 100644
--- a/scripts/core/CUDA.rst
+++ b/scripts/core/CUDA.rst
@@ -157,25 +157,28 @@ space pointer arguments, which are set by the user with
 ``urKernelSetArgLocal`` with the number of bytes of local memory to allocate
 and make available from the pointer argument.
 
-The CUDA adapter implements local memory arguments to a kernel as a single
-``__shared__`` memory allocation, with each local address space pointer argument
-to the kernel converted to a byte offset parameter to the single memory
-allocation. Therefore for ``N`` local arguments that need set on a kernel with
-``urKernelSetArgLocal``, the total aligned size is calculated for the single
+The CUDA adapter implements local memory in a kernel as a single ``__shared__``
+memory allocation, and each individual local memory argument is a ``u32`` byte
+offset kernel parameter which is combined inside the kernel with the
+``__shared__`` memory allocation. Therefore for ``N`` local arguments that need
+set on a kernel with ``urKernelSetArgLocal``, the total aligned size across the
+``N`` calls to ``urKernelSetArgLocal`` is calculated for the ``__shared__``
 memory allocation by the CUDA adapter and passed as the ``sharedMemBytes``
 argument to ``cuLaunchKernel`` (or variants like ``cuLaunchCooperativeKernel``
-or ``cudaGraphAddKernelNode``).
+or ``cuGraphAddKernelNode``).
 
-For each kernel local memory parameter, aligned offsets into the single memory location
-are calculated and passed at runtime via ``kernelParams`` when launching the kernel (or
-adding as a graph node). When a user calls ``urKernelSetArgLocal`` with an
-argument index that has already been set the CUDA adapter recalculates the size of the
-single memory allocation and offsets of any local memory arguments at following indices.
+For each kernel ``u32`` local memory offset parameter, aligned offsets into the
+single memory location are calculated and passed at runtime by the adapter via
+``kernelParams`` when launching the kernel (or adding the kernel as a graph
+node). When a user calls ``urKernelSetArgLocal`` with an argument index that
+has already been set on the kernel, the adapter recalculates the size of the
+``__shared__`` memory allocation and offset for the index, as well as the
+offsets of any local memory arguments at following indices.
 
 .. warning::
 
   The CUDA UR adapter implementation of local memory assumes the kernel created
-  has been created by DPC++, instumenting the device code so that local memory
+  has been created by DPC++, instrumenting the device code so that local memory
   arguments are offsets rather than pointers.
 
 Other Notes
diff --git a/scripts/core/HIP.rst b/scripts/core/HIP.rst
index 32d387c481..920a5f5a3e 100644
--- a/scripts/core/HIP.rst
+++ b/scripts/core/HIP.rst
@@ -94,11 +94,42 @@ the user does not wish to use the global offset.
 Local Memory Arguments
 ----------------------
 
-.. todo::
-   Copy and update CUDA doc
-
-.. todo::
-   Document what extra args needed on HIP arg with local accessors
+In UR local memory is a region of memory shared by all the work-items in
+a work-group. A kernel function signature can include local memory address
+space pointer arguments, which are set by the user with
+``urKernelSetArgLocal`` with the number of bytes of local memory to allocate
+and make available from the pointer argument.
+
+The HIP adapter implements local memory in a kernel as a single ``__shared__``
+memory allocation, and each individual local memory argument is a ``u32`` byte
+offset kernel parameter which is combined inside the kernel with the
+``__shared__`` memory allocation. Therefore for ``N`` local arguments that need
+set on a kernel with ``urKernelSetArgLocal``, the total aligned size across the
+``N`` calls to ``urKernelSetArgLocal`` is calculated for the ``__shared__``
+memory allocation by the HIP adapter and passed as the ``sharedMemBytes``
+argument to ``hipModuleLaunchKernel`` or ``hipGraphAddKernelNode``.
+
+For each kernel ``u32`` local memory offset parameter, aligned offsets into the
+single memory location are calculated and passed at runtime by the adapter via
+``kernelParams`` when launching the kernel (or adding the kernel as a graph
+node). When a user calls ``urKernelSetArgLocal`` with an argument index that
+has already been set on the kernel, the adapter recalculates the size of the
+``__shared__`` memory allocation and offset for the index, as well as the
+offsets of any local memory arguments at following indices.
+
+.. warning::
+
+  The HIP UR adapter implementation of local memory assumes the kernel created
+  has been created by DPC++, instrumenting the device code so that local memory
+  arguments are offsets rather than pointers.
+
+
+HIP kernels that are generated for DPC++ kernels with SYCL local accessors
+contain extra value arguments on top of the local memory argument for the
+local accessor. For each ``urKernelSetArgLocal`` argument, a user needs
+to make 3 calls to ``urKernelSetArgValue`` with each of the next 3 consecutive
+argument indexes. This represents a 3 dimensional offset into the local
+accessor.
 
 Other Notes
 ===========
diff --git a/source/adapters/cuda/kernel.hpp b/source/adapters/cuda/kernel.hpp
index f0a554cfce..2b04dfba43 100644
--- a/source/adapters/cuda/kernel.hpp
+++ b/source/adapters/cuda/kernel.hpp
@@ -158,8 +158,7 @@ struct ur_kernel_handle_t_ {
 
     void addLocalArg(size_t Index, size_t Size) {
       // Get the aligned argument size and offset into local data
-      size_t AlignedLocalSize, AlignedLocalOffset;
-      std::tie(AlignedLocalSize, AlignedLocalOffset) =
+      auto [AlignedLocalSize, AlignedLocalOffset] =
           calcAlignedLocalArgument(Index, Size);
 
       // Store argument details
@@ -178,8 +177,7 @@ struct ur_kernel_handle_t_ {
         }
 
         // Recalculate alignment
-        size_t SuccAlignedLocalSize, SuccAlignedLocalOffset;
-        std::tie(SuccAlignedLocalSize, SuccAlignedLocalOffset) =
+        auto [SuccAlignedLocalSize, SuccAlignedLocalOffset] =
             calcAlignedLocalArgument(SuccIndex, OriginalLocalSize);
 
         // Store new local memory size
diff --git a/source/adapters/hip/kernel.hpp b/source/adapters/hip/kernel.hpp
index 141b1433fe..c6d30e81ad 100644
--- a/source/adapters/hip/kernel.hpp
+++ b/source/adapters/hip/kernel.hpp
@@ -153,8 +153,7 @@ struct ur_kernel_handle_t_ {
 
     void addLocalArg(size_t Index, size_t Size) {
       // Get the aligned argument size and offset into local data
-      size_t AlignedLocalSize, AlignedLocalOffset;
-      std::tie(AlignedLocalSize, AlignedLocalOffset) =
+      auto [AlignedLocalSize, AlignedLocalOffset] =
           calcAlignedLocalArgument(Index, Size);
 
       // Store argument details
@@ -173,8 +172,7 @@ struct ur_kernel_handle_t_ {
         }
 
         // Recalculate alignment
-        size_t SuccAlignedLocalSize, SuccAlignedLocalOffset;
-        std::tie(SuccAlignedLocalSize, SuccAlignedLocalOffset) =
+        auto [SuccAlignedLocalSize, SuccAlignedLocalOffset] =
             calcAlignedLocalArgument(SuccIndex, OriginalLocalSize);
 
         // Store new local memory size
diff --git a/test/conformance/exp_command_buffer/update/local_memory_update.cpp b/test/conformance/exp_command_buffer/update/local_memory_update.cpp
index 60c55d4a4c..c467c9783a 100644
--- a/test/conformance/exp_command_buffer/update/local_memory_update.cpp
+++ b/test/conformance/exp_command_buffer/update/local_memory_update.cpp
@@ -50,14 +50,14 @@ struct LocalMemoryUpdateTestBase
         // Hip has extra args for local mem at index 1-3
         if (backend == UR_PLATFORM_BACKEND_HIP) {
             ASSERT_SUCCESS(urKernelSetArgValue(kernel, current_index++,
-                                               sizeof(local_size), nullptr,
-                                               &local_size));
+                                               sizeof(hip_local_offset),
+                                               nullptr, &hip_local_offset));
             ASSERT_SUCCESS(urKernelSetArgValue(kernel, current_index++,
-                                               sizeof(local_size), nullptr,
-                                               &local_size));
+                                               sizeof(hip_local_offset),
+                                               nullptr, &hip_local_offset));
             ASSERT_SUCCESS(urKernelSetArgValue(kernel, current_index++,
-                                               sizeof(local_size), nullptr,
-                                               &local_size));
+                                               sizeof(hip_local_offset),
+                                               nullptr, &hip_local_offset));
         }
 
         // Index 1 is local_mem_b arg
@@ -65,14 +65,14 @@ struct LocalMemoryUpdateTestBase
                                            local_mem_b_size, nullptr));
         if (backend == UR_PLATFORM_BACKEND_HIP) {
             ASSERT_SUCCESS(urKernelSetArgValue(kernel, current_index++,
-                                               sizeof(local_size), nullptr,
-                                               &local_size));
+                                               sizeof(hip_local_offset),
+                                               nullptr, &hip_local_offset));
             ASSERT_SUCCESS(urKernelSetArgValue(kernel, current_index++,
-                                               sizeof(local_size), nullptr,
-                                               &local_size));
+                                               sizeof(hip_local_offset),
+                                               nullptr, &hip_local_offset));
             ASSERT_SUCCESS(urKernelSetArgValue(kernel, current_index++,
-                                               sizeof(local_size), nullptr,
-                                               &local_size));
+                                               sizeof(hip_local_offset),
+                                               nullptr, &hip_local_offset));
         }
 
         // Index 2 is output
@@ -119,6 +119,7 @@ struct LocalMemoryUpdateTestBase
                                          nullptr};
 
     uint32_t hip_arg_offset = 0;
+    static constexpr uint64_t hip_local_offset = 0;
 };
 
 struct LocalMemoryUpdateTest : LocalMemoryUpdateTestBase {
@@ -424,26 +425,26 @@ TEST_P(LocalMemoryUpdateTest, UpdateParametersSmallerLocalSize) {
         new_value_descs.push_back({
             UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype
             nullptr,                                                    // pNext
-            1,                      // argIndex
-            sizeof(new_local_size), // argSize
-            nullptr,                // pProperties
-            &new_local_size,        // hArgValue
+            1,                        // argIndex
+            sizeof(hip_local_offset), // argSize
+            nullptr,                  // pProperties
+            &hip_local_offset,        // hArgValue
         });
         new_value_descs.push_back({
             UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype
             nullptr,                                                    // pNext
-            2,                      // argIndex
-            sizeof(new_local_size), // argSize
-            nullptr,                // pProperties
-            &new_local_size,        // hArgValue
+            2,                        // argIndex
+            sizeof(hip_local_offset), // argSize
+            nullptr,                  // pProperties
+            &hip_local_offset,        // hArgValue
         });
         new_value_descs.push_back({
             UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype
             nullptr,                                                    // pNext
-            3,                      // argIndex
-            sizeof(new_local_size), // argSize
-            nullptr,                // pProperties
-            &new_local_size,        // hArgValue
+            3,                        // argIndex
+            sizeof(hip_local_offset), // argSize
+            nullptr,                  // pProperties
+            &hip_local_offset,        // hArgValue
         });
     }
 
@@ -462,26 +463,26 @@ TEST_P(LocalMemoryUpdateTest, UpdateParametersSmallerLocalSize) {
         new_value_descs.push_back({
             UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype
             nullptr,                                                    // pNext
-            5,                      // argIndex
-            sizeof(new_local_size), // argSize
-            nullptr,                // pProperties
-            &new_local_size,        // hArgValue
+            5,                        // argIndex
+            sizeof(hip_local_offset), // argSize
+            nullptr,                  // pProperties
+            &hip_local_offset,        // hArgValue
         });
         new_value_descs.push_back({
             UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype
             nullptr,                                                    // pNext
-            6,                      // argIndex
-            sizeof(new_local_size), // argSize
-            nullptr,                // pProperties
-            &new_local_size,        // hArgValue
+            6,                        // argIndex
+            sizeof(hip_local_offset), // argSize
+            nullptr,                  // pProperties
+            &hip_local_offset,        // hArgValue
         });
         new_value_descs.push_back({
             UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype
             nullptr,                                                    // pNext
-            7,                      // argIndex
-            sizeof(new_local_size), // argSize
-            nullptr,                // pProperties
-            &new_local_size,        // hArgValue
+            7,                        // argIndex
+            sizeof(hip_local_offset), // argSize
+            nullptr,                  // pProperties
+            &hip_local_offset,        // hArgValue
         });
     }
 
@@ -579,26 +580,26 @@ TEST_P(LocalMemoryUpdateTest, UpdateParametersLargerLocalSize) {
         new_value_descs.push_back({
             UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype
             nullptr,                                                    // pNext
-            1,                      // argIndex
-            sizeof(new_local_size), // argSize
-            nullptr,                // pProperties
-            &new_local_size,        // hArgValue
+            1,                        // argIndex
+            sizeof(hip_local_offset), // argSize
+            nullptr,                  // pProperties
+            &hip_local_offset,        // hArgValue
         });
         new_value_descs.push_back({
             UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype
             nullptr,                                                    // pNext
-            2,                      // argIndex
-            sizeof(new_local_size), // argSize
-            nullptr,                // pProperties
-            &new_local_size,        // hArgValue
+            2,                        // argIndex
+            sizeof(hip_local_offset), // argSize
+            nullptr,                  // pProperties
+            &hip_local_offset,        // hArgValue
         });
         new_value_descs.push_back({
             UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype
             nullptr,                                                    // pNext
-            3,                      // argIndex
-            sizeof(new_local_size), // argSize
-            nullptr,                // pProperties
-            &new_local_size,        // hArgValue
+            3,                        // argIndex
+            sizeof(hip_local_offset), // argSize
+            nullptr,                  // pProperties
+            &hip_local_offset,        // hArgValue
         });
     }
 
@@ -617,26 +618,26 @@ TEST_P(LocalMemoryUpdateTest, UpdateParametersLargerLocalSize) {
         new_value_descs.push_back({
             UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype
             nullptr,                                                    // pNext
-            5,                      // argIndex
-            sizeof(new_local_size), // argSize
-            nullptr,                // pProperties
-            &new_local_size,        // hArgValue
+            5,                        // argIndex
+            sizeof(hip_local_offset), // argSize
+            nullptr,                  // pProperties
+            &hip_local_offset,        // hArgValue
         });
         new_value_descs.push_back({
             UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype
             nullptr,                                                    // pNext
-            6,                      // argIndex
-            sizeof(new_local_size), // argSize
-            nullptr,                // pProperties
-            &new_local_size,        // hArgValue
+            6,                        // argIndex
+            sizeof(hip_local_offset), // argSize
+            nullptr,                  // pProperties
+            &hip_local_offset,        // hArgValue
         });
         new_value_descs.push_back({
             UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype
             nullptr,                                                    // pNext
-            7,                      // argIndex
-            sizeof(new_local_size), // argSize
-            nullptr,                // pProperties
-            &new_local_size,        // hArgValue
+            7,                        // argIndex
+            sizeof(hip_local_offset), // argSize
+            nullptr,                  // pProperties
+            &hip_local_offset,        // hArgValue
         });
     }
 
@@ -735,26 +736,26 @@ TEST_P(LocalMemoryUpdateTest, UpdateParametersPartialLocalSize) {
         new_value_descs.push_back({
             UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype
             nullptr,                                                    // pNext
-            1,                      // argIndex
-            sizeof(new_local_size), // argSize
-            nullptr,                // pProperties
-            &new_local_size,        // hArgValue
+            1,                        // argIndex
+            sizeof(hip_local_offset), // argSize
+            nullptr,                  // pProperties
+            &hip_local_offset,        // hArgValue
         });
         new_value_descs.push_back({
             UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype
             nullptr,                                                    // pNext
-            2,                      // argIndex
-            sizeof(new_local_size), // argSize
-            nullptr,                // pProperties
-            &new_local_size,        // hArgValue
+            2,                        // argIndex
+            sizeof(hip_local_offset), // argSize
+            nullptr,                  // pProperties
+            &hip_local_offset,        // hArgValue
         });
         new_value_descs.push_back({
             UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype
             nullptr,                                                    // pNext
-            3,                      // argIndex
-            sizeof(new_local_size), // argSize
-            nullptr,                // pProperties
-            &new_local_size,        // hArgValue
+            3,                        // argIndex
+            sizeof(hip_local_offset), // argSize
+            nullptr,                  // pProperties
+            &hip_local_offset,        // hArgValue
         });
     }
 
@@ -826,26 +827,26 @@ TEST_P(LocalMemoryUpdateTest, UpdateParametersPartialLocalSize) {
         second_update_value_args.push_back({
             UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype
             nullptr,                                                    // pNext
-            5,                      // argIndex
-            sizeof(new_local_size), // argSize
-            nullptr,                // pProperties
-            &new_local_size,        // hArgValue
+            5,                        // argIndex
+            sizeof(hip_local_offset), // argSize
+            nullptr,                  // pProperties
+            &hip_local_offset,        // hArgValue
         });
         second_update_value_args.push_back({
             UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype
             nullptr,                                                    // pNext
-            6,                      // argIndex
-            sizeof(new_local_size), // argSize
-            nullptr,                // pProperties
-            &new_local_size,        // hArgValue
+            6,                        // argIndex
+            sizeof(hip_local_offset), // argSize
+            nullptr,                  // pProperties
+            &hip_local_offset,        // hArgValue
         });
         second_update_value_args.push_back({
             UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype
             nullptr,                                                    // pNext
-            7,                      // argIndex
-            sizeof(new_local_size), // argSize
-            nullptr,                // pProperties
-            &new_local_size,        // hArgValue
+            7,                        // argIndex
+            sizeof(hip_local_offset), // argSize
+            nullptr,                  // pProperties
+            &hip_local_offset,        // hArgValue
         });
     }
 
diff --git a/test/conformance/kernel/urKernelSetArgLocal.cpp b/test/conformance/kernel/urKernelSetArgLocal.cpp
index fcbd5194a7..380085bd16 100644
--- a/test/conformance/kernel/urKernelSetArgLocal.cpp
+++ b/test/conformance/kernel/urKernelSetArgLocal.cpp
@@ -72,14 +72,14 @@ struct urKernelSetArgLocalMultiTest : uur::urKernelExecutionTest {
         // Hip has extra args for local mem at index 1-3
         if (backend == UR_PLATFORM_BACKEND_HIP) {
             ASSERT_SUCCESS(urKernelSetArgValue(kernel, current_index++,
-                                               sizeof(local_size), nullptr,
-                                               &local_size));
+                                               sizeof(hip_local_offset),
+                                               nullptr, &hip_local_offset));
             ASSERT_SUCCESS(urKernelSetArgValue(kernel, current_index++,
-                                               sizeof(local_size), nullptr,
-                                               &local_size));
+                                               sizeof(hip_local_offset),
+                                               nullptr, &hip_local_offset));
             ASSERT_SUCCESS(urKernelSetArgValue(kernel, current_index++,
-                                               sizeof(local_size), nullptr,
-                                               &local_size));
+                                               sizeof(hip_local_offset),
+                                               nullptr, &hip_local_offset));
         }
 
         // Index 1 is local_mem_b arg
@@ -87,14 +87,14 @@ struct urKernelSetArgLocalMultiTest : uur::urKernelExecutionTest {
                                            local_mem_b_size, nullptr));
         if (backend == UR_PLATFORM_BACKEND_HIP) {
             ASSERT_SUCCESS(urKernelSetArgValue(kernel, current_index++,
-                                               sizeof(local_size), nullptr,
-                                               &local_size));
+                                               sizeof(hip_local_offset),
+                                               nullptr, &hip_local_offset));
             ASSERT_SUCCESS(urKernelSetArgValue(kernel, current_index++,
-                                               sizeof(local_size), nullptr,
-                                               &local_size));
+                                               sizeof(hip_local_offset),
+                                               nullptr, &hip_local_offset));
             ASSERT_SUCCESS(urKernelSetArgValue(kernel, current_index++,
-                                               sizeof(local_size), nullptr,
-                                               &local_size));
+                                               sizeof(hip_local_offset),
+                                               nullptr, &hip_local_offset));
         }
 
         // Index 2 is output
@@ -140,6 +140,7 @@ struct urKernelSetArgLocalMultiTest : uur::urKernelExecutionTest {
                                          nullptr};
 
     uint32_t hip_arg_offset = 0;
+    static constexpr uint64_t hip_local_offset = 0;
     ur_platform_backend_t backend{};
 };
 UUR_INSTANTIATE_KERNEL_TEST_SUITE_P(urKernelSetArgLocalMultiTest);
@@ -200,14 +201,14 @@ TEST_P(urKernelSetArgLocalMultiTest, Overwrite) {
     // Hip has extra args for local mem at index 1-3
     if (backend == UR_PLATFORM_BACKEND_HIP) {
         ASSERT_SUCCESS(urKernelSetArgValue(kernel, current_index++,
-                                           sizeof(new_local_size), nullptr,
-                                           &new_local_size));
+                                           sizeof(hip_local_offset), nullptr,
+                                           &hip_local_offset));
         ASSERT_SUCCESS(urKernelSetArgValue(kernel, current_index++,
-                                           sizeof(new_local_size), nullptr,
-                                           &new_local_size));
+                                           sizeof(hip_local_offset), nullptr,
+                                           &hip_local_offset));
         ASSERT_SUCCESS(urKernelSetArgValue(kernel, current_index++,
-                                           sizeof(new_local_size), nullptr,
-                                           &new_local_size));
+                                           sizeof(hip_local_offset), nullptr,
+                                           &hip_local_offset));
     }
 
     // Index 1 is local_mem_b arg
@@ -215,14 +216,14 @@ TEST_P(urKernelSetArgLocalMultiTest, Overwrite) {
                                        new_local_mem_b_size, nullptr));
     if (backend == UR_PLATFORM_BACKEND_HIP) {
         ASSERT_SUCCESS(urKernelSetArgValue(kernel, current_index++,
-                                           sizeof(new_local_size), nullptr,
-                                           &new_local_size));
+                                           sizeof(hip_local_offset), nullptr,
+                                           &hip_local_offset));
         ASSERT_SUCCESS(urKernelSetArgValue(kernel, current_index++,
-                                           sizeof(new_local_size), nullptr,
-                                           &new_local_size));
+                                           sizeof(hip_local_offset), nullptr,
+                                           &hip_local_offset));
         ASSERT_SUCCESS(urKernelSetArgValue(kernel, current_index++,
-                                           sizeof(new_local_size), nullptr,
-                                           &new_local_size));
+                                           sizeof(hip_local_offset), nullptr,
+                                           &hip_local_offset));
     }
 
     ASSERT_SUCCESS(urEnqueueKernelLaunch(queue, kernel, n_dimensions,

From 850e85c7770cdc6fbd603c9d275f47ebbdfa1205 Mon Sep 17 00:00:00 2001
From: "Kenneth Benzie (Benie)" <k.benzie@codeplay.com>
Date: Mon, 2 Dec 2024 15:25:59 +0000
Subject: [PATCH 34/36] Fix L0 match entry for
 urEnqueueEventsWaitMultiDeviceMTTest

The match entry for `urEnqueueEventsWaitMultiDeviceMTTest` previously
matches `urEnqueueEventsWaitMultiDeviceMTTest/*` but this doesn't
actually match the test names which are of the form
`urEnqueueEventsWaitMultiDeviceMTTest.<TestName>/...`. This patch
removes the `/` in order to match all tests using this fixture name.

Failures like the following have been occurring in unrelated PR's today:

```
[ RUN      ] urEnqueueEventsWaitMultiDeviceMTTest.EnqueueWaitSingleQueueMultiOps/MultiThread
/home/test-user/actions-runner/_work/unified-runtime/unified-runtime/test/conformance/enqueue/urEnqueueEventsWaitMultiDevice.cpp:54: Failure
Expected equality of these values:
  reinterpret_cast<uint32_t *>(ptr)[i]
    Which is: 0
  pattern
    Which is: 42
[  FAILED  ] urEnqueueEventsWaitMultiDeviceMTTest.EnqueueWaitSingleQueueMultiOps/MultiThread, where GetParam() = 40-byte object
```
---
 test/conformance/enqueue/enqueue_adapter_level_zero.match | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/conformance/enqueue/enqueue_adapter_level_zero.match b/test/conformance/enqueue/enqueue_adapter_level_zero.match
index 7e3405a963..4155859eaf 100644
--- a/test/conformance/enqueue/enqueue_adapter_level_zero.match
+++ b/test/conformance/enqueue/enqueue_adapter_level_zero.match
@@ -20,7 +20,7 @@
 {{OPT}}urEnqueueMemImageReadTest.InvalidOrigin1D/*
 {{OPT}}urEnqueueMemImageReadTest.InvalidOrigin2D/*
 {{OPT}}urEnqueueMemImageReadTest.InvalidOrigin3D/*
-{{OPT}}urEnqueueEventsWaitMultiDeviceMTTest/*
+{{OPT}}urEnqueueEventsWaitMultiDeviceMTTest*
 {{OPT}}urEnqueueEventsWaitWithBarrierOrderingTest.SuccessEventDependencies/*
 {{OPT}}urEnqueueEventsWaitWithBarrierOrderingTest.SuccessEventDependenciesBarrierOnly/*
 {{OPT}}urEnqueueEventsWaitWithBarrierOrderingTest.SuccessEventDependenciesLaunchOnly/*

From acca3a2d9343edd8f986df08821cfb51f9a8733b Mon Sep 17 00:00:00 2001
From: Igor Chorazewicz <igor.chorazewicz@intel.com>
Date: Mon, 2 Dec 2024 17:56:21 +0000
Subject: [PATCH 35/36] [Benchmarks] fix label for comput benchmark

---
 scripts/benchmarks/benches/compute.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/benchmarks/benches/compute.py b/scripts/benchmarks/benches/compute.py
index f872399e9e..c00ec2ec52 100644
--- a/scripts/benchmarks/benches/compute.py
+++ b/scripts/benchmarks/benches/compute.py
@@ -115,7 +115,7 @@ def run(self, env_vars) -> list[Result]:
         parsed_results = self.parse_output(result)
         ret = []
         for label, mean, unit in parsed_results:
-            extra_label = " CPU count" if parse_unit_type(unit) == "CPU count" else ""
+            extra_label = " CPU count" if parse_unit_type(unit) == "instr" else ""
             ret.append(Result(label=self.name() + extra_label, value=mean, command=command, env=env_vars, stdout=result, unit=parse_unit_type(unit)))
         return ret
 

From 3ff07309e1ce2055b8a2a2133ea55594e55ea6ef Mon Sep 17 00:00:00 2001
From: "Kenneth Benzie (Benie)" <k.benzie@codeplay.com>
Date: Mon, 2 Dec 2024 11:38:04 +0000
Subject: [PATCH 36/36] Add UR_ADAPTER_INFO_VERSION query

In order to support in-code management of known failures we need a query
to differentiate between the two versions of the Level Zero adapter.
This patch adds the `UR_ADAPTER_INFO_VERSION` query which returns a
`uint32_t` which is set to return `1` for all adapters except the Level
Zero adapter when being compiled in V2 mode.
---
 include/ur_api.h                              |  5 ++++-
 include/ur_print.hpp                          | 15 +++++++++++++++
 scripts/core/adapter.yml                      |  5 +++++
 source/adapters/cuda/adapter.cpp              |  2 ++
 source/adapters/hip/adapter.cpp               |  2 ++
 source/adapters/level_zero/adapter.cpp        |  8 ++++++++
 source/adapters/native_cpu/adapter.cpp        |  2 ++
 source/adapters/opencl/adapter.cpp            |  2 ++
 source/loader/layers/validation/ur_valddi.cpp |  2 +-
 source/loader/ur_libapi.cpp                   |  2 +-
 source/ur_api.cpp                             |  2 +-
 test/conformance/adapter/urAdapterGetInfo.cpp |  4 +++-
 tools/urinfo/urinfo.hpp                       |  2 ++
 13 files changed, 48 insertions(+), 5 deletions(-)

diff --git a/include/ur_api.h b/include/ur_api.h
index eb8b07221c..9890291e36 100644
--- a/include/ur_api.h
+++ b/include/ur_api.h
@@ -967,6 +967,9 @@ typedef enum ur_adapter_info_t {
                                          ///< The reference count returned should be considered immediately stale.
                                          ///< It is unsuitable for general use in applications. This feature is
                                          ///< provided for identifying memory leaks.
+    UR_ADAPTER_INFO_VERSION = 2,         ///< [uint32_t] Specifies the adapter version, initial value of 1 and
+                                         ///< incremented unpon major changes, e.g. when multiple versions of an
+                                         ///< adapter may exist in parallel.
     /// @cond
     UR_ADAPTER_INFO_FORCE_UINT32 = 0x7fffffff
     /// @endcond
@@ -988,7 +991,7 @@ typedef enum ur_adapter_info_t {
 ///     - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE
 ///         + `NULL == hAdapter`
 ///     - ::UR_RESULT_ERROR_INVALID_ENUMERATION
-///         + `::UR_ADAPTER_INFO_REFERENCE_COUNT < propName`
+///         + `::UR_ADAPTER_INFO_VERSION < propName`
 ///     - ::UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION
 ///         + If `propName` is not supported by the adapter.
 ///     - ::UR_RESULT_ERROR_INVALID_SIZE
diff --git a/include/ur_print.hpp b/include/ur_print.hpp
index 8888a74f91..cdc8a36243 100644
--- a/include/ur_print.hpp
+++ b/include/ur_print.hpp
@@ -1922,6 +1922,9 @@ inline std::ostream &operator<<(std::ostream &os, enum ur_adapter_info_t value)
     case UR_ADAPTER_INFO_REFERENCE_COUNT:
         os << "UR_ADAPTER_INFO_REFERENCE_COUNT";
         break;
+    case UR_ADAPTER_INFO_VERSION:
+        os << "UR_ADAPTER_INFO_VERSION";
+        break;
     default:
         os << "unknown enumerator";
         break;
@@ -1962,6 +1965,18 @@ inline ur_result_t printTagged(std::ostream &os, const void *ptr, ur_adapter_inf
 
         os << ")";
     } break;
+    case UR_ADAPTER_INFO_VERSION: {
+        const uint32_t *tptr = (const uint32_t *)ptr;
+        if (sizeof(uint32_t) > size) {
+            os << "invalid size (is: " << size << ", expected: >=" << sizeof(uint32_t) << ")";
+            return UR_RESULT_ERROR_INVALID_SIZE;
+        }
+        os << (const void *)(tptr) << " (";
+
+        os << *tptr;
+
+        os << ")";
+    } break;
     default:
         os << "unknown enumerator";
         return UR_RESULT_ERROR_INVALID_ENUMERATION;
diff --git a/scripts/core/adapter.yml b/scripts/core/adapter.yml
index a4eddd823c..4fc9a104ed 100644
--- a/scripts/core/adapter.yml
+++ b/scripts/core/adapter.yml
@@ -136,6 +136,11 @@ etors:
             [uint32_t] Reference count of the adapter.
             The reference count returned should be considered immediately stale.
             It is unsuitable for general use in applications. This feature is provided for identifying memory leaks.
+    - name: VERSION
+      desc: >
+          [uint32_t] Specifies the adapter version, initial value of 1 and
+          incremented unpon major changes, e.g. when multiple versions of an
+          adapter may exist in parallel.
 --- #--------------------------------------------------------------------------
 type: function
 desc: "Retrieves information about the adapter"
diff --git a/source/adapters/cuda/adapter.cpp b/source/adapters/cuda/adapter.cpp
index c8949cd9a8..49bb964f8e 100644
--- a/source/adapters/cuda/adapter.cpp
+++ b/source/adapters/cuda/adapter.cpp
@@ -108,6 +108,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urAdapterGetInfo(ur_adapter_handle_t,
     return ReturnValue(UR_ADAPTER_BACKEND_CUDA);
   case UR_ADAPTER_INFO_REFERENCE_COUNT:
     return ReturnValue(adapter.RefCount.load());
+  case UR_ADAPTER_INFO_VERSION:
+    return ReturnValue(uint32_t{1});
   default:
     return UR_RESULT_ERROR_INVALID_ENUMERATION;
   }
diff --git a/source/adapters/hip/adapter.cpp b/source/adapters/hip/adapter.cpp
index 99db21695f..1bfe498bf6 100644
--- a/source/adapters/hip/adapter.cpp
+++ b/source/adapters/hip/adapter.cpp
@@ -96,6 +96,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urAdapterGetInfo(ur_adapter_handle_t,
     return ReturnValue(UR_ADAPTER_BACKEND_HIP);
   case UR_ADAPTER_INFO_REFERENCE_COUNT:
     return ReturnValue(adapter.RefCount.load());
+  case UR_ADAPTER_INFO_VERSION:
+    return ReturnValue(uint32_t{1});
   default:
     return UR_RESULT_ERROR_INVALID_ENUMERATION;
   }
diff --git a/source/adapters/level_zero/adapter.cpp b/source/adapters/level_zero/adapter.cpp
index f2ac67ddff..5d6583f3d4 100644
--- a/source/adapters/level_zero/adapter.cpp
+++ b/source/adapters/level_zero/adapter.cpp
@@ -655,6 +655,14 @@ ur_result_t urAdapterGetInfo(ur_adapter_handle_t, ur_adapter_info_t PropName,
     return ReturnValue(UR_ADAPTER_BACKEND_LEVEL_ZERO);
   case UR_ADAPTER_INFO_REFERENCE_COUNT:
     return ReturnValue(GlobalAdapter->RefCount.load());
+  case UR_ADAPTER_INFO_VERSION: {
+#ifdef UR_ADAPTER_LEVEL_ZERO_V2
+    uint32_t adapterVersion = 2;
+#else
+    uint32_t adapterVersion = 1;
+#endif
+    return ReturnValue(adapterVersion);
+  }
   default:
     return UR_RESULT_ERROR_INVALID_ENUMERATION;
   }
diff --git a/source/adapters/native_cpu/adapter.cpp b/source/adapters/native_cpu/adapter.cpp
index 2b5b95ccd0..727c3e3dba 100644
--- a/source/adapters/native_cpu/adapter.cpp
+++ b/source/adapters/native_cpu/adapter.cpp
@@ -57,6 +57,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urAdapterGetInfo(ur_adapter_handle_t,
     return ReturnValue(UR_ADAPTER_BACKEND_NATIVE_CPU);
   case UR_ADAPTER_INFO_REFERENCE_COUNT:
     return ReturnValue(Adapter.RefCount.load());
+  case UR_ADAPTER_INFO_VERSION:
+    return ReturnValue(uint32_t{1});
   default:
     return UR_RESULT_ERROR_INVALID_ENUMERATION;
   }
diff --git a/source/adapters/opencl/adapter.cpp b/source/adapters/opencl/adapter.cpp
index bf81f6bdaf..162bc59b6a 100644
--- a/source/adapters/opencl/adapter.cpp
+++ b/source/adapters/opencl/adapter.cpp
@@ -128,6 +128,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urAdapterGetInfo(ur_adapter_handle_t,
     return ReturnValue(UR_ADAPTER_BACKEND_OPENCL);
   case UR_ADAPTER_INFO_REFERENCE_COUNT:
     return ReturnValue(adapter->RefCount.load());
+  case UR_ADAPTER_INFO_VERSION:
+    return ReturnValue(uint32_t{1});
   default:
     return UR_RESULT_ERROR_INVALID_ENUMERATION;
   }
diff --git a/source/loader/layers/validation/ur_valddi.cpp b/source/loader/layers/validation/ur_valddi.cpp
index b3969de10f..ebdb4b5384 100644
--- a/source/loader/layers/validation/ur_valddi.cpp
+++ b/source/loader/layers/validation/ur_valddi.cpp
@@ -182,7 +182,7 @@ __urdlllocal ur_result_t UR_APICALL urAdapterGetInfo(
             return UR_RESULT_ERROR_INVALID_NULL_POINTER;
         }
 
-        if (UR_ADAPTER_INFO_REFERENCE_COUNT < propName) {
+        if (UR_ADAPTER_INFO_VERSION < propName) {
             return UR_RESULT_ERROR_INVALID_ENUMERATION;
         }
 
diff --git a/source/loader/ur_libapi.cpp b/source/loader/ur_libapi.cpp
index 3340363737..898f259e0b 100644
--- a/source/loader/ur_libapi.cpp
+++ b/source/loader/ur_libapi.cpp
@@ -451,7 +451,7 @@ ur_result_t UR_APICALL urAdapterGetLastError(
 ///     - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE
 ///         + `NULL == hAdapter`
 ///     - ::UR_RESULT_ERROR_INVALID_ENUMERATION
-///         + `::UR_ADAPTER_INFO_REFERENCE_COUNT < propName`
+///         + `::UR_ADAPTER_INFO_VERSION < propName`
 ///     - ::UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION
 ///         + If `propName` is not supported by the adapter.
 ///     - ::UR_RESULT_ERROR_INVALID_SIZE
diff --git a/source/ur_api.cpp b/source/ur_api.cpp
index 853d61472e..4a57f5467d 100644
--- a/source/ur_api.cpp
+++ b/source/ur_api.cpp
@@ -410,7 +410,7 @@ ur_result_t UR_APICALL urAdapterGetLastError(
 ///     - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE
 ///         + `NULL == hAdapter`
 ///     - ::UR_RESULT_ERROR_INVALID_ENUMERATION
-///         + `::UR_ADAPTER_INFO_REFERENCE_COUNT < propName`
+///         + `::UR_ADAPTER_INFO_VERSION < propName`
 ///     - ::UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION
 ///         + If `propName` is not supported by the adapter.
 ///     - ::UR_RESULT_ERROR_INVALID_SIZE
diff --git a/test/conformance/adapter/urAdapterGetInfo.cpp b/test/conformance/adapter/urAdapterGetInfo.cpp
index 4dff3ce4dc..63c3cbfca1 100644
--- a/test/conformance/adapter/urAdapterGetInfo.cpp
+++ b/test/conformance/adapter/urAdapterGetInfo.cpp
@@ -20,12 +20,14 @@ struct urAdapterGetInfoTest : uur::runtime::urAdapterTest,
 
 std::unordered_map<ur_adapter_info_t, size_t> adapter_info_size_map = {
     {UR_ADAPTER_INFO_BACKEND, sizeof(ur_adapter_backend_t)},
+    {UR_ADAPTER_INFO_VERSION, sizeof(uint32_t)},
     {UR_ADAPTER_INFO_REFERENCE_COUNT, sizeof(uint32_t)},
 };
 
 INSTANTIATE_TEST_SUITE_P(
     urAdapterGetInfo, urAdapterGetInfoTest,
-    ::testing::Values(UR_ADAPTER_INFO_BACKEND, UR_ADAPTER_INFO_REFERENCE_COUNT),
+    ::testing::Values(UR_ADAPTER_INFO_BACKEND, UR_ADAPTER_INFO_VERSION,
+                      UR_ADAPTER_INFO_REFERENCE_COUNT),
     [](const ::testing::TestParamInfo<ur_adapter_info_t> &info) {
         std::stringstream ss;
         ss << info.param;
diff --git a/tools/urinfo/urinfo.hpp b/tools/urinfo/urinfo.hpp
index 37c7a80328..813ca34da1 100644
--- a/tools/urinfo/urinfo.hpp
+++ b/tools/urinfo/urinfo.hpp
@@ -28,6 +28,8 @@ inline void printAdapterInfos(ur_adapter_handle_t hAdapter,
                               std::string_view prefix = "  ") {
     std::cout << prefix;
     printAdapterInfo<ur_adapter_backend_t>(hAdapter, UR_ADAPTER_INFO_BACKEND);
+    std::cout << prefix;
+    printAdapterInfo<uint32_t>(hAdapter, UR_ADAPTER_INFO_VERSION);
 }
 
 inline void printPlatformInfos(ur_platform_handle_t hPlatform,