oneapi-src · PietroGhg · Oct 11, 2024
@@ -138,13 +138,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
 #else
   bool isLocalSizeOne =
       ndr.LocalSize[0] == 1 && ndr.LocalSize[1] == 1 && ndr.LocalSize[2] == 1;
-  if (isLocalSizeOne && ndr.GlobalSize[0] > numParallelThreads) {
-    // If the local size is one, we make the assumption that we are running a
-    // parallel_for over a sycl::range.
-    // Todo: we could add compiler checks and
-    // kernel properties for this (e.g. check that no barriers are called, no
-    // local memory args).
-
+  if (isLocalSizeOne && !hKernel->isNDRangeKernel()) {
     // Todo: this assumes that dim 0 is the best dimension over which we want to
     // parallelize
 
@@ -153,8 +147,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
     // divide the global range by the number of threads, set that as the local
     // size and peel everything else.
 
-    size_t new_num_work_groups_0 = numParallelThreads;
-    size_t itemsPerThread = ndr.GlobalSize[0] / numParallelThreads;
+    size_t new_num_work_groups_0 =
+        ndr.GlobalSize[0] > numParallelThreads ? numParallelThreads : 1;
+    size_t itemsPerThread = ndr.GlobalSize[0] > numParallelThreads
+                                ? ndr.GlobalSize[0] / numParallelThreads
+                                : ndr.GlobalSize[0];
 
     for (unsigned g2 = 0; g2 < numWG2; g2++) {
       for (unsigned g1 = 0; g1 < numWG1; g1++) {

@@ -47,8 +47,13 @@ urKernelCreate(ur_program_handle_t hProgram, const char *pKernelName,
   if (auto MaxLIt = MaxLinMap.find(pKernelName); MaxLIt != MaxLinMap.end()) {
     MaxLinearWG = MaxLIt->second;
   }
+  bool IsNDRangeKernel = false;
+  if (auto isNDIt = hProgram->KernelIsNDRangeMD.find(pKernelName);
+      isNDIt != hProgram->KernelIsNDRangeMD.end()) {
+    IsNDRangeKernel = isNDIt->second;
+  }
   kernel = new ur_kernel_handle_t_(hProgram, pKernelName, *f, ReqdWG, MaxWG,
-                                   MaxLinearWG);
+                                   MaxLinearWG, IsNDRangeKernel);
 
   *phKernel = kernel;
 

@@ -28,16 +28,12 @@ struct local_arg_info_t {
 
 struct ur_kernel_handle_t_ : RefCounted {
 
-  ur_kernel_handle_t_(ur_program_handle_t hProgram, const char *name,
-                      nativecpu_task_t subhandler)
-      : hProgram(hProgram), _name{name}, _subhandler{std::move(subhandler)} {}
-
   ur_kernel_handle_t_(const ur_kernel_handle_t_ &other)
       : Args(other.Args), hProgram(other.hProgram), _name(other._name),
         _subhandler(other._subhandler), _localArgInfo(other._localArgInfo),
         _localMemPool(other._localMemPool),
         _localMemPoolSize(other._localMemPoolSize),
-        ReqdWGSize(other.ReqdWGSize) {
+        ReqdWGSize(other.ReqdWGSize), NDRangeKernel(other.NDRangeKernel) {
     incrementReferenceCount();
   }
 
@@ -52,10 +48,11 @@ struct ur_kernel_handle_t_ : RefCounted {
                       nativecpu_task_t subhandler,
                       std::optional<native_cpu::WGSize_t> ReqdWGSize,
                       std::optional<native_cpu::WGSize_t> MaxWGSize,
-                      std::optional<uint64_t> MaxLinearWGSize)
+                      std::optional<uint64_t> MaxLinearWGSize,
+                      bool isNDRangeKernel)
       : hProgram(hProgram), _name{name}, _subhandler{std::move(subhandler)},
         ReqdWGSize(ReqdWGSize), MaxWGSize(MaxWGSize),
-        MaxLinearWGSize(MaxLinearWGSize) {}
+        MaxLinearWGSize(MaxLinearWGSize), NDRangeKernel(isNDRangeKernel) {}
 
   struct arguments {
     using args_index_t = std::vector<void *>;
@@ -162,10 +159,13 @@ struct ur_kernel_handle_t_ : RefCounted {
 
   void addPtrArg(void *Ptr, size_t Index) { Args.addPtrArg(Index, Ptr); }
 
+  bool isNDRangeKernel() const { return NDRangeKernel; }
+
 private:
   char *_localMemPool = nullptr;
   size_t _localMemPoolSize = 0;
   std::optional<native_cpu::WGSize_t> ReqdWGSize = std::nullopt;
   std::optional<native_cpu::WGSize_t> MaxWGSize = std::nullopt;
   std::optional<uint64_t> MaxLinearWGSize = std::nullopt;
+  const bool NDRangeKernel = false;
 };
@@ -8,6 +8,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "ur/ur.hpp"
 #include "ur_api.h"
 
 #include "common.hpp"
@@ -99,14 +100,25 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramCreateWithBinary(
     }
   }
 
-  const nativecpu_entry *nativecpu_it =
-      reinterpret_cast<const nativecpu_entry *>(pBinary);
+  const nativecpu_program *program =
+      reinterpret_cast<const nativecpu_program *>(pBinary);
+  const nativecpu_entry *nativecpu_it = program->entries;
   while (nativecpu_it->kernel_ptr != nullptr) {
     hProgram->_kernels.insert(
         std::make_pair(nativecpu_it->kernelname, nativecpu_it->kernel_ptr));
     nativecpu_it++;
   }
 
+  // Process Native CPU specific properties
+  const _pi_device_binary_property_set_struct *props = program->properties;
+  for (auto prop = props->PropertiesBegin; prop != props->PropertiesEnd;
+       prop++) {
+    auto [Prefix, Tag] = splitMetadataName(prop->Name);
+    if (Tag == "@is_nd_range") {
+      hProgram->KernelIsNDRangeMD[Prefix] = prop->ValSize;
+    }
+  }
+
   *phProgram = hProgram.release();
 
   return UR_RESULT_SUCCESS;

@@ -41,13 +41,40 @@ struct ur_program_handle_t_ : RefCounted {
   std::unordered_map<std::string, native_cpu::WGSize_t>
       KernelMaxWorkGroupSizeMD;
   std::unordered_map<std::string, uint64_t> KernelMaxLinearWorkGroupSizeMD;
+  std::unordered_map<std::string, bool> KernelIsNDRangeMD;
 };
 
-// The nativecpu_entry struct is also defined as LLVM-IR in the
+// These structs are also defined as LLVM-IR in the
 // clang-offload-wrapper tool. The two definitions need to match,
 // therefore any change to this struct needs to be reflected in the
 // offload-wrapper.
+
 struct nativecpu_entry {
   const char *kernelname;
   const unsigned char *kernel_ptr;
 };
+
+typedef enum {
+  PI_PROPERTY_TYPE_INT32,
+  PI_PROPERTY_TYPE_STRING
+} pi_property_type;
+
+struct _pi_device_binary_property_struct {
+  char *Name;
+  void *ValAddr;
+  pi_property_type Type;
+  uint64_t ValSize;
+};
+
+// TODO These property structs are taken from clang-offload-wrapper,
+// perhaps we could define something that fits better our purposes?
+struct _pi_device_binary_property_set_struct {
+  char *Name;
+  _pi_device_binary_property_struct *PropertiesBegin;
+  _pi_device_binary_property_struct *PropertiesEnd;
+};
+
+struct nativecpu_program {
+  nativecpu_entry *entries;
+  _pi_device_binary_property_set_struct *properties;
+};