From 93f995ba8aa0e88f6763c9b335edc44d14f864f0 Mon Sep 17 00:00:00 2001 From: PietroGhg Date: Fri, 11 Oct 2024 15:23:59 +0100 Subject: [PATCH] Process Native CPU only properties --- source/adapters/native_cpu/enqueue.cpp | 15 ++++++------- source/adapters/native_cpu/kernel.cpp | 7 ++++++- source/adapters/native_cpu/kernel.hpp | 14 ++++++------- source/adapters/native_cpu/program.cpp | 16 ++++++++++++-- source/adapters/native_cpu/program.hpp | 29 +++++++++++++++++++++++++- 5 files changed, 61 insertions(+), 20 deletions(-) diff --git a/source/adapters/native_cpu/enqueue.cpp b/source/adapters/native_cpu/enqueue.cpp index 6e4094ddef..f22eb80b78 100644 --- a/source/adapters/native_cpu/enqueue.cpp +++ b/source/adapters/native_cpu/enqueue.cpp @@ -138,13 +138,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( #else bool isLocalSizeOne = ndr.LocalSize[0] == 1 && ndr.LocalSize[1] == 1 && ndr.LocalSize[2] == 1; - if (isLocalSizeOne && ndr.GlobalSize[0] > numParallelThreads) { - // If the local size is one, we make the assumption that we are running a - // parallel_for over a sycl::range. - // Todo: we could add compiler checks and - // kernel properties for this (e.g. check that no barriers are called, no - // local memory args). - + if (isLocalSizeOne && !hKernel->isNDRangeKernel()) { // Todo: this assumes that dim 0 is the best dimension over which we want to // parallelize @@ -153,8 +147,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( // divide the global range by the number of threads, set that as the local // size and peel everything else. - size_t new_num_work_groups_0 = numParallelThreads; - size_t itemsPerThread = ndr.GlobalSize[0] / numParallelThreads; + size_t new_num_work_groups_0 = + ndr.GlobalSize[0] > numParallelThreads ? numParallelThreads : 1; + size_t itemsPerThread = ndr.GlobalSize[0] > numParallelThreads + ? ndr.GlobalSize[0] / numParallelThreads + : ndr.GlobalSize[0]; for (unsigned g2 = 0; g2 < numWG2; g2++) { for (unsigned g1 = 0; g1 < numWG1; g1++) { diff --git a/source/adapters/native_cpu/kernel.cpp b/source/adapters/native_cpu/kernel.cpp index 596a3ffdf1..5f6e17b577 100644 --- a/source/adapters/native_cpu/kernel.cpp +++ b/source/adapters/native_cpu/kernel.cpp @@ -47,8 +47,13 @@ urKernelCreate(ur_program_handle_t hProgram, const char *pKernelName, if (auto MaxLIt = MaxLinMap.find(pKernelName); MaxLIt != MaxLinMap.end()) { MaxLinearWG = MaxLIt->second; } + bool IsNDRangeKernel = false; + if (auto isNDIt = hProgram->KernelIsNDRangeMD.find(pKernelName); + isNDIt != hProgram->KernelIsNDRangeMD.end()) { + IsNDRangeKernel = isNDIt->second; + } kernel = new ur_kernel_handle_t_(hProgram, pKernelName, *f, ReqdWG, MaxWG, - MaxLinearWG); + MaxLinearWG, IsNDRangeKernel); *phKernel = kernel; diff --git a/source/adapters/native_cpu/kernel.hpp b/source/adapters/native_cpu/kernel.hpp index e2df672d05..995c628c5d 100644 --- a/source/adapters/native_cpu/kernel.hpp +++ b/source/adapters/native_cpu/kernel.hpp @@ -28,16 +28,12 @@ struct local_arg_info_t { struct ur_kernel_handle_t_ : RefCounted { - ur_kernel_handle_t_(ur_program_handle_t hProgram, const char *name, - nativecpu_task_t subhandler) - : hProgram(hProgram), _name{name}, _subhandler{std::move(subhandler)} {} - ur_kernel_handle_t_(const ur_kernel_handle_t_ &other) : Args(other.Args), hProgram(other.hProgram), _name(other._name), _subhandler(other._subhandler), _localArgInfo(other._localArgInfo), _localMemPool(other._localMemPool), _localMemPoolSize(other._localMemPoolSize), - ReqdWGSize(other.ReqdWGSize) { + ReqdWGSize(other.ReqdWGSize), NDRangeKernel(other.NDRangeKernel) { incrementReferenceCount(); } @@ -52,10 +48,11 @@ struct ur_kernel_handle_t_ : RefCounted { nativecpu_task_t subhandler, std::optional ReqdWGSize, std::optional MaxWGSize, - std::optional MaxLinearWGSize) + std::optional MaxLinearWGSize, + bool isNDRangeKernel) : hProgram(hProgram), _name{name}, _subhandler{std::move(subhandler)}, ReqdWGSize(ReqdWGSize), MaxWGSize(MaxWGSize), - MaxLinearWGSize(MaxLinearWGSize) {} + MaxLinearWGSize(MaxLinearWGSize), NDRangeKernel(isNDRangeKernel) {} struct arguments { using args_index_t = std::vector; @@ -162,10 +159,13 @@ struct ur_kernel_handle_t_ : RefCounted { void addPtrArg(void *Ptr, size_t Index) { Args.addPtrArg(Index, Ptr); } + bool isNDRangeKernel() const { return NDRangeKernel; } + private: char *_localMemPool = nullptr; size_t _localMemPoolSize = 0; std::optional ReqdWGSize = std::nullopt; std::optional MaxWGSize = std::nullopt; std::optional MaxLinearWGSize = std::nullopt; + const bool NDRangeKernel = false; }; diff --git a/source/adapters/native_cpu/program.cpp b/source/adapters/native_cpu/program.cpp index bc7baeb387..36240b2593 100644 --- a/source/adapters/native_cpu/program.cpp +++ b/source/adapters/native_cpu/program.cpp @@ -8,6 +8,7 @@ // //===----------------------------------------------------------------------===// +#include "ur/ur.hpp" #include "ur_api.h" #include "common.hpp" @@ -99,14 +100,25 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramCreateWithBinary( } } - const nativecpu_entry *nativecpu_it = - reinterpret_cast(pBinary); + const nativecpu_program *program = + reinterpret_cast(pBinary); + const nativecpu_entry *nativecpu_it = program->entries; while (nativecpu_it->kernel_ptr != nullptr) { hProgram->_kernels.insert( std::make_pair(nativecpu_it->kernelname, nativecpu_it->kernel_ptr)); nativecpu_it++; } + // Process Native CPU specific properties + const _pi_device_binary_property_set_struct *props = program->properties; + for (auto prop = props->PropertiesBegin; prop != props->PropertiesEnd; + prop++) { + auto [Prefix, Tag] = splitMetadataName(prop->Name); + if (Tag == "@is_nd_range") { + hProgram->KernelIsNDRangeMD[Prefix] = prop->ValSize; + } + } + *phProgram = hProgram.release(); return UR_RESULT_SUCCESS; diff --git a/source/adapters/native_cpu/program.hpp b/source/adapters/native_cpu/program.hpp index d58412751e..c76f97d561 100644 --- a/source/adapters/native_cpu/program.hpp +++ b/source/adapters/native_cpu/program.hpp @@ -41,13 +41,40 @@ struct ur_program_handle_t_ : RefCounted { std::unordered_map KernelMaxWorkGroupSizeMD; std::unordered_map KernelMaxLinearWorkGroupSizeMD; + std::unordered_map KernelIsNDRangeMD; }; -// The nativecpu_entry struct is also defined as LLVM-IR in the +// These structs are also defined as LLVM-IR in the // clang-offload-wrapper tool. The two definitions need to match, // therefore any change to this struct needs to be reflected in the // offload-wrapper. + struct nativecpu_entry { const char *kernelname; const unsigned char *kernel_ptr; }; + +typedef enum { + PI_PROPERTY_TYPE_INT32, + PI_PROPERTY_TYPE_STRING +} pi_property_type; + +struct _pi_device_binary_property_struct { + char *Name; + void *ValAddr; + pi_property_type Type; + uint64_t ValSize; +}; + +// TODO These property structs are taken from clang-offload-wrapper, +// perhaps we could define something that fits better our purposes? +struct _pi_device_binary_property_set_struct { + char *Name; + _pi_device_binary_property_struct *PropertiesBegin; + _pi_device_binary_property_struct *PropertiesEnd; +}; + +struct nativecpu_program { + nativecpu_entry *entries; + _pi_device_binary_property_set_struct *properties; +};