Skip to content

Commit

Permalink
Process Native CPU only properties
Browse files Browse the repository at this point in the history
  • Loading branch information
PietroGhg committed Nov 29, 2024
1 parent 0c60db6 commit 8b9583c
Show file tree
Hide file tree
Showing 5 changed files with 61 additions and 20 deletions.
15 changes: 6 additions & 9 deletions source/adapters/native_cpu/enqueue.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -138,13 +138,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
#else
bool isLocalSizeOne =
ndr.LocalSize[0] == 1 && ndr.LocalSize[1] == 1 && ndr.LocalSize[2] == 1;
if (isLocalSizeOne && ndr.GlobalSize[0] > numParallelThreads) {
// If the local size is one, we make the assumption that we are running a
// parallel_for over a sycl::range.
// Todo: we could add compiler checks and
// kernel properties for this (e.g. check that no barriers are called, no
// local memory args).

if (isLocalSizeOne && !hKernel->isNDRangeKernel()) {
// Todo: this assumes that dim 0 is the best dimension over which we want to
// parallelize

Expand All @@ -153,8 +147,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
// divide the global range by the number of threads, set that as the local
// size and peel everything else.

size_t new_num_work_groups_0 = numParallelThreads;
size_t itemsPerThread = ndr.GlobalSize[0] / numParallelThreads;
size_t new_num_work_groups_0 =
ndr.GlobalSize[0] > numParallelThreads ? numParallelThreads : 1;
size_t itemsPerThread = ndr.GlobalSize[0] > numParallelThreads
? ndr.GlobalSize[0] / numParallelThreads
: ndr.GlobalSize[0];

for (unsigned g2 = 0; g2 < numWG2; g2++) {
for (unsigned g1 = 0; g1 < numWG1; g1++) {
Expand Down
7 changes: 6 additions & 1 deletion source/adapters/native_cpu/kernel.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -47,8 +47,13 @@ urKernelCreate(ur_program_handle_t hProgram, const char *pKernelName,
if (auto MaxLIt = MaxLinMap.find(pKernelName); MaxLIt != MaxLinMap.end()) {
MaxLinearWG = MaxLIt->second;
}
bool IsNDRangeKernel = false;
if (auto isNDIt = hProgram->KernelIsNDRangeMD.find(pKernelName);
isNDIt != hProgram->KernelIsNDRangeMD.end()) {
IsNDRangeKernel = isNDIt->second;
}
kernel = new ur_kernel_handle_t_(hProgram, pKernelName, *f, ReqdWG, MaxWG,
MaxLinearWG);
MaxLinearWG, IsNDRangeKernel);

*phKernel = kernel;

Expand Down
14 changes: 7 additions & 7 deletions source/adapters/native_cpu/kernel.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,16 +28,12 @@ struct local_arg_info_t {

struct ur_kernel_handle_t_ : RefCounted {

ur_kernel_handle_t_(ur_program_handle_t hProgram, const char *name,
nativecpu_task_t subhandler)
: hProgram(hProgram), _name{name}, _subhandler{std::move(subhandler)} {}

ur_kernel_handle_t_(const ur_kernel_handle_t_ &other)
: Args(other.Args), hProgram(other.hProgram), _name(other._name),
_subhandler(other._subhandler), _localArgInfo(other._localArgInfo),
_localMemPool(other._localMemPool),
_localMemPoolSize(other._localMemPoolSize),
ReqdWGSize(other.ReqdWGSize) {
ReqdWGSize(other.ReqdWGSize), NDRangeKernel(other.NDRangeKernel) {
incrementReferenceCount();
}

Expand All @@ -52,10 +48,11 @@ struct ur_kernel_handle_t_ : RefCounted {
nativecpu_task_t subhandler,
std::optional<native_cpu::WGSize_t> ReqdWGSize,
std::optional<native_cpu::WGSize_t> MaxWGSize,
std::optional<uint64_t> MaxLinearWGSize)
std::optional<uint64_t> MaxLinearWGSize,
bool isNDRangeKernel)
: hProgram(hProgram), _name{name}, _subhandler{std::move(subhandler)},
ReqdWGSize(ReqdWGSize), MaxWGSize(MaxWGSize),
MaxLinearWGSize(MaxLinearWGSize) {}
MaxLinearWGSize(MaxLinearWGSize), NDRangeKernel(isNDRangeKernel) {}

struct arguments {
using args_index_t = std::vector<void *>;
Expand Down Expand Up @@ -162,10 +159,13 @@ struct ur_kernel_handle_t_ : RefCounted {

void addPtrArg(void *Ptr, size_t Index) { Args.addPtrArg(Index, Ptr); }

bool isNDRangeKernel() const { return NDRangeKernel; }

private:
char *_localMemPool = nullptr;
size_t _localMemPoolSize = 0;
std::optional<native_cpu::WGSize_t> ReqdWGSize = std::nullopt;
std::optional<native_cpu::WGSize_t> MaxWGSize = std::nullopt;
std::optional<uint64_t> MaxLinearWGSize = std::nullopt;
const bool NDRangeKernel = false;
};
16 changes: 14 additions & 2 deletions source/adapters/native_cpu/program.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
//
//===----------------------------------------------------------------------===//

#include "ur/ur.hpp"
#include "ur_api.h"

#include "common.hpp"
Expand Down Expand Up @@ -99,14 +100,25 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramCreateWithBinary(
}
}

const nativecpu_entry *nativecpu_it =
reinterpret_cast<const nativecpu_entry *>(pBinary);
const nativecpu_program *program =
reinterpret_cast<const nativecpu_program *>(pBinary);
const nativecpu_entry *nativecpu_it = program->entries;
while (nativecpu_it->kernel_ptr != nullptr) {
hProgram->_kernels.insert(
std::make_pair(nativecpu_it->kernelname, nativecpu_it->kernel_ptr));
nativecpu_it++;
}

// Process Native CPU specific properties
const _pi_device_binary_property_set_struct *props = program->properties;
for (auto prop = props->PropertiesBegin; prop != props->PropertiesEnd;
prop++) {
auto [Prefix, Tag] = splitMetadataName(prop->Name);
if (Tag == "@is_nd_range") {
hProgram->KernelIsNDRangeMD[Prefix] = prop->ValSize;
}
}

*phProgram = hProgram.release();

return UR_RESULT_SUCCESS;
Expand Down
29 changes: 28 additions & 1 deletion source/adapters/native_cpu/program.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -41,13 +41,40 @@ struct ur_program_handle_t_ : RefCounted {
std::unordered_map<std::string, native_cpu::WGSize_t>
KernelMaxWorkGroupSizeMD;
std::unordered_map<std::string, uint64_t> KernelMaxLinearWorkGroupSizeMD;
std::unordered_map<std::string, bool> KernelIsNDRangeMD;
};

// The nativecpu_entry struct is also defined as LLVM-IR in the
// These structs are also defined as LLVM-IR in the
// clang-offload-wrapper tool. The two definitions need to match,
// therefore any change to this struct needs to be reflected in the
// offload-wrapper.

struct nativecpu_entry {
const char *kernelname;
const unsigned char *kernel_ptr;
};

typedef enum {
PI_PROPERTY_TYPE_INT32,
PI_PROPERTY_TYPE_STRING
} pi_property_type;

struct _pi_device_binary_property_struct {
char *Name;
void *ValAddr;
pi_property_type Type;
uint64_t ValSize;
};

// TODO These property structs are taken from clang-offload-wrapper,
// perhaps we could define something that fits better our purposes?
struct _pi_device_binary_property_set_struct {
char *Name;
_pi_device_binary_property_struct *PropertiesBegin;
_pi_device_binary_property_struct *PropertiesEnd;
};

struct nativecpu_program {
nativecpu_entry *entries;
_pi_device_binary_property_set_struct *properties;
};

0 comments on commit 8b9583c

Please sign in to comment.