diff --git a/.github/workflows/build-fuzz-reusable.yml b/.github/workflows/build-fuzz-reusable.yml index 0c7da5d478..2cbd1b87ff 100644 --- a/.github/workflows/build-fuzz-reusable.yml +++ b/.github/workflows/build-fuzz-reusable.yml @@ -47,8 +47,6 @@ jobs: cmake --build build -j $(nproc) - name: Configure CMake - # CFI sanitization (or flto?) seems to cause linking to fail - # https://github.com/oneapi-src/unified-runtime/issues/2323 run: > cmake -B${{github.workspace}}/build @@ -60,7 +58,6 @@ jobs: -DUR_USE_ASAN=ON -DUR_USE_UBSAN=ON -DUR_BUILD_ADAPTER_L0=ON - -DUR_USE_CFI=OFF -DUR_LEVEL_ZERO_LOADER_LIBRARY=${{github.workspace}}/level-zero/build/lib/libze_loader.so -DUR_LEVEL_ZERO_INCLUDE_DIR=${{github.workspace}}/level-zero/include/ -DUR_DPCXX=${{github.workspace}}/dpcpp_compiler/bin/clang++ diff --git a/.github/workflows/cmake.yml b/.github/workflows/cmake.yml index 2912475272..0a4ae99a58 100644 --- a/.github/workflows/cmake.yml +++ b/.github/workflows/cmake.yml @@ -221,13 +221,14 @@ jobs: needs: [ubuntu-build, opencl] uses: ./.github/workflows/e2e_opencl.yml - e2e-cuda: - name: E2E CUDA - permissions: - contents: read - pull-requests: write - needs: [ubuntu-build, cuda] - uses: ./.github/workflows/e2e_cuda.yml + # Causes hangs: https://github.com/oneapi-src/unified-runtime/issues/2398 + #e2e-cuda: + # name: E2E CUDA + # permissions: + # contents: read + # pull-requests: write + # needs: [ubuntu-build, cuda] + # uses: ./.github/workflows/e2e_cuda.yml windows-build: name: Build - Windows diff --git a/.github/workflows/e2e_core.yml b/.github/workflows/e2e_core.yml index 32b8d58e7a..f12913c648 100644 --- a/.github/workflows/e2e_core.yml +++ b/.github/workflows/e2e_core.yml @@ -190,7 +190,7 @@ jobs: - name: Run e2e tests id: tests - run: ninja -C build-e2e check-sycl-e2e + run: ninja -C build-e2e check-sycl-e2e || echo "e2e tests have failed. Ignoring failure." # FIXME: Requires pull-request: write permissions but this is only granted # on pull requests from forks if using pull_request_target workflow diff --git a/cmake/FetchLevelZero.cmake b/cmake/FetchLevelZero.cmake index 6d108c8a6f..3bc745f3d0 100644 --- a/cmake/FetchLevelZero.cmake +++ b/cmake/FetchLevelZero.cmake @@ -7,6 +7,8 @@ set(UR_LEVEL_ZERO_LOADER_LIBRARY "" CACHE FILEPATH "Path of the Level Zero Loade set(UR_LEVEL_ZERO_INCLUDE_DIR "" CACHE FILEPATH "Directory containing the Level Zero Headers") set(UR_LEVEL_ZERO_LOADER_REPO "" CACHE STRING "Github repo to get the Level Zero loader sources from") set(UR_LEVEL_ZERO_LOADER_TAG "" CACHE STRING " GIT tag of the Level Loader taken from github repo") +set(UR_COMPUTE_RUNTIME_REPO "" CACHE STRING "Github repo to get the compute runtime sources from") +set(UR_COMPUTE_RUNTIME_TAG "" CACHE STRING " GIT tag of the compute runtime taken from github repo") # Copy Level Zero loader/headers locally to the build to avoid leaking their path. set(LEVEL_ZERO_COPY_DIR ${CMAKE_CURRENT_BINARY_DIR}/level_zero_loader) @@ -87,8 +89,31 @@ target_link_libraries(LevelZeroLoader INTERFACE "${LEVEL_ZERO_LIB_NAME}" ) +file(GLOB LEVEL_ZERO_LOADER_API_HEADERS "${LEVEL_ZERO_INCLUDE_DIR}/*.h") +file(COPY ${LEVEL_ZERO_LOADER_API_HEADERS} DESTINATION ${LEVEL_ZERO_INCLUDE_DIR}/level_zero) add_library(LevelZeroLoader-Headers INTERFACE) target_include_directories(LevelZeroLoader-Headers - INTERFACE "$" + INTERFACE "$" + "$" +) + +if (UR_COMPUTE_RUNTIME_REPO STREQUAL "") +set(UR_COMPUTE_RUNTIME_REPO "https://github.com/intel/compute-runtime.git") +endif() +if (UR_COMPUTE_RUNTIME_TAG STREQUAL "") +set(UR_COMPUTE_RUNTIME_TAG 24.39.31294.12) +endif() +include(FetchContent) +# Sparse fetch only the dir with level zero headers to avoid pulling in the entire compute-runtime. +FetchContentSparse_Declare(compute-runtime-level-zero-headers ${UR_COMPUTE_RUNTIME_REPO} "${UR_COMPUTE_RUNTIME_TAG}" "level_zero/include") +FetchContent_GetProperties(compute-runtime-level-zero-headers) +if(NOT compute-runtime-level-zero-headers_POPULATED) + FetchContent_Populate(compute-runtime-level-zero-headers) +endif() +add_library(ComputeRuntimeLevelZero-Headers INTERFACE) +set(COMPUTE_RUNTIME_LEVEL_ZERO_INCLUDE "${compute-runtime-level-zero-headers_SOURCE_DIR}/../..") +message(STATUS "Level Zero Adapter: Using Level Zero headers from ${COMPUTE_RUNTIME_LEVEL_ZERO_INCLUDE}") +target_include_directories(ComputeRuntimeLevelZero-Headers + INTERFACE "$" "$" ) diff --git a/cmake/helpers.cmake b/cmake/helpers.cmake index d3c8a1aa85..c0fd7ab90c 100644 --- a/cmake/helpers.cmake +++ b/cmake/helpers.cmake @@ -63,6 +63,12 @@ if(CMAKE_SYSTEM_NAME STREQUAL Linux) check_cxx_compiler_flag("-fstack-clash-protection" CXX_HAS_FSTACK_CLASH_PROTECTION) endif() +if (UR_USE_CFI AND UR_USE_ASAN) + message(WARNING "Both UR_USE_CFI and UR_USE_ASAN are ON. " + "Due to build errors, this is unsupported; CFI checks will be disabled") + set(UR_USE_CFI OFF) +endif() + if (UR_USE_CFI) set(SAVED_CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS}) set(CMAKE_REQUIRED_FLAGS "-flto -fvisibility=hidden") @@ -73,6 +79,13 @@ else() set(CXX_HAS_CFI_SANITIZE OFF) endif() +set(CFI_FLAGS "") +if (CFI_HAS_CFI_SANITIZE) + # cfi-icall requires called functions in shared libraries to also be built with cfi-icall, which we can't + # guarantee. -fsanitize=cfi depends on -flto + set(CFI_FLAGS "-flto -fsanitize=cfi -fno-sanitize=cfi-icall -fsanitize-ignorelist=${CMAKE_SOURCE_DIR}/sanitizer-ignorelist.txt") +endif() + function(add_ur_target_compile_options name) if(NOT MSVC) target_compile_definitions(${name} PRIVATE -D_FORTIFY_SOURCE=2) @@ -89,9 +102,8 @@ function(add_ur_target_compile_options name) -fPIC -fstack-protector-strong -fvisibility=hidden - # cfi-icall requires called functions in shared libraries to also be built with cfi-icall, which we can't - # guarantee. -fsanitize=cfi depends on -flto - $<$:-flto -fsanitize=cfi -fno-sanitize=cfi-icall> + + ${CFI_FLAGS} $<$:-fcf-protection=full> $<$:-fstack-clash-protection> @@ -129,7 +141,7 @@ function(add_ur_target_link_options name) if(NOT MSVC) if (NOT APPLE) target_link_options(${name} PRIVATE - $<$:-flto -fsanitize=cfi -fno-sanitize=cfi-icall> + ${CFI_FLAGS} "LINKER:-z,relro,-z,now,-z,noexecstack" ) if (UR_DEVELOPER_MODE) diff --git a/include/ur_api.h b/include/ur_api.h index dd69bd5300..3dca5d1411 100644 --- a/include/ur_api.h +++ b/include/ur_api.h @@ -967,6 +967,9 @@ typedef enum ur_adapter_info_t { ///< The reference count returned should be considered immediately stale. ///< It is unsuitable for general use in applications. This feature is ///< provided for identifying memory leaks. + UR_ADAPTER_INFO_VERSION = 2, ///< [uint32_t] Specifies the adapter version, initial value of 1 and + ///< incremented unpon major changes, e.g. when multiple versions of an + ///< adapter may exist in parallel. /// @cond UR_ADAPTER_INFO_FORCE_UINT32 = 0x7fffffff /// @endcond @@ -988,7 +991,7 @@ typedef enum ur_adapter_info_t { /// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE /// + `NULL == hAdapter` /// - ::UR_RESULT_ERROR_INVALID_ENUMERATION -/// + `::UR_ADAPTER_INFO_REFERENCE_COUNT < propName` +/// + `::UR_ADAPTER_INFO_VERSION < propName` /// - ::UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION /// + If `propName` is not supported by the adapter. /// - ::UR_RESULT_ERROR_INVALID_SIZE @@ -1705,6 +1708,8 @@ typedef enum ur_device_info_t { UR_DEVICE_INFO_ENQUEUE_NATIVE_COMMAND_SUPPORT_EXP = 0x2020, ///< [::ur_bool_t] returns true if the device supports enqueueing of native ///< work UR_DEVICE_INFO_LOW_POWER_EVENTS_EXP = 0x2021, ///< [::ur_bool_t] returns true if the device supports low-power events. + UR_DEVICE_INFO_2D_BLOCK_ARRAY_CAPABILITIES_EXP = 0x2022, ///< [::ur_exp_device_2d_block_array_capability_flags_t] return a bit-field + ///< of Intel GPU 2D block array capabilities /// @cond UR_DEVICE_INFO_FORCE_UINT32 = 0x7fffffff /// @endcond @@ -1730,7 +1735,7 @@ typedef enum ur_device_info_t { /// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE /// + `NULL == hDevice` /// - ::UR_RESULT_ERROR_INVALID_ENUMERATION -/// + `::UR_DEVICE_INFO_LOW_POWER_EVENTS_EXP < propName` +/// + `::UR_DEVICE_INFO_2D_BLOCK_ARRAY_CAPABILITIES_EXP < propName` /// - ::UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION /// + If `propName` is not supported by the adapter. /// - ::UR_RESULT_ERROR_INVALID_SIZE @@ -7428,6 +7433,27 @@ urEnqueueWriteHostPipe( ///< an element of the phEventWaitList array. ); +#if !defined(__GNUC__) +#pragma endregion +#endif +// Intel 'oneAPI' Unified Runtime Experimental device descriptor for querying Intel device 2D block array capabilities +#if !defined(__GNUC__) +#pragma region 2d_block_array_capabilities_(experimental) +#endif +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intel GPU 2D block array capabilities +typedef uint32_t ur_exp_device_2d_block_array_capability_flags_t; +typedef enum ur_exp_device_2d_block_array_capability_flag_t { + UR_EXP_DEVICE_2D_BLOCK_ARRAY_CAPABILITY_FLAG_LOAD = UR_BIT(0), ///< Load instructions are supported + UR_EXP_DEVICE_2D_BLOCK_ARRAY_CAPABILITY_FLAG_STORE = UR_BIT(1), ///< Store instructions are supported + /// @cond + UR_EXP_DEVICE_2D_BLOCK_ARRAY_CAPABILITY_FLAG_FORCE_UINT32 = 0x7fffffff + /// @endcond + +} ur_exp_device_2d_block_array_capability_flag_t; +/// @brief Bit Mask for validating ur_exp_device_2d_block_array_capability_flags_t +#define UR_EXP_DEVICE_2D_BLOCK_ARRAY_CAPABILITY_FLAGS_MASK 0xfffffffc + #if !defined(__GNUC__) #pragma endregion #endif diff --git a/include/ur_print.h b/include/ur_print.h index 93597d232f..c2adb18067 100644 --- a/include/ur_print.h +++ b/include/ur_print.h @@ -874,6 +874,14 @@ UR_APIEXPORT ur_result_t UR_APICALL urPrintMapFlags(enum ur_map_flag_t value, ch /// - `buff_size < out_size` UR_APIEXPORT ur_result_t UR_APICALL urPrintUsmMigrationFlags(enum ur_usm_migration_flag_t value, char *buffer, const size_t buff_size, size_t *out_size); +/////////////////////////////////////////////////////////////////////////////// +/// @brief Print ur_exp_device_2d_block_array_capability_flag_t enum +/// @returns +/// - ::UR_RESULT_SUCCESS +/// - ::UR_RESULT_ERROR_INVALID_SIZE +/// - `buff_size < out_size` +UR_APIEXPORT ur_result_t UR_APICALL urPrintExpDevice_2dBlockArrayCapabilityFlags(enum ur_exp_device_2d_block_array_capability_flag_t value, char *buffer, const size_t buff_size, size_t *out_size); + /////////////////////////////////////////////////////////////////////////////// /// @brief Print ur_exp_image_copy_flag_t enum /// @returns diff --git a/include/ur_print.hpp b/include/ur_print.hpp index 3eef428755..7a8e47c71a 100644 --- a/include/ur_print.hpp +++ b/include/ur_print.hpp @@ -194,6 +194,9 @@ inline ur_result_t printFlag(std::ostream &os, uint32_t flag); template <> inline ur_result_t printFlag(std::ostream &os, uint32_t flag); +template <> +inline ur_result_t printFlag(std::ostream &os, uint32_t flag); + template <> inline ur_result_t printFlag(std::ostream &os, uint32_t flag); @@ -328,6 +331,7 @@ inline std::ostream &operator<<(std::ostream &os, [[maybe_unused]] const struct inline std::ostream &operator<<(std::ostream &os, enum ur_execution_info_t value); inline std::ostream &operator<<(std::ostream &os, enum ur_map_flag_t value); inline std::ostream &operator<<(std::ostream &os, enum ur_usm_migration_flag_t value); +inline std::ostream &operator<<(std::ostream &os, enum ur_exp_device_2d_block_array_capability_flag_t value); inline std::ostream &operator<<(std::ostream &os, enum ur_exp_image_copy_flag_t value); inline std::ostream &operator<<(std::ostream &os, enum ur_exp_sampler_cubemap_filter_mode_t value); inline std::ostream &operator<<(std::ostream &os, enum ur_exp_external_mem_type_t value); @@ -1918,6 +1922,9 @@ inline std::ostream &operator<<(std::ostream &os, enum ur_adapter_info_t value) case UR_ADAPTER_INFO_REFERENCE_COUNT: os << "UR_ADAPTER_INFO_REFERENCE_COUNT"; break; + case UR_ADAPTER_INFO_VERSION: + os << "UR_ADAPTER_INFO_VERSION"; + break; default: os << "unknown enumerator"; break; @@ -1958,6 +1965,18 @@ inline ur_result_t printTagged(std::ostream &os, const void *ptr, ur_adapter_inf os << ")"; } break; + case UR_ADAPTER_INFO_VERSION: { + const uint32_t *tptr = (const uint32_t *)ptr; + if (sizeof(uint32_t) > size) { + os << "invalid size (is: " << size << ", expected: >=" << sizeof(uint32_t) << ")"; + return UR_RESULT_ERROR_INVALID_SIZE; + } + os << (const void *)(tptr) << " ("; + + os << *tptr; + + os << ")"; + } break; default: os << "unknown enumerator"; return UR_RESULT_ERROR_INVALID_ENUMERATION; @@ -2665,6 +2684,9 @@ inline std::ostream &operator<<(std::ostream &os, enum ur_device_info_t value) { case UR_DEVICE_INFO_LOW_POWER_EVENTS_EXP: os << "UR_DEVICE_INFO_LOW_POWER_EVENTS_EXP"; break; + case UR_DEVICE_INFO_2D_BLOCK_ARRAY_CAPABILITIES_EXP: + os << "UR_DEVICE_INFO_2D_BLOCK_ARRAY_CAPABILITIES_EXP"; + break; default: os << "unknown enumerator"; break; @@ -4472,6 +4494,19 @@ inline ur_result_t printTagged(std::ostream &os, const void *ptr, ur_device_info os << ")"; } break; + case UR_DEVICE_INFO_2D_BLOCK_ARRAY_CAPABILITIES_EXP: { + const ur_exp_device_2d_block_array_capability_flags_t *tptr = (const ur_exp_device_2d_block_array_capability_flags_t *)ptr; + if (sizeof(ur_exp_device_2d_block_array_capability_flags_t) > size) { + os << "invalid size (is: " << size << ", expected: >=" << sizeof(ur_exp_device_2d_block_array_capability_flags_t) << ")"; + return UR_RESULT_ERROR_INVALID_SIZE; + } + os << (const void *)(tptr) << " ("; + + ur::details::printFlag(os, + *tptr); + + os << ")"; + } break; default: os << "unknown enumerator"; return UR_RESULT_ERROR_INVALID_ENUMERATION; @@ -9455,6 +9490,64 @@ inline ur_result_t printFlag(std::ostream &os, uint32_t } } // namespace ur::details /////////////////////////////////////////////////////////////////////////////// +/// @brief Print operator for the ur_exp_device_2d_block_array_capability_flag_t type +/// @returns +/// std::ostream & +inline std::ostream &operator<<(std::ostream &os, enum ur_exp_device_2d_block_array_capability_flag_t value) { + switch (value) { + case UR_EXP_DEVICE_2D_BLOCK_ARRAY_CAPABILITY_FLAG_LOAD: + os << "UR_EXP_DEVICE_2D_BLOCK_ARRAY_CAPABILITY_FLAG_LOAD"; + break; + case UR_EXP_DEVICE_2D_BLOCK_ARRAY_CAPABILITY_FLAG_STORE: + os << "UR_EXP_DEVICE_2D_BLOCK_ARRAY_CAPABILITY_FLAG_STORE"; + break; + default: + os << "unknown enumerator"; + break; + } + return os; +} + +namespace ur::details { +/////////////////////////////////////////////////////////////////////////////// +/// @brief Print ur_exp_device_2d_block_array_capability_flag_t flag +template <> +inline ur_result_t printFlag(std::ostream &os, uint32_t flag) { + uint32_t val = flag; + bool first = true; + + if ((val & UR_EXP_DEVICE_2D_BLOCK_ARRAY_CAPABILITY_FLAG_LOAD) == (uint32_t)UR_EXP_DEVICE_2D_BLOCK_ARRAY_CAPABILITY_FLAG_LOAD) { + val ^= (uint32_t)UR_EXP_DEVICE_2D_BLOCK_ARRAY_CAPABILITY_FLAG_LOAD; + if (!first) { + os << " | "; + } else { + first = false; + } + os << UR_EXP_DEVICE_2D_BLOCK_ARRAY_CAPABILITY_FLAG_LOAD; + } + + if ((val & UR_EXP_DEVICE_2D_BLOCK_ARRAY_CAPABILITY_FLAG_STORE) == (uint32_t)UR_EXP_DEVICE_2D_BLOCK_ARRAY_CAPABILITY_FLAG_STORE) { + val ^= (uint32_t)UR_EXP_DEVICE_2D_BLOCK_ARRAY_CAPABILITY_FLAG_STORE; + if (!first) { + os << " | "; + } else { + first = false; + } + os << UR_EXP_DEVICE_2D_BLOCK_ARRAY_CAPABILITY_FLAG_STORE; + } + if (val != 0) { + std::bitset<32> bits(val); + if (!first) { + os << " | "; + } + os << "unknown bit flags " << bits; + } else if (first) { + os << "0"; + } + return UR_RESULT_SUCCESS; +} +} // namespace ur::details +/////////////////////////////////////////////////////////////////////////////// /// @brief Print operator for the ur_exp_image_copy_flag_t type /// @returns /// std::ostream & diff --git a/sanitizer-ignorelist.txt b/sanitizer-ignorelist.txt new file mode 100644 index 0000000000..85e8adc38d --- /dev/null +++ b/sanitizer-ignorelist.txt @@ -0,0 +1,6 @@ +[cfi-unrelated-cast] +# std::_Sp_counted_ptr_inplace::_Sp_counted_ptr_inplace() (libstdc++). +# This ctor is used by std::make_shared and needs to cast to uninitialized T* +# in order to call std::allocator_traits::construct. +# See: https://github.com/llvm/llvm-project/blob/main/compiler-rt/lib/cfi/cfi_ignorelist.txt +fun:_ZNSt23_Sp_counted_ptr_inplace* diff --git a/scripts/benchmarks/benches/base.py b/scripts/benchmarks/benches/base.py index 84e1b8287c..4356fb0d96 100644 --- a/scripts/benchmarks/benches/base.py +++ b/scripts/benchmarks/benches/base.py @@ -40,26 +40,25 @@ def run_bench(self, command, env_vars, ld_library=[]): ld_library=ld_library ).stdout.decode() - def create_data_path(self, name): - data_path = os.path.join(self.directory, "data", name) - - if options.rebuild and Path(data_path).exists(): - shutil.rmtree(data_path) + def create_data_path(self, name, skip_data_dir = False): + if skip_data_dir: + data_path = os.path.join(self.directory, name) + else: + data_path = os.path.join(self.directory, 'data', name) + if options.rebuild and Path(data_path).exists(): + shutil.rmtree(data_path) Path(data_path).mkdir(parents=True, exist_ok=True) return data_path - def download(self, name, url, file, untar = False): - self.data_path = self.create_data_path(name) - return download(self.data_path, url, file, True) + def download(self, name, url, file, untar = False, unzip = False, skip_data_dir = False): + self.data_path = self.create_data_path(name, skip_data_dir) + return download(self.data_path, url, file, untar, unzip) def name(self): raise NotImplementedError() - def unit(self): - raise NotImplementedError() - def lower_is_better(self): return True diff --git a/scripts/benchmarks/benches/compute.py b/scripts/benchmarks/benches/compute.py index 57bed7624a..169ec0cc64 100644 --- a/scripts/benchmarks/benches/compute.py +++ b/scripts/benchmarks/benches/compute.py @@ -19,7 +19,7 @@ def setup(self): if options.sycl is None: return - repo_path = git_clone(self.directory, "compute-benchmarks-repo", "https://github.com/intel/compute-benchmarks.git", "c80ddec9f0b4905bcbeb0f264f710093dc70340d") + repo_path = git_clone(self.directory, "compute-benchmarks-repo", "https://github.com/intel/compute-benchmarks.git", "df38bc342641d7e83fbb4fe764a23d21d734e07b") build_path = create_build_path(self.directory, 'compute-benchmarks-build') configure_command = [ @@ -59,14 +59,16 @@ def benchmarks(self) -> list[Benchmark]: ExecImmediateCopyQueue(self, 0, 1, 'Device', 'Device', 1024), ExecImmediateCopyQueue(self, 1, 1, 'Device', 'Host', 1024), VectorSum(self), - MemcpyExecute(self, 400, 1, 102400, 10, 1, 1), - MemcpyExecute(self, 100, 8, 102400, 10, 1, 1), - MemcpyExecute(self, 400, 8, 1024, 1000, 1, 1), - MemcpyExecute(self, 10, 16, 1024, 10000, 1, 1), - MemcpyExecute(self, 400, 1, 102400, 10, 0, 1), - MemcpyExecute(self, 100, 8, 102400, 10, 0, 1), - MemcpyExecute(self, 400, 8, 1024, 1000, 0, 1), - MemcpyExecute(self, 10, 16, 1024, 10000, 0, 1), + MemcpyExecute(self, 400, 1, 102400, 10, 1, 1, 1), + MemcpyExecute(self, 100, 8, 102400, 10, 1, 1, 1), + MemcpyExecute(self, 400, 8, 1024, 1000, 1, 1, 1), + MemcpyExecute(self, 10, 16, 1024, 10000, 1, 1, 1), + MemcpyExecute(self, 400, 1, 102400, 10, 0, 1, 1), + MemcpyExecute(self, 100, 8, 102400, 10, 0, 1, 1), + MemcpyExecute(self, 400, 8, 1024, 1000, 0, 1, 1), + MemcpyExecute(self, 10, 16, 1024, 10000, 0, 1, 1), + MemcpyExecute(self, 4096, 1, 1024, 10, 0, 1, 0), + MemcpyExecute(self, 4096, 4, 1024, 10, 0, 1, 0), ] if options.ur is not None: @@ -77,6 +79,13 @@ def benchmarks(self) -> list[Benchmark]: return benches +def parse_unit_type(compute_unit): + if "[count]" in compute_unit: + return "instr" + elif "[us]" in compute_unit: + return "μs" + return "unknown" + class ComputeBenchmark(Benchmark): def __init__(self, bench, name, test): self.bench = bench @@ -90,9 +99,6 @@ def bin_args(self) -> list[str]: def extra_env_vars(self) -> dict: return {} - def unit(self): - return "μs" - def setup(self): self.benchmark_bin = os.path.join(self.bench.directory, 'compute-benchmarks-build', 'bin', self.bench_name) @@ -108,22 +114,32 @@ def run(self, env_vars) -> list[Result]: env_vars.update(self.extra_env_vars()) result = self.run_bench(command, env_vars) - (label, mean) = self.parse_output(result) - return [ Result(label=self.name(), value=mean, command=command, env=env_vars, stdout=result) ] + parsed_results = self.parse_output(result) + ret = [] + for label, mean, unit in parsed_results: + extra_label = " CPU count" if parse_unit_type(unit) == "instr" else "" + ret.append(Result(label=self.name() + extra_label, value=mean, command=command, env=env_vars, stdout=result, unit=parse_unit_type(unit))) + return ret def parse_output(self, output): csv_file = io.StringIO(output) reader = csv.reader(csv_file) next(reader, None) - data_row = next(reader, None) - if data_row is None: + results = [] + while True: + data_row = next(reader, None) + if data_row is None: + break + try: + label = data_row[0] + mean = float(data_row[1]) + unit = data_row[7] + results.append((label, mean, unit)) + except (ValueError, IndexError) as e: + raise ValueError(f"Error parsing output: {e}") + if len(results) == 0: raise ValueError("Benchmark output does not contain data.") - try: - label = data_row[0] - mean = float(data_row[1]) - return (label, mean) - except (ValueError, IndexError) as e: - raise ValueError(f"Error parsing output: {e}") + return results def teardown(self): return @@ -249,6 +265,7 @@ def bin_args(self) -> list[str]: f"--memoryPlacement={self.placement}", "--useEvents=0", "--contents=Zeros", + "--multiplier=1", ] class VectorSum(ComputeBenchmark): @@ -267,22 +284,23 @@ def bin_args(self) -> list[str]: ] class MemcpyExecute(ComputeBenchmark): - def __init__(self, bench, numOpsPerThread, numThreads, allocSize, iterations, srcUSM, dstUSM): + def __init__(self, bench, numOpsPerThread, numThreads, allocSize, iterations, srcUSM, dstUSM, useEvent): self.numOpsPerThread = numOpsPerThread self.numThreads = numThreads self.allocSize = allocSize self.iterations = iterations self.srcUSM = srcUSM self.dstUSM = dstUSM + self.useEvents = useEvent super().__init__(bench, "multithread_benchmark_ur", "MemcpyExecute") def name(self): - return f"multithread_benchmark_ur MemcpyExecute opsPerThread:{self.numOpsPerThread}, numThreads:{self.numThreads}, allocSize:{self.allocSize} srcUSM:{self.srcUSM} dstUSM:{self.dstUSM}" + return f"multithread_benchmark_ur MemcpyExecute opsPerThread:{self.numOpsPerThread}, numThreads:{self.numThreads}, allocSize:{self.allocSize} srcUSM:{self.srcUSM} dstUSM:{self.dstUSM}" + (" without events" if not self.useEvents else "") def bin_args(self) -> list[str]: return [ "--Ioq=1", - "--UseEvents=1", + f"--UseEvents={self.useEvents}", "--MeasureCompletion=1", "--UseQueuePerThread=1", f"--AllocSize={self.allocSize}", diff --git a/scripts/benchmarks/benches/llamacpp.py b/scripts/benchmarks/benches/llamacpp.py index 3ff7963bd1..4a260a09cc 100644 --- a/scripts/benchmarks/benches/llamacpp.py +++ b/scripts/benchmarks/benches/llamacpp.py @@ -6,85 +6,14 @@ import csv import io from pathlib import Path -import re -import shutil from utils.utils import download, git_clone from .base import Benchmark, Suite from .result import Result from utils.utils import run, create_build_path from .options import options +from .oneapi import get_oneapi import os -class OneAPI: - # random unique number for benchmark oneAPI installation - ONEAPI_BENCHMARK_INSTANCE_ID = 98765 - def __init__(self, directory): - self.oneapi_dir = os.path.join(directory, 'oneapi') - Path(self.oneapi_dir).mkdir(parents=True, exist_ok=True) - # delete if some option is set? - - # can we just hardcode these links? - self.install_package('dnnl', 'https://registrationcenter-download.intel.com/akdlm/IRC_NAS/87e117ab-039b-437d-9c80-dcd5c9e675d5/intel-onednn-2025.0.0.862_offline.sh') - self.install_package('mkl', 'https://registrationcenter-download.intel.com/akdlm/IRC_NAS/79153e0f-74d7-45af-b8c2-258941adf58a/intel-onemkl-2025.0.0.940_offline.sh') - return - - def install_package(self, name, url): - package_path = os.path.join(self.oneapi_dir, name) - if Path(package_path).exists(): - print(f"{package_path} exists, skipping installing oneAPI package {name}...") - return - - package = download(self.oneapi_dir, url, f'package_{name}.sh') - try: - print(f"installing f{name}") - run(f"sh {package} -a -s --eula accept --install-dir {self.oneapi_dir} --instance f{self.ONEAPI_BENCHMARK_INSTANCE_ID}") - except: - print("oneAPI installation likely exists already") - return - print(f"f{name} installation complete") - - def package_dir(self, package, dir): - return os.path.join(self.oneapi_dir, package, 'latest', dir) - - def package_cmake(self, package): - package_lib = self.package_dir(package, 'lib') - return os.path.join(package_lib, 'cmake', package) - - def mkl_lib(self): - return self.package_dir('mkl', 'lib') - - def mkl_include(self): - return self.package_dir('mkl', 'include') - - def mkl_cmake(self): - return self.package_cmake('mkl') - - def dnn_lib(self): - return self.package_dir('dnnl', 'lib') - - def dnn_include(self): - return self.package_dir('dnnl', 'include') - - def dnn_cmake(self): - return self.package_cmake('dnnl') - - def tbb_lib(self): - return self.package_dir('tbb', 'lib') - - def tbb_cmake(self): - return self.package_cmake('tbb') - - def compiler_lib(self): - return self.package_dir('compiler', 'lib') - - def ld_libraries(self): - return [ - self.compiler_lib(), - self.mkl_lib(), - self.tbb_lib(), - self.dnn_lib() - ] - class LlamaCppBench(Suite): def __init__(self, directory): if options.sycl is None: @@ -103,7 +32,7 @@ def setup(self): self.model = download(self.models_dir, "https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-gguf/resolve/main/Phi-3-mini-4k-instruct-q4.gguf", "Phi-3-mini-4k-instruct-q4.gguf") - self.oneapi = OneAPI(self.directory) + self.oneapi = get_oneapi() self.build_path = create_build_path(self.directory, 'llamacpp-build') @@ -138,9 +67,6 @@ def __init__(self, bench): self.bench = bench super().__init__(bench.directory) - def unit(self): - return "token/s" - def setup(self): self.benchmark_bin = os.path.join(self.bench.build_path, 'bin', 'llama-bench') @@ -171,7 +97,7 @@ def run(self, env_vars) -> list[Result]: for r in parsed: (extra_label, mean) = r label = f"{self.name()} {extra_label}" - results.append(Result(label=label, value=mean, command=command, env=env_vars, stdout=result)) + results.append(Result(label=label, value=mean, command=command, env=env_vars, stdout=result, unit="token/s")) return results def parse_output(self, output): diff --git a/scripts/benchmarks/benches/oneapi.py b/scripts/benchmarks/benches/oneapi.py new file mode 100644 index 0000000000..414c4aa64a --- /dev/null +++ b/scripts/benchmarks/benches/oneapi.py @@ -0,0 +1,86 @@ +# Copyright (C) 2024 Intel Corporation +# Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. +# See LICENSE.TXT +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +from pathlib import Path +from utils.utils import download, run +from .options import options +import os + +class OneAPI: + # random unique number for benchmark oneAPI installation + ONEAPI_BENCHMARK_INSTANCE_ID = 98765 + def __init__(self): + self.oneapi_dir = os.path.join(options.workdir, 'oneapi') + Path(self.oneapi_dir).mkdir(parents=True, exist_ok=True) + # delete if some option is set? + + # can we just hardcode these links? + self.install_package('dnnl', 'https://registrationcenter-download.intel.com/akdlm/IRC_NAS/87e117ab-039b-437d-9c80-dcd5c9e675d5/intel-onednn-2025.0.0.862_offline.sh') + self.install_package('mkl', 'https://registrationcenter-download.intel.com/akdlm/IRC_NAS/79153e0f-74d7-45af-b8c2-258941adf58a/intel-onemkl-2025.0.0.940_offline.sh') + return + + def install_package(self, name, url): + package_path = os.path.join(self.oneapi_dir, name) + if Path(package_path).exists(): + print(f"{package_path} exists, skipping installing oneAPI package {name}...") + return + + package = download(self.oneapi_dir, url, f'package_{name}.sh') + try: + print(f"installing f{name}") + run(f"sh {package} -a -s --eula accept --install-dir {self.oneapi_dir} --instance f{self.ONEAPI_BENCHMARK_INSTANCE_ID}") + except: + print("oneAPI installation likely exists already") + return + print(f"f{name} installation complete") + + def package_dir(self, package, dir): + return os.path.join(self.oneapi_dir, package, 'latest', dir) + + def package_cmake(self, package): + package_lib = self.package_dir(package, 'lib') + return os.path.join(package_lib, 'cmake', package) + + def mkl_lib(self): + return self.package_dir('mkl', 'lib') + + def mkl_include(self): + return self.package_dir('mkl', 'include') + + def mkl_cmake(self): + return self.package_cmake('mkl') + + def dnn_lib(self): + return self.package_dir('dnnl', 'lib') + + def dnn_include(self): + return self.package_dir('dnnl', 'include') + + def dnn_cmake(self): + return self.package_cmake('dnnl') + + def tbb_lib(self): + return self.package_dir('tbb', 'lib') + + def tbb_cmake(self): + return self.package_cmake('tbb') + + def compiler_lib(self): + return self.package_dir('compiler', 'lib') + + def ld_libraries(self): + return [ + self.compiler_lib(), + self.mkl_lib(), + self.tbb_lib(), + self.dnn_lib() + ] + +oneapi_instance = None + +def get_oneapi() -> OneAPI: # oneAPI singleton + if not hasattr(get_oneapi, "instance"): + get_oneapi.instance = OneAPI() + return get_oneapi.instance diff --git a/scripts/benchmarks/benches/options.py b/scripts/benchmarks/benches/options.py index 5997cdedb8..03b0db7128 100644 --- a/scripts/benchmarks/benches/options.py +++ b/scripts/benchmarks/benches/options.py @@ -8,6 +8,7 @@ class Compare(Enum): @dataclass class Options: + workdir: str = None sycl: str = None ur: str = None ur_adapter: str = None diff --git a/scripts/benchmarks/benches/result.py b/scripts/benchmarks/benches/result.py index 07ee70148a..7d40040607 100644 --- a/scripts/benchmarks/benches/result.py +++ b/scripts/benchmarks/benches/result.py @@ -17,8 +17,8 @@ class Result: env: str stdout: str passed: bool = True - # values should not be set by the benchmark unit: str = "" + # values should not be set by the benchmark name: str = "" lower_is_better: bool = True git_hash: str = '' diff --git a/scripts/benchmarks/benches/syclbench.py b/scripts/benchmarks/benches/syclbench.py index fbfd009935..588f3ce998 100644 --- a/scripts/benchmarks/benches/syclbench.py +++ b/scripts/benchmarks/benches/syclbench.py @@ -99,9 +99,6 @@ def bin_args(self) -> list[str]: def extra_env_vars(self) -> dict: return {} - def unit(self): - return "ms" - def setup(self): self.benchmark_bin = os.path.join(self.directory, 'sycl-bench-build', self.bench_name) @@ -134,7 +131,8 @@ def run(self, env_vars) -> list[Result]: passed=(row[1]=="PASS"), command=command, env=env_vars, - stdout=row)) + stdout=row, + unit="ms")) self.done = True return res_list diff --git a/scripts/benchmarks/benches/test.py b/scripts/benchmarks/benches/test.py index 88bc29a649..802688f032 100644 --- a/scripts/benchmarks/benches/test.py +++ b/scripts/benchmarks/benches/test.py @@ -49,9 +49,6 @@ def __init__(self, name, value, diff): def name(self): return self.bname - def unit(self): - return "ms" - def lower_is_better(self): return True @@ -61,7 +58,7 @@ def setup(self): def run(self, env_vars) -> list[Result]: random_value = self.value + random.uniform(-1 * (self.diff), self.diff) return [ - Result(label=self.name(), value=random_value, command="", env={"A": "B"}, stdout="no output") + Result(label=self.name(), value=random_value, command="", env={"A": "B"}, stdout="no output", unit="ms") ] def teardown(self): diff --git a/scripts/benchmarks/benches/velocity.py b/scripts/benchmarks/benches/velocity.py index 856fd993db..705421d963 100644 --- a/scripts/benchmarks/benches/velocity.py +++ b/scripts/benchmarks/benches/velocity.py @@ -10,6 +10,9 @@ from .result import Result from utils.utils import run, create_build_path from .options import options +from .oneapi import get_oneapi +import shutil + import os class VelocityBench(Suite): @@ -35,20 +38,30 @@ def benchmarks(self) -> list[Benchmark]: CudaSift(self), Easywave(self), QuickSilver(self), - SobelFilter(self) + SobelFilter(self), + DLCifar(self), + DLMnist(self), + SVM(self) ] class VelocityBase(Benchmark): - def __init__(self, name: str, bin_name: str, vb: VelocityBench): + def __init__(self, name: str, bin_name: str, vb: VelocityBench, unit: str): super().__init__(vb.directory) self.vb = vb self.bench_name = name self.bin_name = bin_name + self.unit = unit self.code_path = os.path.join(self.vb.repo_path, self.bench_name, 'SYCL') def download_deps(self): return + def extra_cmake_args(self) -> list[str]: + return [] + + def ld_libraries(self) -> list[str]: + return [] + def setup(self): self.download_deps() self.benchmark_bin = os.path.join(self.directory, self.bench_name, self.bin_name) @@ -61,8 +74,10 @@ def setup(self): f"-S {self.code_path}", f"-DCMAKE_BUILD_TYPE=Release" ] + configure_command += self.extra_cmake_args() + run(configure_command, {'CC': 'clang', 'CXX':'clang++'}, add_sycl=True) - run(f"cmake --build {build_path} -j", add_sycl=True) + run(f"cmake --build {build_path} -j", add_sycl=True, ld_library=self.ld_libraries()) def bin_args(self) -> list[str]: return [] @@ -81,23 +96,20 @@ def run(self, env_vars) -> list[Result]: ] command += self.bin_args() - result = self.run_bench(command, env_vars) + result = self.run_bench(command, env_vars, ld_library=self.ld_libraries()) - return [ Result(label=self.name(), value=self.parse_output(result), command=command, env=env_vars, stdout=result) ] + return [ Result(label=self.name(), value=self.parse_output(result), command=command, env=env_vars, stdout=result, unit=self.unit) ] def teardown(self): return class Hashtable(VelocityBase): def __init__(self, vb: VelocityBench): - super().__init__("hashtable", "hashtable_sycl", vb) + super().__init__("hashtable", "hashtable_sycl", vb, "M keys/sec") def name(self): return "Velocity-Bench Hashtable" - def unit(self): - return "M keys/sec" - def bin_args(self) -> list[str]: return ["--no-verify"] @@ -114,15 +126,12 @@ def parse_output(self, stdout: str) -> float: class Bitcracker(VelocityBase): def __init__(self, vb: VelocityBench): - super().__init__("bitcracker", "bitcracker", vb) + super().__init__("bitcracker", "bitcracker", vb, "s") self.data_path = os.path.join(vb.repo_path, "bitcracker", "hash_pass") def name(self): return "Velocity-Bench Bitcracker" - def unit(self): - return "s" - def bin_args(self) -> list[str]: return ["-f", f"{self.data_path}/img_win8_user_hash.txt", "-d", f"{self.data_path}/user_passwords_60000.txt", @@ -137,18 +146,14 @@ def parse_output(self, stdout: str) -> float: class SobelFilter(VelocityBase): def __init__(self, vb: VelocityBench): - super().__init__("sobel_filter", "sobel_filter", vb) + super().__init__("sobel_filter", "sobel_filter", vb, "ms") def download_deps(self): self.download("sobel_filter", "https://github.com/oneapi-src/Velocity-Bench/raw/main/sobel_filter/res/sobel_filter_data.tgz?download=", "sobel_filter_data.tgz", untar=True) - return def name(self): return "Velocity-Bench Sobel Filter" - def unit(self): - return "ms" - def bin_args(self) -> list[str]: return ["-i", f"{self.data_path}/sobel_filter_data/silverfalls_32Kx32K.png", "-n", "5"] @@ -166,7 +171,7 @@ def parse_output(self, stdout: str) -> float: class QuickSilver(VelocityBase): def __init__(self, vb: VelocityBench): - super().__init__("QuickSilver", "qs", vb) + super().__init__("QuickSilver", "qs", vb, "MMS/CTT") self.data_path = os.path.join(vb.repo_path, "QuickSilver", "Examples", "AllScattering") def run(self, env_vars) -> list[Result]: @@ -179,9 +184,6 @@ def run(self, env_vars) -> list[Result]: def name(self): return "Velocity-Bench QuickSilver" - def unit(self): - return "MMS/CTT" - def lower_is_better(self): return False @@ -200,7 +202,7 @@ def parse_output(self, stdout: str) -> float: class Easywave(VelocityBase): def __init__(self, vb: VelocityBench): - super().__init__("easywave", "easyWave_sycl", vb) + super().__init__("easywave", "easyWave_sycl", vb, "ms") def download_deps(self): self.download("easywave", "https://git.gfz-potsdam.de/id2/geoperil/easyWave/-/raw/master/data/examples.tar.gz", "examples.tar.gz", untar=True) @@ -208,9 +210,6 @@ def download_deps(self): def name(self): return "Velocity-Bench Easywave" - def unit(self): - return "ms" - def bin_args(self) -> list[str]: return ["-grid", f"{self.data_path}/examples/e2Asean.grd", "-source", f"{self.data_path}/examples/BengkuluSept2007.flt", @@ -242,10 +241,9 @@ def get_last_elapsed_time(self, log_file_path) -> float: def parse_output(self, stdout: str) -> float: return self.get_last_elapsed_time(os.path.join(options.benchmark_cwd, "easywave.log")) - class CudaSift(VelocityBase): def __init__(self, vb: VelocityBench): - super().__init__("cudaSift", "cudaSift", vb) + super().__init__("cudaSift", "cudaSift", vb, "ms") def download_deps(self): images = os.path.join(self.vb.repo_path, self.bench_name, 'inputData') @@ -256,12 +254,109 @@ def download_deps(self): def name(self): return "Velocity-Bench CudaSift" - def unit(self): - return "ms" - def parse_output(self, stdout: str) -> float: match = re.search(r'Avg workload time = (\d+\.\d+) ms', stdout) if match: return float(match.group(1)) else: raise ValueError("Failed to parse benchmark output.") + +class DLCifar(VelocityBase): + def __init__(self, vb: VelocityBench): + self.oneapi = get_oneapi() + super().__init__("dl-cifar", "dl-cifar_sycl", vb, "s") + + def ld_libraries(self): + return self.oneapi.ld_libraries() + + def download_deps(self): + # TODO: dl-cifar hardcodes the path to this dataset as "../../datasets/cifar-10-binary"... + self.download("datasets", "https://www.cs.toronto.edu/~kriz/cifar-10-binary.tar.gz", "cifar-10-binary.tar.gz", untar=True, skip_data_dir=True) + return + + def extra_cmake_args(self): + return [ + f"-DCMAKE_CXX_FLAGS=-O3 -fsycl -ffast-math -I{self.oneapi.dnn_include()} -I{self.oneapi.mkl_include()} -L{self.oneapi.dnn_lib()} -L{self.oneapi.mkl_lib()}" + ] + + def name(self): + return "Velocity-Bench dl-cifar" + + def parse_output(self, stdout: str) -> float: + match = re.search(r'dl-cifar - total time for whole calculation: (\d+\.\d+) s', stdout) + if match: + return float(match.group(1)) + else: + raise ValueError("Failed to parse benchmark output.") + +class DLMnist(VelocityBase): + def __init__(self, vb: VelocityBench): + self.oneapi = get_oneapi() + super().__init__("dl-mnist", "dl-mnist-sycl", vb, "s") + + def ld_libraries(self): + return self.oneapi.ld_libraries() + + def download_deps(self): + # TODO: dl-mnist hardcodes the path to this dataset as "../../datasets/"... + self.download("datasets", "https://raw.githubusercontent.com/fgnt/mnist/master/train-images-idx3-ubyte.gz", "train-images.idx3-ubyte.gz", unzip=True, skip_data_dir=True) + self.download("datasets", "https://raw.githubusercontent.com/fgnt/mnist/master/train-labels-idx1-ubyte.gz", "train-labels.idx1-ubyte.gz", unzip=True, skip_data_dir=True) + self.download("datasets", "https://raw.githubusercontent.com/fgnt/mnist/master/t10k-images-idx3-ubyte.gz", "t10k-images.idx3-ubyte.gz", unzip=True, skip_data_dir=True) + self.download("datasets", "https://raw.githubusercontent.com/fgnt/mnist/master/t10k-labels-idx1-ubyte.gz", "t10k-labels.idx1-ubyte.gz", unzip=True, skip_data_dir=True) + + def extra_cmake_args(self): + return [ + f"-DCMAKE_CXX_FLAGS=-O3 -fsycl -ffast-math -I{self.oneapi.dnn_include()} -I{self.oneapi.mkl_include()} -L{self.oneapi.dnn_lib()} -L{self.oneapi.mkl_lib()}" + ] + + def name(self): + return "Velocity-Bench dl-mnist" + + def bin_args(self): + return [ + "-conv_algo", "ONEDNN_AUTO" + ] + + # TODO: This shouldn't be required. + # The application crashes with a segfault without it. + def extra_env_vars(self): + return { + "NEOReadDebugKeys":"1", + "DisableScratchPages":"0", + } + + def parse_output(self, stdout: str) -> float: + match = re.search(r'dl-mnist - total time for whole calculation: (\d+\.\d+) s', stdout) + if match: + return float(match.group(1)) + else: + raise ValueError("Failed to parse benchmark output.") + +class SVM(VelocityBase): + def __init__(self, vb: VelocityBench): + self.oneapi = get_oneapi() + super().__init__("svm", "svm_sycl", vb, "s") + + def ld_libraries(self): + return self.oneapi.ld_libraries() + + def extra_cmake_args(self): + return [ + f"-DCMAKE_CXX_FLAGS=-O3 -fsycl -ffast-math -I{self.oneapi.dnn_include()} -I{self.oneapi.mkl_include()} -L{self.oneapi.dnn_lib()} -L{self.oneapi.mkl_lib()}" + ] + + def name(self): + return "Velocity-Bench svm" + + def bin_args(self): + return [ + f"{self.code_path}/a9a", + f"{self.code_path}/a.m", + ] + + def parse_output(self, stdout: str) -> float: + match = re.search(r'Total elapsed time : (\d+\.\d+) s', stdout) + if match: + return float(match.group(1)) + else: + raise ValueError("Failed to parse benchmark output.") diff --git a/scripts/benchmarks/main.py b/scripts/benchmarks/main.py index 9dd77f14b2..bca0f01553 100755 --- a/scripts/benchmarks/main.py +++ b/scripts/benchmarks/main.py @@ -73,7 +73,7 @@ def main(directory, additional_env_vars, save_name, compare_names, filter): if bench_results is not None: for bench_result in bench_results: if bench_result.passed: - print(f"complete ({bench_result.label}: {bench_result.value:.3f} {benchmark.unit()}).") + print(f"complete ({bench_result.label}: {bench_result.value:.3f} {bench_result.unit}).") else: print(f"complete ({bench_result.label}: verification FAILED)") iteration_results.append(bench_result) @@ -91,7 +91,6 @@ def main(directory, additional_env_vars, save_name, compare_names, filter): median_index = len(label_results) // 2 median_result = label_results[median_index] - median_result.unit = benchmark.unit() median_result.name = label median_result.lower_is_better = benchmark.lower_is_better() @@ -180,6 +179,7 @@ def validate_and_parse_env_args(env_args): args = parser.parse_args() additional_env_vars = validate_and_parse_env_args(args.env) + options.workdir = args.benchmark_directory options.verbose = args.verbose options.rebuild = not args.no_rebuild options.sycl = args.sycl diff --git a/scripts/benchmarks/utils/utils.py b/scripts/benchmarks/utils/utils.py index d077184e5c..0bb954fab2 100644 --- a/scripts/benchmarks/utils/utils.py +++ b/scripts/benchmarks/utils/utils.py @@ -3,6 +3,7 @@ # See LICENSE.TXT # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +import gzip import os import shutil import subprocess @@ -58,7 +59,7 @@ def git_clone(dir, name, repo, commit): return repo_path def prepare_bench_cwd(dir): - # we need 2 deep to workaround a problem with a fixed relative path in cudaSift + # we need 2 deep to workaround a problem with a fixed relative paths in some velocity benchmarks options.benchmark_cwd = os.path.join(dir, 'bcwd', 'bcwd') if os.path.exists(options.benchmark_cwd): shutil.rmtree(options.benchmark_cwd) @@ -97,7 +98,7 @@ def create_build_path(directory, name): return build_path -def download(dir, url, file, untar = False): +def download(dir, url, file, untar = False, unzip = False): data_file = os.path.join(dir, file) if not Path(data_file).exists(): print(f"{data_file} does not exist, downloading") @@ -106,6 +107,10 @@ def download(dir, url, file, untar = False): file = tarfile.open(data_file) file.extractall(dir) file.close() + if unzip: + [stripped_gz, _] = os.path.splitext(data_file) + with gzip.open(data_file, 'rb') as f_in, open(stripped_gz, 'wb') as f_out: + shutil.copyfileobj(f_in, f_out) else: print(f"{data_file} exists, skipping...") return data_file diff --git a/scripts/core/CUDA.rst b/scripts/core/CUDA.rst index 9771693113..08b61bf9dc 100644 --- a/scripts/core/CUDA.rst +++ b/scripts/core/CUDA.rst @@ -148,6 +148,39 @@ take the extra global offset argument. Use of the global offset is not recommended for non SYCL compiler toolchains. This parameter can be ignored if the user does not wish to use the global offset. +Local Memory Arguments +---------------------- + +In UR local memory is a region of memory shared by all the work-items in +a work-group. A kernel function signature can include local memory address +space pointer arguments, which are set by the user with +``urKernelSetArgLocal`` with the number of bytes of local memory to allocate +and make available from the pointer argument. + +The CUDA adapter implements local memory in a kernel as a single ``__shared__`` +memory allocation, and each individual local memory argument is a ``u32`` byte +offset kernel parameter which is combined inside the kernel with the +``__shared__`` memory allocation. Therefore for ``N`` local arguments that need +set on a kernel with ``urKernelSetArgLocal``, the total aligned size across the +``N`` calls to ``urKernelSetArgLocal`` is calculated for the ``__shared__`` +memory allocation by the CUDA adapter and passed as the ``sharedMemBytes`` +argument to ``cuLaunchKernel`` (or variants like ``cuLaunchCooperativeKernel`` +or ``cuGraphAddKernelNode``). + +For each kernel ``u32`` local memory offset parameter, aligned offsets into the +single memory location are calculated and passed at runtime by the adapter via +``kernelParams`` when launching the kernel (or adding the kernel as a graph +node). When a user calls ``urKernelSetArgLocal`` with an argument index that +has already been set on the kernel, the adapter recalculates the size of the +``__shared__`` memory allocation and offset for the index, as well as the +offsets of any local memory arguments at following indices. + +.. warning:: + + The CUDA UR adapter implementation of local memory assumes the kernel created + has been created by DPC++, instrumenting the device code so that local memory + arguments are offsets rather than pointers. + Other Notes =========== @@ -164,4 +197,5 @@ Contributors ------------ * Hugh Delaney `hugh.delaney@codeplay.com `_ +* Ewan Crawford `ewan@codeplay.com `_ diff --git a/scripts/core/EXP-2D-BLOCK-ARRAY-CAPABILITIES.rst b/scripts/core/EXP-2D-BLOCK-ARRAY-CAPABILITIES.rst new file mode 100644 index 0000000000..765b07cfb3 --- /dev/null +++ b/scripts/core/EXP-2D-BLOCK-ARRAY-CAPABILITIES.rst @@ -0,0 +1,62 @@ +<% + OneApi=tags['$OneApi'] + x=tags['$x'] + X=x.upper() +%> + +.. _experimental-2D-block-array-capabilities: + +================================================================================ +2D Block Array Capabilities +================================================================================ + +.. warning:: + + Experimental features: + + * May be replaced, updated, or removed at any time. + * Do not require maintaining API/ABI stability of their own additions over + time. + * Do not require conformance testing of their own additions. + + +Motivation +-------------------------------------------------------------------------------- +Some Intel GPU devices support 2D block array operations which may be used to optimize applications on Intel GPUs. +This extension provides a device descriptor which allows to query the 2D block array capabilities of a device. + +API +-------------------------------------------------------------------------------- + +Enums +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +* ${x}_device_info_t + * ${X}_DEVICE_INFO_2D_BLOCK_ARRAY_CAPABILITIES_EXP + +* ${x}_exp_device_2d_block_array_capability_flags_t + * ${X}_EXP_DEVICE_2D_BLOCK_ARRAY_CAPABILITY_FLAG_LOAD + * ${X}_EXP_DEVICE_2D_BLOCK_ARRAY_CAPABILITY_FLAG_STORE + +Changelog +-------------------------------------------------------------------------------- + ++-----------+------------------------+ +| Revision | Changes | ++===========+========================+ +| 1.0 | Initial Draft | ++-----------+------------------------+ + + +Support +-------------------------------------------------------------------------------- + +Adapters which support this experimental feature *must* return ${X}_RESULT_SUCCESS from +the ${x}DeviceGetInfo call with the new ${X}_DEVICE_INFO_2D_BLOCK_ARRAY_CAPABILITIES_EXP +device descriptor. + + +Contributors +-------------------------------------------------------------------------------- + +* Artur Gainullin `artur.gainullin@intel.com `_ diff --git a/scripts/core/EXP-COMMAND-BUFFER.rst b/scripts/core/EXP-COMMAND-BUFFER.rst index d6ef76c7bc..2313afd036 100644 --- a/scripts/core/EXP-COMMAND-BUFFER.rst +++ b/scripts/core/EXP-COMMAND-BUFFER.rst @@ -256,6 +256,11 @@ ${x}CommandBufferAppendKernelLaunchExp. The command can then be updated to use the new kernel handle by passing it to ${x}CommandBufferUpdateKernelLaunchExp. +.. important:: + When updating the kernel handle of a command all required arguments to the + new kernel must be provided in the update descriptor. Failure to do so will + result in undefined behavior. + .. parsed-literal:: // Create a command-buffer with update enabled. diff --git a/scripts/core/HIP.rst b/scripts/core/HIP.rst index 3ded0138ff..920a5f5a3e 100644 --- a/scripts/core/HIP.rst +++ b/scripts/core/HIP.rst @@ -91,6 +91,46 @@ take the extra global offset argument. Use of the global offset is not recommended for non SYCL compiler toolchains. This parameter can be ignored if the user does not wish to use the global offset. +Local Memory Arguments +---------------------- + +In UR local memory is a region of memory shared by all the work-items in +a work-group. A kernel function signature can include local memory address +space pointer arguments, which are set by the user with +``urKernelSetArgLocal`` with the number of bytes of local memory to allocate +and make available from the pointer argument. + +The HIP adapter implements local memory in a kernel as a single ``__shared__`` +memory allocation, and each individual local memory argument is a ``u32`` byte +offset kernel parameter which is combined inside the kernel with the +``__shared__`` memory allocation. Therefore for ``N`` local arguments that need +set on a kernel with ``urKernelSetArgLocal``, the total aligned size across the +``N`` calls to ``urKernelSetArgLocal`` is calculated for the ``__shared__`` +memory allocation by the HIP adapter and passed as the ``sharedMemBytes`` +argument to ``hipModuleLaunchKernel`` or ``hipGraphAddKernelNode``. + +For each kernel ``u32`` local memory offset parameter, aligned offsets into the +single memory location are calculated and passed at runtime by the adapter via +``kernelParams`` when launching the kernel (or adding the kernel as a graph +node). When a user calls ``urKernelSetArgLocal`` with an argument index that +has already been set on the kernel, the adapter recalculates the size of the +``__shared__`` memory allocation and offset for the index, as well as the +offsets of any local memory arguments at following indices. + +.. warning:: + + The HIP UR adapter implementation of local memory assumes the kernel created + has been created by DPC++, instrumenting the device code so that local memory + arguments are offsets rather than pointers. + + +HIP kernels that are generated for DPC++ kernels with SYCL local accessors +contain extra value arguments on top of the local memory argument for the +local accessor. For each ``urKernelSetArgLocal`` argument, a user needs +to make 3 calls to ``urKernelSetArgValue`` with each of the next 3 consecutive +argument indexes. This represents a 3 dimensional offset into the local +accessor. + Other Notes =========== @@ -100,4 +140,5 @@ Contributors ------------ * Hugh Delaney `hugh.delaney@codeplay.com `_ +* Ewan Crawford `ewan@codeplay.com `_ diff --git a/scripts/core/LEVEL_ZERO.rst b/scripts/core/LEVEL_ZERO.rst index caffa388a2..e5e33e2a5b 100644 --- a/scripts/core/LEVEL_ZERO.rst +++ b/scripts/core/LEVEL_ZERO.rst @@ -28,7 +28,7 @@ Environment Variables | UR_L0_USE_RELAXED_ALLOCATION_LIMITS | Controls the use of relaxed allocation limits. | "0": Relaxed allocation limits are not used. | "0" | | | | "1": Relaxed allocation limits are used. | | +---------------------------------------------+--------------------------------------------------------------+--------------------------------------------------------------+------------------+ -| UR_L0_USE_DRIVER_IN_ORDER_LISTS | Controls the use of in-order lists from the driver. | "0": In-order lists from the driver are not used. | "0" | +| UR_L0_USE_DRIVER_INORDER_LISTS | Controls the use of in-order lists from the driver. | "0": In-order lists from the driver are not used. | "0" | | | | "1": In-order lists from the driver are used. | | +---------------------------------------------+--------------------------------------------------------------+--------------------------------------------------------------+------------------+ | UR_L0_USM_ALLOCATOR_TRACE | Enables tracing for the USM allocator. | "0": Tracing is disabled. | "0" | diff --git a/scripts/core/adapter.yml b/scripts/core/adapter.yml index a4eddd823c..4fc9a104ed 100644 --- a/scripts/core/adapter.yml +++ b/scripts/core/adapter.yml @@ -136,6 +136,11 @@ etors: [uint32_t] Reference count of the adapter. The reference count returned should be considered immediately stale. It is unsuitable for general use in applications. This feature is provided for identifying memory leaks. + - name: VERSION + desc: > + [uint32_t] Specifies the adapter version, initial value of 1 and + incremented unpon major changes, e.g. when multiple versions of an + adapter may exist in parallel. --- #-------------------------------------------------------------------------- type: function desc: "Retrieves information about the adapter" diff --git a/scripts/core/exp-2d-block-array-capabilities.yml b/scripts/core/exp-2d-block-array-capabilities.yml new file mode 100644 index 0000000000..ec62c4bb96 --- /dev/null +++ b/scripts/core/exp-2d-block-array-capabilities.yml @@ -0,0 +1,36 @@ +# +# Copyright (C) 2024 Intel Corporation +# +# Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. +# See LICENSE.TXT +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# See YaML.md for syntax definition +# +--- #-------------------------------------------------------------------------- +type: header +desc: "Intel $OneApi Unified Runtime Experimental device descriptor for querying Intel device 2D block array capabilities" +ordinal: "99" +--- #-------------------------------------------------------------------------- +type: enum +extend: true +typed_etors: true +desc: "Extension enum to $x_device_info_t to query Intel device 2D block array capabilities." +name: $x_device_info_t +etors: + - name: 2D_BLOCK_ARRAY_CAPABILITIES_EXP + value: "0x2022" + desc: "[$x_exp_device_2d_block_array_capability_flags_t] return a bit-field of Intel GPU 2D block array capabilities" +--- #-------------------------------------------------------------------------- +type: enum +desc: "Intel GPU 2D block array capabilities" +class: $xDevice +name: $x_exp_device_2d_block_array_capability_flags_t +etors: + - name: LOAD + desc: "Load instructions are supported" + value: "$X_BIT(0)" + - name: STORE + desc: "Store instructions are supported" + value: "$X_BIT(1)" + diff --git a/source/adapters/cuda/adapter.cpp b/source/adapters/cuda/adapter.cpp index c8949cd9a8..49bb964f8e 100644 --- a/source/adapters/cuda/adapter.cpp +++ b/source/adapters/cuda/adapter.cpp @@ -108,6 +108,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urAdapterGetInfo(ur_adapter_handle_t, return ReturnValue(UR_ADAPTER_BACKEND_CUDA); case UR_ADAPTER_INFO_REFERENCE_COUNT: return ReturnValue(adapter.RefCount.load()); + case UR_ADAPTER_INFO_VERSION: + return ReturnValue(uint32_t{1}); default: return UR_RESULT_ERROR_INVALID_ENUMERATION; } diff --git a/source/adapters/cuda/command_buffer.cpp b/source/adapters/cuda/command_buffer.cpp index 527c339783..4b4b2cffe5 100644 --- a/source/adapters/cuda/command_buffer.cpp +++ b/source/adapters/cuda/command_buffer.cpp @@ -522,9 +522,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp( DepsList.data(), DepsList.size(), &NodeParams)); - if (LocalSize != 0) - hKernel->clearLocalSize(); - // Add signal node if external return event is used. CUgraphNode SignalNode = nullptr; if (phEvent) { diff --git a/source/adapters/cuda/device.cpp b/source/adapters/cuda/device.cpp index cb6b757dd3..d8916ccedd 100644 --- a/source/adapters/cuda/device.cpp +++ b/source/adapters/cuda/device.cpp @@ -1088,7 +1088,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, case UR_DEVICE_INFO_GPU_EU_COUNT_PER_SUBSLICE: case UR_DEVICE_INFO_GPU_HW_THREADS_PER_EU: return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION; - + case UR_DEVICE_INFO_2D_BLOCK_ARRAY_CAPABILITIES_EXP: + return ReturnValue( + static_cast(0)); case UR_DEVICE_INFO_COMMAND_BUFFER_SUPPORT_EXP: case UR_DEVICE_INFO_COMMAND_BUFFER_EVENT_SUPPORT_EXP: return ReturnValue(true); diff --git a/source/adapters/cuda/enqueue.cpp b/source/adapters/cuda/enqueue.cpp index fc3d0220e8..54a0f778fb 100644 --- a/source/adapters/cuda/enqueue.cpp +++ b/source/adapters/cuda/enqueue.cpp @@ -493,9 +493,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( ThreadsPerBlock[0], ThreadsPerBlock[1], ThreadsPerBlock[2], LocalSize, CuStream, const_cast(ArgIndices.data()), nullptr)); - if (LocalSize != 0) - hKernel->clearLocalSize(); - if (phEvent) { UR_CHECK_ERROR(RetImplEvent->record()); *phEvent = RetImplEvent.release(); @@ -673,9 +670,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp( const_cast(ArgIndices.data()), nullptr)); - if (LocalSize != 0) - hKernel->clearLocalSize(); - if (phEvent) { UR_CHECK_ERROR(RetImplEvent->record()); *phEvent = RetImplEvent.release(); diff --git a/source/adapters/cuda/kernel.hpp b/source/adapters/cuda/kernel.hpp index 7ad20a4f0e..2b04dfba43 100644 --- a/source/adapters/cuda/kernel.hpp +++ b/source/adapters/cuda/kernel.hpp @@ -61,10 +61,22 @@ struct ur_kernel_handle_t_ { using args_t = std::array; using args_size_t = std::vector; using args_index_t = std::vector; + /// Storage shared by all args which is mem copied into when adding a new + /// argument. args_t Storage; + /// Aligned size of each parameter, including padding. args_size_t ParamSizes; + /// Byte offset into /p Storage allocation for each parameter. args_index_t Indices; - args_size_t OffsetPerIndex; + /// Aligned size in bytes for each local memory parameter after padding has + /// been added. Zero if the argument at the index isn't a local memory + /// argument. + args_size_t AlignedLocalMemSize; + /// Original size in bytes for each local memory parameter, prior to being + /// padded to appropriate alignment. Zero if the argument at the index + /// isn't a local memory argument. + args_size_t OriginalLocalMemSize; + // A struct to keep track of memargs so that we can do dependency analysis // at urEnqueueKernelLaunch struct mem_obj_arg { @@ -93,7 +105,8 @@ struct ur_kernel_handle_t_ { Indices.resize(Index + 2, Indices.back()); // Ensure enough space for the new argument ParamSizes.resize(Index + 1); - OffsetPerIndex.resize(Index + 1); + AlignedLocalMemSize.resize(Index + 1); + OriginalLocalMemSize.resize(Index + 1); } ParamSizes[Index] = Size; // calculate the insertion point on the array @@ -102,28 +115,81 @@ struct ur_kernel_handle_t_ { // Update the stored value for the argument std::memcpy(&Storage[InsertPos], Arg, Size); Indices[Index] = &Storage[InsertPos]; - OffsetPerIndex[Index] = LocalSize; + AlignedLocalMemSize[Index] = LocalSize; } - void addLocalArg(size_t Index, size_t Size) { - size_t LocalOffset = this->getLocalSize(); + /// Returns the padded size and offset of a local memory argument. + /// Local memory arguments need to be padded if the alignment for the size + /// doesn't match the current offset into the kernel local data. + /// @param Index Kernel arg index. + /// @param Size User passed size of local parameter. + /// @return Tuple of (Aligned size, Aligned offset into local data). + std::pair calcAlignedLocalArgument(size_t Index, + size_t Size) { + // Store the unpadded size of the local argument + if (Index + 2 > Indices.size()) { + AlignedLocalMemSize.resize(Index + 1); + OriginalLocalMemSize.resize(Index + 1); + } + OriginalLocalMemSize[Index] = Size; + + // Calculate the current starting offset into local data + const size_t LocalOffset = std::accumulate( + std::begin(AlignedLocalMemSize), + std::next(std::begin(AlignedLocalMemSize), Index), size_t{0}); - // maximum required alignment is the size of the largest vector type + // Maximum required alignment is the size of the largest vector type const size_t MaxAlignment = sizeof(double) * 16; - // for arguments smaller than the maximum alignment simply align to the + // For arguments smaller than the maximum alignment simply align to the // size of the argument const size_t Alignment = std::min(MaxAlignment, Size); - // align the argument + // Align the argument size_t AlignedLocalOffset = LocalOffset; - size_t Pad = LocalOffset % Alignment; + const size_t Pad = LocalOffset % Alignment; if (Pad != 0) { AlignedLocalOffset += Alignment - Pad; } + const size_t AlignedLocalSize = Size + (AlignedLocalOffset - LocalOffset); + return std::make_pair(AlignedLocalSize, AlignedLocalOffset); + } + + void addLocalArg(size_t Index, size_t Size) { + // Get the aligned argument size and offset into local data + auto [AlignedLocalSize, AlignedLocalOffset] = + calcAlignedLocalArgument(Index, Size); + + // Store argument details addArg(Index, sizeof(size_t), (const void *)&(AlignedLocalOffset), - Size + (AlignedLocalOffset - LocalOffset)); + AlignedLocalSize); + + // For every existing local argument which follows at later argument + // indices, update the offset and pointer into the kernel local memory. + // Required as padding will need to be recalculated. + const size_t NumArgs = Indices.size() - 1; // Accounts for implicit arg + for (auto SuccIndex = Index + 1; SuccIndex < NumArgs; SuccIndex++) { + const size_t OriginalLocalSize = OriginalLocalMemSize[SuccIndex]; + if (OriginalLocalSize == 0) { + // Skip if successor argument isn't a local memory arg + continue; + } + + // Recalculate alignment + auto [SuccAlignedLocalSize, SuccAlignedLocalOffset] = + calcAlignedLocalArgument(SuccIndex, OriginalLocalSize); + + // Store new local memory size + AlignedLocalMemSize[SuccIndex] = SuccAlignedLocalSize; + + // Store new offset into local data + const size_t InsertPos = + std::accumulate(std::begin(ParamSizes), + std::begin(ParamSizes) + SuccIndex, size_t{0}); + std::memcpy(&Storage[InsertPos], &SuccAlignedLocalOffset, + sizeof(size_t)); + } } void addMemObjArg(int Index, ur_mem_handle_t hMem, ur_mem_flags_t Flags) { @@ -145,15 +211,11 @@ struct ur_kernel_handle_t_ { std::memcpy(ImplicitOffsetArgs, ImplicitOffset, Size); } - void clearLocalSize() { - std::fill(std::begin(OffsetPerIndex), std::end(OffsetPerIndex), 0); - } - const args_index_t &getIndices() const noexcept { return Indices; } uint32_t getLocalSize() const { - return std::accumulate(std::begin(OffsetPerIndex), - std::end(OffsetPerIndex), 0); + return std::accumulate(std::begin(AlignedLocalMemSize), + std::end(AlignedLocalMemSize), 0); } } Args; @@ -240,7 +302,5 @@ struct ur_kernel_handle_t_ { uint32_t getLocalSize() const noexcept { return Args.getLocalSize(); } - void clearLocalSize() { Args.clearLocalSize(); } - size_t getRegsPerThread() const noexcept { return RegsPerThread; }; }; diff --git a/source/adapters/cuda/ur_interface_loader.cpp b/source/adapters/cuda/ur_interface_loader.cpp index a9559eb188..4b13e6669c 100644 --- a/source/adapters/cuda/ur_interface_loader.cpp +++ b/source/adapters/cuda/ur_interface_loader.cpp @@ -174,6 +174,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueProcAddrTable( pDdiTable->pfnDeviceGlobalVariableWrite = urEnqueueDeviceGlobalVariableWrite; pDdiTable->pfnEventsWait = urEnqueueEventsWait; pDdiTable->pfnEventsWaitWithBarrier = urEnqueueEventsWaitWithBarrier; + pDdiTable->pfnEventsWaitWithBarrierExt = urEnqueueEventsWaitWithBarrierExt; pDdiTable->pfnKernelLaunch = urEnqueueKernelLaunch; pDdiTable->pfnMemBufferCopy = urEnqueueMemBufferCopy; pDdiTable->pfnMemBufferCopyRect = urEnqueueMemBufferCopyRect; diff --git a/source/adapters/hip/adapter.cpp b/source/adapters/hip/adapter.cpp index 99db21695f..1bfe498bf6 100644 --- a/source/adapters/hip/adapter.cpp +++ b/source/adapters/hip/adapter.cpp @@ -96,6 +96,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urAdapterGetInfo(ur_adapter_handle_t, return ReturnValue(UR_ADAPTER_BACKEND_HIP); case UR_ADAPTER_INFO_REFERENCE_COUNT: return ReturnValue(adapter.RefCount.load()); + case UR_ADAPTER_INFO_VERSION: + return ReturnValue(uint32_t{1}); default: return UR_RESULT_ERROR_INVALID_ENUMERATION; } diff --git a/source/adapters/hip/command_buffer.cpp b/source/adapters/hip/command_buffer.cpp index 9fed5db2f8..538c2ff85a 100644 --- a/source/adapters/hip/command_buffer.cpp +++ b/source/adapters/hip/command_buffer.cpp @@ -396,9 +396,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp( DepsList.data(), DepsList.size(), &NodeParams)); - if (LocalSize != 0) - hKernel->clearLocalSize(); - // Get sync point and register the node with it. auto SyncPoint = hCommandBuffer->addSyncPoint(GraphNode); if (pSyncPoint) { diff --git a/source/adapters/hip/device.cpp b/source/adapters/hip/device.cpp index 5271f73709..eed6a1c7c5 100644 --- a/source/adapters/hip/device.cpp +++ b/source/adapters/hip/device.cpp @@ -905,6 +905,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, case UR_DEVICE_INFO_IL_VERSION: case UR_DEVICE_INFO_ASYNC_BARRIER: return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION; + case UR_DEVICE_INFO_2D_BLOCK_ARRAY_CAPABILITIES_EXP: + return ReturnValue( + static_cast(0)); case UR_DEVICE_INFO_COMMAND_BUFFER_SUPPORT_EXP: { int DriverVersion = 0; UR_CHECK_ERROR(hipDriverGetVersion(&DriverVersion)); diff --git a/source/adapters/hip/enqueue.cpp b/source/adapters/hip/enqueue.cpp index 025a3f41f4..b9aa097848 100644 --- a/source/adapters/hip/enqueue.cpp +++ b/source/adapters/hip/enqueue.cpp @@ -324,8 +324,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( ThreadsPerBlock[0], ThreadsPerBlock[1], ThreadsPerBlock[2], hKernel->getLocalSize(), HIPStream, ArgIndices.data(), nullptr)); - hKernel->clearLocalSize(); - if (phEvent) { UR_CHECK_ERROR(RetImplEvent->record()); *phEvent = RetImplEvent.release(); diff --git a/source/adapters/hip/kernel.hpp b/source/adapters/hip/kernel.hpp index afea69832b..c6d30e81ad 100644 --- a/source/adapters/hip/kernel.hpp +++ b/source/adapters/hip/kernel.hpp @@ -56,10 +56,22 @@ struct ur_kernel_handle_t_ { using args_t = std::array; using args_size_t = std::vector; using args_index_t = std::vector; + /// Storage shared by all args which is mem copied into when adding a new + /// argument. args_t Storage; + /// Aligned size of each parameter, including padding. args_size_t ParamSizes; + /// Byte offset into /p Storage allocation for each parameter. args_index_t Indices; - args_size_t OffsetPerIndex; + /// Aligned size in bytes for each local memory parameter after padding has + /// been added. Zero if the argument at the index isn't a local memory + /// argument. + args_size_t AlignedLocalMemSize; + /// Original size in bytes for each local memory parameter, prior to being + /// padded to appropriate alignment. Zero if the argument at the index + /// isn't a local memory argument. + args_size_t OriginalLocalMemSize; + // A struct to keep track of memargs so that we can do dependency analysis // at urEnqueueKernelLaunch struct mem_obj_arg { @@ -88,7 +100,8 @@ struct ur_kernel_handle_t_ { Indices.resize(Index + 2, Indices.back()); // Ensure enough space for the new argument ParamSizes.resize(Index + 1); - OffsetPerIndex.resize(Index + 1); + AlignedLocalMemSize.resize(Index + 1); + OriginalLocalMemSize.resize(Index + 1); } ParamSizes[Index] = Size; // calculate the insertion point on the array @@ -97,28 +110,81 @@ struct ur_kernel_handle_t_ { // Update the stored value for the argument std::memcpy(&Storage[InsertPos], Arg, Size); Indices[Index] = &Storage[InsertPos]; - OffsetPerIndex[Index] = LocalSize; + AlignedLocalMemSize[Index] = LocalSize; } - void addLocalArg(size_t Index, size_t Size) { - size_t LocalOffset = this->getLocalSize(); + /// Returns the padded size and offset of a local memory argument. + /// Local memory arguments need to be padded if the alignment for the size + /// doesn't match the current offset into the kernel local data. + /// @param Index Kernel arg index. + /// @param Size User passed size of local parameter. + /// @return Tuple of (Aligned size, Aligned offset into local data). + std::pair calcAlignedLocalArgument(size_t Index, + size_t Size) { + // Store the unpadded size of the local argument + if (Index + 2 > Indices.size()) { + AlignedLocalMemSize.resize(Index + 1); + OriginalLocalMemSize.resize(Index + 1); + } + OriginalLocalMemSize[Index] = Size; - // maximum required alignment is the size of the largest vector type + // Calculate the current starting offset into local data + const size_t LocalOffset = std::accumulate( + std::begin(AlignedLocalMemSize), + std::next(std::begin(AlignedLocalMemSize), Index), size_t{0}); + + // Maximum required alignment is the size of the largest vector type const size_t MaxAlignment = sizeof(double) * 16; - // for arguments smaller than the maximum alignment simply align to the + // For arguments smaller than the maximum alignment simply align to the // size of the argument const size_t Alignment = std::min(MaxAlignment, Size); - // align the argument + // Align the argument size_t AlignedLocalOffset = LocalOffset; - size_t Pad = LocalOffset % Alignment; + const size_t Pad = LocalOffset % Alignment; if (Pad != 0) { AlignedLocalOffset += Alignment - Pad; } - addArg(Index, sizeof(size_t), (const void *)&AlignedLocalOffset, - Size + AlignedLocalOffset - LocalOffset); + const size_t AlignedLocalSize = Size + (AlignedLocalOffset - LocalOffset); + return std::make_pair(AlignedLocalSize, AlignedLocalOffset); + } + + void addLocalArg(size_t Index, size_t Size) { + // Get the aligned argument size and offset into local data + auto [AlignedLocalSize, AlignedLocalOffset] = + calcAlignedLocalArgument(Index, Size); + + // Store argument details + addArg(Index, sizeof(size_t), (const void *)&(AlignedLocalOffset), + AlignedLocalSize); + + // For every existing local argument which follows at later argument + // indices, update the offset and pointer into the kernel local memory. + // Required as padding will need to be recalculated. + const size_t NumArgs = Indices.size() - 1; // Accounts for implicit arg + for (auto SuccIndex = Index + 1; SuccIndex < NumArgs; SuccIndex++) { + const size_t OriginalLocalSize = OriginalLocalMemSize[SuccIndex]; + if (OriginalLocalSize == 0) { + // Skip if successor argument isn't a local memory arg + continue; + } + + // Recalculate alignment + auto [SuccAlignedLocalSize, SuccAlignedLocalOffset] = + calcAlignedLocalArgument(SuccIndex, OriginalLocalSize); + + // Store new local memory size + AlignedLocalMemSize[SuccIndex] = SuccAlignedLocalSize; + + // Store new offset into local data + const size_t InsertPos = + std::accumulate(std::begin(ParamSizes), + std::begin(ParamSizes) + SuccIndex, size_t{0}); + std::memcpy(&Storage[InsertPos], &SuccAlignedLocalOffset, + sizeof(size_t)); + } } void addMemObjArg(int Index, ur_mem_handle_t hMem, ur_mem_flags_t Flags) { @@ -140,15 +206,11 @@ struct ur_kernel_handle_t_ { std::memcpy(ImplicitOffsetArgs, ImplicitOffset, Size); } - void clearLocalSize() { - std::fill(std::begin(OffsetPerIndex), std::end(OffsetPerIndex), 0); - } - const args_index_t &getIndices() const noexcept { return Indices; } uint32_t getLocalSize() const { - return std::accumulate(std::begin(OffsetPerIndex), - std::end(OffsetPerIndex), 0); + return std::accumulate(std::begin(AlignedLocalMemSize), + std::end(AlignedLocalMemSize), 0); } } Args; @@ -220,6 +282,4 @@ struct ur_kernel_handle_t_ { } uint32_t getLocalSize() const noexcept { return Args.getLocalSize(); } - - void clearLocalSize() { Args.clearLocalSize(); } }; diff --git a/source/adapters/hip/ur_interface_loader.cpp b/source/adapters/hip/ur_interface_loader.cpp index 1454ddfdf1..f7ec09188f 100644 --- a/source/adapters/hip/ur_interface_loader.cpp +++ b/source/adapters/hip/ur_interface_loader.cpp @@ -174,6 +174,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueProcAddrTable( pDdiTable->pfnDeviceGlobalVariableWrite = urEnqueueDeviceGlobalVariableWrite; pDdiTable->pfnEventsWait = urEnqueueEventsWait; pDdiTable->pfnEventsWaitWithBarrier = urEnqueueEventsWaitWithBarrier; + pDdiTable->pfnEventsWaitWithBarrierExt = urEnqueueEventsWaitWithBarrierExt; pDdiTable->pfnKernelLaunch = urEnqueueKernelLaunch; pDdiTable->pfnMemBufferCopy = urEnqueueMemBufferCopy; pDdiTable->pfnMemBufferCopyRect = urEnqueueMemBufferCopyRect; diff --git a/source/adapters/level_zero/CMakeLists.txt b/source/adapters/level_zero/CMakeLists.txt index 6465ebaa51..05a33c1224 100644 --- a/source/adapters/level_zero/CMakeLists.txt +++ b/source/adapters/level_zero/CMakeLists.txt @@ -58,7 +58,7 @@ if(UR_BUILD_ADAPTER_L0) # 'utils' target from 'level-zero-loader' includes path which is prefixed # in the source directory, this breaks the installation of 'utils' target. set_target_properties(utils PROPERTIES INTERFACE_INCLUDE_DIRECTORIES "") - install(TARGETS ur_umf LevelZeroLoader LevelZeroLoader-Headers ze_loader utils + install(TARGETS ur_umf LevelZeroLoader LevelZeroLoader-Headers ComputeRuntimeLevelZero-Headers ze_loader utils EXPORT ${PROJECT_NAME}-targets ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} @@ -109,6 +109,7 @@ if(UR_BUILD_ADAPTER_L0) ${PROJECT_NAME}::umf LevelZeroLoader LevelZeroLoader-Headers + ComputeRuntimeLevelZero-Headers ) target_include_directories(ur_adapter_level_zero PRIVATE @@ -203,6 +204,7 @@ if(UR_BUILD_ADAPTER_L0_V2) ${PROJECT_NAME}::umf LevelZeroLoader LevelZeroLoader-Headers + ComputeRuntimeLevelZero-Headers ) target_include_directories(ur_adapter_level_zero_v2 PRIVATE diff --git a/source/adapters/level_zero/adapter.cpp b/source/adapters/level_zero/adapter.cpp index 8995a5e25b..5d6583f3d4 100644 --- a/source/adapters/level_zero/adapter.cpp +++ b/source/adapters/level_zero/adapter.cpp @@ -256,6 +256,9 @@ Behavior Summary: */ ur_adapter_handle_t_::ur_adapter_handle_t_() : logger(logger::get_logger("level_zero")) { + ZeInitDriversResult = ZE_RESULT_ERROR_UNINITIALIZED; + ZeInitResult = ZE_RESULT_ERROR_UNINITIALIZED; + ZesResult = ZE_RESULT_ERROR_UNINITIALIZED; if (UrL0Debug & UR_L0_DEBUG_BASIC) { logger.setLegacySink(std::make_unique()); @@ -331,9 +334,8 @@ ur_adapter_handle_t_::ur_adapter_handle_t_() logger::debug("\nzeInit with flags value of {}\n", static_cast(L0InitFlags)); GlobalAdapter->ZeInitResult = ZE_CALL_NOCHECK(zeInit, (L0InitFlags)); - if (*GlobalAdapter->ZeInitResult != ZE_RESULT_SUCCESS) { - logger::debug("\nzeInit failed with {}\n", - *GlobalAdapter->ZeInitResult); + if (GlobalAdapter->ZeInitResult != ZE_RESULT_SUCCESS) { + logger::debug("\nzeInit failed with {}\n", GlobalAdapter->ZeInitResult); } bool useInitDrivers = false; @@ -376,17 +378,17 @@ ur_adapter_handle_t_::ur_adapter_handle_t_() ZE_CALL_NOCHECK(GlobalAdapter->initDriversFunctionPtr, (&GlobalAdapter->ZeInitDriversCount, nullptr, &GlobalAdapter->InitDriversDesc)); - if (*GlobalAdapter->ZeInitDriversResult == ZE_RESULT_SUCCESS) { + if (GlobalAdapter->ZeInitDriversResult == ZE_RESULT_SUCCESS) { GlobalAdapter->InitDriversSupported = true; } else { logger::debug("\nzeInitDrivers failed with {}\n", - *GlobalAdapter->ZeInitDriversResult); + GlobalAdapter->ZeInitDriversResult); } } } - if (*GlobalAdapter->ZeInitResult == ZE_RESULT_SUCCESS || - *GlobalAdapter->ZeInitDriversResult == ZE_RESULT_SUCCESS) { + if (GlobalAdapter->ZeInitResult == ZE_RESULT_SUCCESS || + GlobalAdapter->ZeInitDriversResult == ZE_RESULT_SUCCESS) { GlobalAdapter->ZeResult = ZE_RESULT_SUCCESS; } else { GlobalAdapter->ZeResult = ZE_RESULT_ERROR_UNINITIALIZED; @@ -450,7 +452,7 @@ ur_adapter_handle_t_::ur_adapter_handle_t_() GlobalAdapter->ZesResult = ZE_RESULT_ERROR_UNINITIALIZED; } - ur_result_t err = initPlatforms(platforms, *GlobalAdapter->ZesResult); + ur_result_t err = initPlatforms(platforms, GlobalAdapter->ZesResult); if (err == UR_RESULT_SUCCESS) { result = std::move(platforms); } else { @@ -653,6 +655,14 @@ ur_result_t urAdapterGetInfo(ur_adapter_handle_t, ur_adapter_info_t PropName, return ReturnValue(UR_ADAPTER_BACKEND_LEVEL_ZERO); case UR_ADAPTER_INFO_REFERENCE_COUNT: return ReturnValue(GlobalAdapter->RefCount.load()); + case UR_ADAPTER_INFO_VERSION: { +#ifdef UR_ADAPTER_LEVEL_ZERO_V2 + uint32_t adapterVersion = 2; +#else + uint32_t adapterVersion = 1; +#endif + return ReturnValue(adapterVersion); + } default: return UR_RESULT_ERROR_INVALID_ENUMERATION; } diff --git a/source/adapters/level_zero/adapter.hpp b/source/adapters/level_zero/adapter.hpp index 277d2334b9..c41f671d9b 100644 --- a/source/adapters/level_zero/adapter.hpp +++ b/source/adapters/level_zero/adapter.hpp @@ -39,10 +39,10 @@ struct ur_adapter_handle_t_ { uint32_t ZeInitDriversCount = 0; bool InitDriversSupported = false; - std::optional ZeInitDriversResult; - std::optional ZeInitResult; + ze_result_t ZeInitDriversResult; + ze_result_t ZeInitResult; + ze_result_t ZesResult; std::optional ZeResult; - std::optional ZesResult; ZeCache> PlatformCache; logger::Logger &logger; HMODULE processHandle = nullptr; diff --git a/source/adapters/level_zero/command_buffer.cpp b/source/adapters/level_zero/command_buffer.cpp index 56c53b5331..eccdc5e4d2 100644 --- a/source/adapters/level_zero/command_buffer.cpp +++ b/source/adapters/level_zero/command_buffer.cpp @@ -476,21 +476,14 @@ void ur_exp_command_buffer_handle_t_::cleanupCommandBufferResources() { ur_exp_command_buffer_command_handle_t_:: ur_exp_command_buffer_command_handle_t_( - ur_exp_command_buffer_handle_t CommandBuffer, uint64_t CommandId, - uint32_t WorkDim, bool UserDefinedLocalSize, - ur_kernel_handle_t Kernel = nullptr) - : CommandBuffer(CommandBuffer), CommandId(CommandId), WorkDim(WorkDim), - UserDefinedLocalSize(UserDefinedLocalSize), Kernel(Kernel) { + ur_exp_command_buffer_handle_t CommandBuffer, uint64_t CommandId) + : CommandBuffer(CommandBuffer), CommandId(CommandId) { ur::level_zero::urCommandBufferRetainExp(CommandBuffer); - if (Kernel) - ur::level_zero::urKernelRetain(Kernel); } ur_exp_command_buffer_command_handle_t_:: ~ur_exp_command_buffer_command_handle_t_() { ur::level_zero::urCommandBufferReleaseExp(CommandBuffer); - if (Kernel) - ur::level_zero::urKernelRelease(Kernel); } void ur_exp_command_buffer_handle_t_::registerSyncPoint( @@ -527,6 +520,31 @@ ur_result_t ur_exp_command_buffer_handle_t_::getFenceForQueue( return UR_RESULT_SUCCESS; } +kernel_command_handle::kernel_command_handle( + ur_exp_command_buffer_handle_t CommandBuffer, ur_kernel_handle_t Kernel, + uint64_t CommandId, uint32_t WorkDim, bool UserDefinedLocalSize, + uint32_t NumKernelAlternatives, ur_kernel_handle_t *KernelAlternatives) + : ur_exp_command_buffer_command_handle_t_(CommandBuffer, CommandId), + WorkDim(WorkDim), UserDefinedLocalSize(UserDefinedLocalSize), + Kernel(Kernel) { + // Add the default kernel to the list of valid kernels + ur::level_zero::urKernelRetain(Kernel); + ValidKernelHandles.insert(Kernel); + // Add alternative kernels if provided + if (KernelAlternatives) { + for (size_t i = 0; i < NumKernelAlternatives; i++) { + ur::level_zero::urKernelRetain(KernelAlternatives[i]); + ValidKernelHandles.insert(KernelAlternatives[i]); + } + } +} + +kernel_command_handle::~kernel_command_handle() { + for (const ur_kernel_handle_t &KernelHandle : ValidKernelHandles) { + ur::level_zero::urKernelRelease(KernelHandle); + } +} + namespace ur::level_zero { /** @@ -906,7 +924,8 @@ setKernelPendingArguments(ur_exp_command_buffer_handle_t CommandBuffer, ur_result_t createCommandHandle(ur_exp_command_buffer_handle_t CommandBuffer, ur_kernel_handle_t Kernel, uint32_t WorkDim, - const size_t *LocalWorkSize, + const size_t *LocalWorkSize, uint32_t NumKernelAlternatives, + ur_kernel_handle_t *KernelAlternatives, ur_exp_command_buffer_command_handle_t &Command) { assert(CommandBuffer->IsUpdatable); @@ -923,14 +942,41 @@ createCommandHandle(ur_exp_command_buffer_handle_t CommandBuffer, ZE_MUTABLE_COMMAND_EXP_FLAG_GLOBAL_OFFSET; auto Platform = CommandBuffer->Context->getPlatform(); - ZE2UR_CALL(Platform->ZeMutableCmdListExt.zexCommandListGetNextCommandIdExp, - (CommandBuffer->ZeComputeCommandListTranslated, - &ZeMutableCommandDesc, &CommandId)); + if (NumKernelAlternatives > 0) { + ZeMutableCommandDesc.flags |= + ZE_MUTABLE_COMMAND_EXP_FLAG_KERNEL_INSTRUCTION; + + std::vector TranslatedKernelHandles( + NumKernelAlternatives + 1, nullptr); + + // Translate main kernel first + ZE2UR_CALL(zelLoaderTranslateHandle, + (ZEL_HANDLE_KERNEL, Kernel->ZeKernel, + (void **)&TranslatedKernelHandles[0])); + + for (size_t i = 0; i < NumKernelAlternatives; i++) { + ZE2UR_CALL(zelLoaderTranslateHandle, + (ZEL_HANDLE_KERNEL, KernelAlternatives[i]->ZeKernel, + (void **)&TranslatedKernelHandles[i + 1])); + } + + ZE2UR_CALL(Platform->ZeMutableCmdListExt + .zexCommandListGetNextCommandIdWithKernelsExp, + (CommandBuffer->ZeComputeCommandListTranslated, + &ZeMutableCommandDesc, NumKernelAlternatives + 1, + TranslatedKernelHandles.data(), &CommandId)); + + } else { + ZE2UR_CALL(Platform->ZeMutableCmdListExt.zexCommandListGetNextCommandIdExp, + (CommandBuffer->ZeComputeCommandListTranslated, + &ZeMutableCommandDesc, &CommandId)); + } DEBUG_LOG(CommandId); try { - Command = new ur_exp_command_buffer_command_handle_t_( - CommandBuffer, CommandId, WorkDim, LocalWorkSize != nullptr, Kernel); + Command = new kernel_command_handle( + CommandBuffer, Kernel, CommandId, WorkDim, LocalWorkSize != nullptr, + NumKernelAlternatives, KernelAlternatives); } catch (const std::bad_alloc &) { return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; } catch (...) { @@ -944,8 +990,7 @@ ur_result_t urCommandBufferAppendKernelLaunchExp( ur_exp_command_buffer_handle_t CommandBuffer, ur_kernel_handle_t Kernel, uint32_t WorkDim, const size_t *GlobalWorkOffset, const size_t *GlobalWorkSize, const size_t *LocalWorkSize, - uint32_t /*numKernelAlternatives*/, - ur_kernel_handle_t * /*phKernelAlternatives*/, + uint32_t NumKernelAlternatives, ur_kernel_handle_t *KernelAlternatives, uint32_t NumSyncPointsInWaitList, const ur_exp_command_buffer_sync_point_t *SyncPointWaitList, uint32_t NumEventsInWaitList, const ur_event_handle_t *EventWaitList, @@ -960,6 +1005,10 @@ ur_result_t urCommandBufferAppendKernelLaunchExp( UR_ASSERT(!(Command && !CommandBuffer->IsUpdatable), UR_RESULT_ERROR_INVALID_OPERATION); + for (uint32_t i = 0; i < NumKernelAlternatives; ++i) { + UR_ASSERT(KernelAlternatives[i] != Kernel, UR_RESULT_ERROR_INVALID_VALUE); + } + // Lock automatically releases when this goes out of scope. std::scoped_lock Lock( Kernel->Mutex, Kernel->Program->Mutex, CommandBuffer->Mutex); @@ -983,18 +1032,21 @@ ur_result_t urCommandBufferAppendKernelLaunchExp( ZE2UR_CALL(zeKernelSetGroupSize, (Kernel->ZeKernel, WG[0], WG[1], WG[2])); CommandBuffer->KernelsList.push_back(Kernel); + for (size_t i = 0; i < NumKernelAlternatives; i++) { + CommandBuffer->KernelsList.push_back(KernelAlternatives[i]); + } - // Increment the reference count of the Kernel and indicate that the Kernel - // is in use. Once the event has been signaled, the code in - // CleanupCompletedEvent(Event) will do a urKernelRelease to update the - // reference count on the kernel, using the kernel saved in CommandData. - UR_CALL(ur::level_zero::urKernelRetain(Kernel)); + ur::level_zero::urKernelRetain(Kernel); + // Retain alternative kernels if provided + for (size_t i = 0; i < NumKernelAlternatives; i++) { + ur::level_zero::urKernelRetain(KernelAlternatives[i]); + } if (Command) { UR_CALL(createCommandHandle(CommandBuffer, Kernel, WorkDim, LocalWorkSize, + NumKernelAlternatives, KernelAlternatives, *Command)); } - std::vector ZeEventList; ze_event_handle_t ZeLaunchEvent = nullptr; UR_CALL(createSyncPointAndGetZeEvents( @@ -1690,7 +1742,7 @@ ur_result_t urCommandBufferReleaseCommandExp( * @return UR_RESULT_SUCCESS or an error code on failure */ ur_result_t validateCommandDesc( - ur_exp_command_buffer_command_handle_t Command, + kernel_command_handle *Command, const ur_exp_command_buffer_update_kernel_launch_desc_t *CommandDesc) { auto CommandBuffer = Command->CommandBuffer; @@ -1699,9 +1751,14 @@ ur_result_t validateCommandDesc( ->mutableCommandFlags; logger::debug("Mutable features supported by device {}", SupportedFeatures); - // Kernel handle updates are not yet supported. - if (CommandDesc->hNewKernel && CommandDesc->hNewKernel != Command->Kernel) { - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + UR_ASSERT( + !CommandDesc->hNewKernel || + (SupportedFeatures & ZE_MUTABLE_COMMAND_EXP_FLAG_KERNEL_INSTRUCTION), + UR_RESULT_ERROR_UNSUPPORTED_FEATURE); + // Check if the provided new kernel is in the list of valid alternatives. + if (CommandDesc->hNewKernel && + !Command->ValidKernelHandles.count(CommandDesc->hNewKernel)) { + return UR_RESULT_ERROR_INVALID_VALUE; } if (CommandDesc->newWorkDim != Command->WorkDim && @@ -1754,7 +1811,7 @@ ur_result_t validateCommandDesc( * @return UR_RESULT_SUCCESS or an error code on failure */ ur_result_t updateKernelCommand( - ur_exp_command_buffer_command_handle_t Command, + kernel_command_handle *Command, const ur_exp_command_buffer_update_kernel_launch_desc_t *CommandDesc) { // We need the created descriptors to live till the point when @@ -1769,12 +1826,29 @@ ur_result_t updateKernelCommand( const auto CommandBuffer = Command->CommandBuffer; const void *NextDesc = nullptr; + auto Platform = CommandBuffer->Context->getPlatform(); uint32_t Dim = CommandDesc->newWorkDim; size_t *NewGlobalWorkOffset = CommandDesc->pNewGlobalWorkOffset; size_t *NewLocalWorkSize = CommandDesc->pNewLocalWorkSize; size_t *NewGlobalWorkSize = CommandDesc->pNewGlobalWorkSize; + // Kernel handle must be updated first for a given CommandId if required + ur_kernel_handle_t NewKernel = CommandDesc->hNewKernel; + if (NewKernel && Command->Kernel != NewKernel) { + ze_kernel_handle_t ZeKernelTranslated = nullptr; + ZE2UR_CALL( + zelLoaderTranslateHandle, + (ZEL_HANDLE_KERNEL, NewKernel->ZeKernel, (void **)&ZeKernelTranslated)); + + ZE2UR_CALL(Platform->ZeMutableCmdListExt + .zexCommandListUpdateMutableCommandKernelsExp, + (CommandBuffer->ZeComputeCommandListTranslated, 1, + &Command->CommandId, &ZeKernelTranslated)); + // Set current kernel to be the new kernel + Command->Kernel = NewKernel; + } + // Check if a new global offset is provided. if (NewGlobalWorkOffset && Dim > 0) { auto MutableGroupOffestDesc = @@ -1973,7 +2047,6 @@ ur_result_t updateKernelCommand( MutableCommandDesc.pNext = NextDesc; MutableCommandDesc.flags = 0; - auto Platform = CommandBuffer->Context->getPlatform(); ZE2UR_CALL( Platform->ZeMutableCmdListExt.zexCommandListUpdateMutableCommandsExp, (CommandBuffer->ZeComputeCommandListTranslated, &MutableCommandDesc)); @@ -2009,18 +2082,22 @@ ur_result_t urCommandBufferUpdateKernelLaunchExp( const ur_exp_command_buffer_update_kernel_launch_desc_t *CommandDesc) { UR_ASSERT(Command->CommandBuffer->IsUpdatable, UR_RESULT_ERROR_INVALID_OPERATION); - UR_ASSERT(Command->Kernel, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + + auto KernelCommandHandle = static_cast(Command); + + UR_ASSERT(KernelCommandHandle->Kernel, UR_RESULT_ERROR_INVALID_NULL_HANDLE); // Lock command, kernel and command buffer for update. std::scoped_lock Guard( - Command->Mutex, Command->CommandBuffer->Mutex, Command->Kernel->Mutex); + Command->Mutex, Command->CommandBuffer->Mutex, + KernelCommandHandle->Kernel->Mutex); UR_ASSERT(Command->CommandBuffer->IsFinalized, UR_RESULT_ERROR_INVALID_OPERATION); - UR_CALL(validateCommandDesc(Command, CommandDesc)); + UR_CALL(validateCommandDesc(KernelCommandHandle, CommandDesc)); UR_CALL(waitForOngoingExecution(Command->CommandBuffer)); - UR_CALL(updateKernelCommand(Command, CommandDesc)); + UR_CALL(updateKernelCommand(KernelCommandHandle, CommandDesc)); ZE2UR_CALL(zeCommandListClose, (Command->CommandBuffer->ZeComputeCommandList)); diff --git a/source/adapters/level_zero/command_buffer.hpp b/source/adapters/level_zero/command_buffer.hpp index 156e0e5c24..d069f301fb 100644 --- a/source/adapters/level_zero/command_buffer.hpp +++ b/source/adapters/level_zero/command_buffer.hpp @@ -145,18 +145,31 @@ struct ur_exp_command_buffer_handle_t_ : public _ur_object { struct ur_exp_command_buffer_command_handle_t_ : public _ur_object { ur_exp_command_buffer_command_handle_t_(ur_exp_command_buffer_handle_t, - uint64_t, uint32_t, bool, - ur_kernel_handle_t); + uint64_t); - ~ur_exp_command_buffer_command_handle_t_(); + virtual ~ur_exp_command_buffer_command_handle_t_(); // Command-buffer of this command. ur_exp_command_buffer_handle_t CommandBuffer; - + // L0 command ID identifying this command uint64_t CommandId; +}; + +struct kernel_command_handle : public ur_exp_command_buffer_command_handle_t_ { + kernel_command_handle(ur_exp_command_buffer_handle_t CommandBuffer, + ur_kernel_handle_t Kernel, uint64_t CommandId, + uint32_t WorkDim, bool UserDefinedLocalSize, + uint32_t NumKernelAlternatives, + ur_kernel_handle_t *KernelAlternatives); + + ~kernel_command_handle(); + // Work-dimension the command was originally created with. uint32_t WorkDim; // Set to true if the user set the local work size on command creation. bool UserDefinedLocalSize; + // Currently active kernel handle ur_kernel_handle_t Kernel; + // Storage for valid kernel alternatives for this command. + std::unordered_set ValidKernelHandles; }; diff --git a/source/adapters/level_zero/common.cpp b/source/adapters/level_zero/common.cpp index f5d8b20014..da7f624013 100644 --- a/source/adapters/level_zero/common.cpp +++ b/source/adapters/level_zero/common.cpp @@ -11,6 +11,7 @@ #include "common.hpp" #include "logger/ur_logger.hpp" #include "usm.hpp" +#include ur_result_t ze2urResult(ze_result_t ZeResult) { if (ZeResult == ZE_RESULT_SUCCESS) @@ -330,6 +331,14 @@ template <> zes_structure_type_t getZesStructureType() { return ZES_STRUCTURE_TYPE_MEM_PROPERTIES; } +#ifdef ZE_INTEL_DEVICE_BLOCK_ARRAY_EXP_NAME +template <> +ze_structure_type_t +getZeStructureType() { + return ZE_INTEL_DEVICE_BLOCK_ARRAY_EXP_PROPERTIES; +} +#endif // ZE_INTEL_DEVICE_BLOCK_ARRAY_EXP_NAME + // Global variables for ZER_EXT_RESULT_ADAPTER_SPECIFIC_ERROR thread_local ur_result_t ErrorMessageCode = UR_RESULT_SUCCESS; thread_local char ErrorMessage[MaxMessageSize]; diff --git a/source/adapters/level_zero/context.cpp b/source/adapters/level_zero/context.cpp index 7c1c412ee4..4fd1db0933 100644 --- a/source/adapters/level_zero/context.cpp +++ b/source/adapters/level_zero/context.cpp @@ -565,18 +565,26 @@ ur_event_handle_t ur_context_handle_t_::getEventFromContextCache( bool HostVisible, bool WithProfiling, ur_device_handle_t Device, bool CounterBasedEventEnabled) { std::scoped_lock Lock(EventCacheMutex); - auto Cache = getEventCache(HostVisible, WithProfiling, Device); - if (Cache->empty()) + auto Cache = getEventCache(HostVisible, WithProfiling, Device, + CounterBasedEventEnabled); + if (Cache->empty()) { + logger::info("Cache empty (Host Visible: {}, Profiling: {}, Counter: {}, " + "Device: {})", + HostVisible, WithProfiling, CounterBasedEventEnabled, Device); return nullptr; + } auto It = Cache->begin(); ur_event_handle_t Event = *It; - if (Event->CounterBasedEventsEnabled != CounterBasedEventEnabled) { - return nullptr; - } Cache->erase(It); // We have to reset event before using it. Event->reset(); + + logger::info("Using {} event (Host Visible: {}, Profiling: {}, Counter: {}, " + "Device: {}) from cache {}", + Event, Event->HostVisibleEvent, Event->isProfilingEnabled(), + Event->CounterBasedEventsEnabled, Device, Cache); + return Event; } @@ -588,8 +596,13 @@ void ur_context_handle_t_::addEventToContextCache(ur_event_handle_t Event) { Device = Event->UrQueue->Device; } - auto Cache = getEventCache(Event->isHostVisible(), - Event->isProfilingEnabled(), Device); + auto Cache = + getEventCache(Event->isHostVisible(), Event->isProfilingEnabled(), Device, + Event->CounterBasedEventsEnabled); + logger::info("Inserting {} event (Host Visible: {}, Profiling: {}, Counter: " + "{}, Device: {}) into cache {}", + Event, Event->HostVisibleEvent, Event->isProfilingEnabled(), + Event->CounterBasedEventsEnabled, Device, Cache); Cache->emplace_back(Event); } diff --git a/source/adapters/level_zero/context.hpp b/source/adapters/level_zero/context.hpp index 0d3b2846e2..470c4c4f35 100644 --- a/source/adapters/level_zero/context.hpp +++ b/source/adapters/level_zero/context.hpp @@ -169,15 +169,6 @@ struct ur_context_handle_t_ : _ur_object { // holding the current pool usage counts. ur_mutex ZeEventPoolCacheMutex; - // Mutex to control operations on event caches. - ur_mutex EventCacheMutex; - - // Caches for events. - using EventCache = std::vector>; - EventCache EventCaches{4}; - std::vector> - EventCachesDeviceMap{4}; - // Initialize the PI context. ur_result_t initialize(); @@ -313,36 +304,45 @@ struct ur_context_handle_t_ : _ur_object { ze_context_handle_t getZeHandle() const; private: + enum EventFlags { + EVENT_FLAG_HOST_VISIBLE = UR_BIT(0), + EVENT_FLAG_WITH_PROFILING = UR_BIT(1), + EVENT_FLAG_COUNTER = UR_BIT(2), + EVENT_FLAG_DEVICE = UR_BIT(3), // if set, subsequent bits are device id + MAX_EVENT_FLAG_BITS = + 4, // this is used as an offset for embedding device id + }; + + // Mutex to control operations on event caches. + ur_mutex EventCacheMutex; + + // Caches for events. + using EventCache = std::list; + std::vector EventCaches; + // Get the cache of events for a provided scope and profiling mode. - auto getEventCache(bool HostVisible, bool WithProfiling, - ur_device_handle_t Device) { + EventCache *getEventCache(bool HostVisible, bool WithProfiling, + ur_device_handle_t Device, bool Counter) { + + size_t index = 0; if (HostVisible) { - if (Device) { - auto EventCachesMap = - WithProfiling ? &EventCachesDeviceMap[0] : &EventCachesDeviceMap[1]; - if (EventCachesMap->find(Device) == EventCachesMap->end()) { - EventCaches.emplace_back(); - EventCachesMap->insert( - std::make_pair(Device, EventCaches.size() - 1)); - } - return &EventCaches[(*EventCachesMap)[Device]]; - } else { - return WithProfiling ? &EventCaches[0] : &EventCaches[1]; - } - } else { - if (Device) { - auto EventCachesMap = - WithProfiling ? &EventCachesDeviceMap[2] : &EventCachesDeviceMap[3]; - if (EventCachesMap->find(Device) == EventCachesMap->end()) { - EventCaches.emplace_back(); - EventCachesMap->insert( - std::make_pair(Device, EventCaches.size() - 1)); - } - return &EventCaches[(*EventCachesMap)[Device]]; - } else { - return WithProfiling ? &EventCaches[2] : &EventCaches[3]; - } + index |= EVENT_FLAG_HOST_VISIBLE; + } + if (WithProfiling) { + index |= EVENT_FLAG_WITH_PROFILING; } + if (Counter) { + index |= EVENT_FLAG_COUNTER; + } + if (Device) { + index |= EVENT_FLAG_DEVICE | (*Device->Id << MAX_EVENT_FLAG_BITS); + } + + if (index >= EventCaches.size()) { + EventCaches.resize(index + 1); + } + + return &EventCaches[index]; } }; diff --git a/source/adapters/level_zero/device.cpp b/source/adapters/level_zero/device.cpp index 865edebc08..99bb20d31a 100644 --- a/source/adapters/level_zero/device.cpp +++ b/source/adapters/level_zero/device.cpp @@ -1048,6 +1048,10 @@ ur_result_t urDeviceGetInfo( UpdateCapabilities |= UR_DEVICE_COMMAND_BUFFER_UPDATE_CAPABILITY_FLAG_GLOBAL_WORK_OFFSET; } + if (supportsFlags(ZE_MUTABLE_COMMAND_EXP_FLAG_KERNEL_INSTRUCTION)) { + UpdateCapabilities |= + UR_DEVICE_COMMAND_BUFFER_UPDATE_CAPABILITY_FLAG_KERNEL_HANDLE; + } return ReturnValue(UpdateCapabilities); } case UR_DEVICE_INFO_COMMAND_BUFFER_EVENT_SUPPORT_EXP: @@ -1153,6 +1157,30 @@ ur_result_t urDeviceGetInfo( return ReturnValue(true); case UR_DEVICE_INFO_LOW_POWER_EVENTS_EXP: return ReturnValue(false); + case UR_DEVICE_INFO_2D_BLOCK_ARRAY_CAPABILITIES_EXP: { +#ifdef ZE_INTEL_DEVICE_BLOCK_ARRAY_EXP_NAME + const auto ZeDeviceBlockArrayFlags = + Device->ZeDeviceBlockArrayProperties->flags; + + auto supportsFlags = + [&](ze_intel_device_block_array_exp_flags_t RequiredFlags) { + return (ZeDeviceBlockArrayFlags & RequiredFlags) == RequiredFlags; + }; + + ur_exp_device_2d_block_array_capability_flags_t BlockArrayCapabilities = 0; + if (supportsFlags(ZE_INTEL_DEVICE_EXP_FLAG_2D_BLOCK_LOAD)) { + BlockArrayCapabilities |= + UR_EXP_DEVICE_2D_BLOCK_ARRAY_CAPABILITY_FLAG_LOAD; + } + if (supportsFlags(ZE_INTEL_DEVICE_EXP_FLAG_2D_BLOCK_STORE)) { + BlockArrayCapabilities |= + UR_EXP_DEVICE_2D_BLOCK_ARRAY_CAPABILITY_FLAG_STORE; + } + return ReturnValue(BlockArrayCapabilities); +#else + return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION; +#endif + } default: logger::error("Unsupported ParamName in urGetDeviceInfo"); logger::error("ParamNameParamName={}(0x{})", ParamName, @@ -1584,6 +1612,17 @@ ur_result_t ur_device_handle_t_::initialize(int SubSubDeviceOrdinal, ZE_CALL_NOCHECK(zeDeviceGetProperties, (ZeDevice, &P)); }; +#ifdef ZE_INTEL_DEVICE_BLOCK_ARRAY_EXP_NAME + ZeDeviceBlockArrayProperties.Compute = + [ZeDevice]( + ZeStruct &Properties) { + ze_device_properties_t P; + P.stype = ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES; + P.pNext = &Properties; + ZE_CALL_NOCHECK(zeDeviceGetProperties, (ZeDevice, &P)); + }; +#endif // ZE_INTEL_DEVICE_BLOCK_ARRAY_EXP_NAME + ImmCommandListUsed = this->useImmediateCommandLists(); uint32_t numQueueGroups = 0; diff --git a/source/adapters/level_zero/device.hpp b/source/adapters/level_zero/device.hpp index 3d78a99b97..512a5ff714 100644 --- a/source/adapters/level_zero/device.hpp +++ b/source/adapters/level_zero/device.hpp @@ -18,14 +18,14 @@ #include #include +#include "adapters/level_zero/platform.hpp" +#include "common.hpp" +#include #include #include #include #include -#include "adapters/level_zero/platform.hpp" -#include "common.hpp" - enum EventsScope { // All events are created host-visible. AllHostVisible, @@ -224,6 +224,10 @@ struct ur_device_handle_t_ : _ur_object { ZeCache ZeGlobalMemSize; ZeCache> ZeDeviceMutableCmdListsProperties; +#ifdef ZE_INTEL_DEVICE_BLOCK_ARRAY_EXP_NAME + ZeCache> + ZeDeviceBlockArrayProperties; +#endif // ZE_INTEL_DEVICE_BLOCK_ARRAY_EXP_NAME // Map device bindless image offset to corresponding host image handle. std::unordered_map diff --git a/source/adapters/level_zero/platform.cpp b/source/adapters/level_zero/platform.cpp index 520b52a1c8..0237b62863 100644 --- a/source/adapters/level_zero/platform.cpp +++ b/source/adapters/level_zero/platform.cpp @@ -319,6 +319,22 @@ ur_result_t ur_platform_handle_t_::initialize() { ZeMutableCmdListExt.Supported |= ZeMutableCmdListExt.zexCommandListUpdateMutableCommandWaitEventsExp != nullptr; + ZeMutableCmdListExt.zexCommandListUpdateMutableCommandKernelsExp = + (ze_pfnCommandListUpdateMutableCommandKernelsExp_t) + ur_loader::LibLoader::getFunctionPtr( + GlobalAdapter->processHandle, + "zeCommandListUpdateMutableCommandKernelsExp"); + ZeMutableCmdListExt.Supported |= + ZeMutableCmdListExt.zexCommandListUpdateMutableCommandKernelsExp != + nullptr; + ZeMutableCmdListExt.zexCommandListGetNextCommandIdWithKernelsExp = + (ze_pfnCommandListGetNextCommandIdWithKernelsExp_t) + ur_loader::LibLoader::getFunctionPtr( + GlobalAdapter->processHandle, + "zeCommandListGetNextCommandIdWithKernelsExp"); + ZeMutableCmdListExt.Supported |= + ZeMutableCmdListExt.zexCommandListGetNextCommandIdWithKernelsExp != + nullptr; } else { ZeMutableCmdListExt.Supported |= (ZE_CALL_NOCHECK( @@ -353,6 +369,21 @@ ur_result_t ur_platform_handle_t_::initialize() { &ZeMutableCmdListExt .zexCommandListUpdateMutableCommandWaitEventsExp))) == 0); + ZeMutableCmdListExt.Supported &= + (ZE_CALL_NOCHECK( + zeDriverGetExtensionFunctionAddress, + (ZeDriver, "zeCommandListUpdateMutableCommandKernelsExp", + reinterpret_cast( + &ZeMutableCmdListExt + .zexCommandListUpdateMutableCommandKernelsExp))) == 0); + + ZeMutableCmdListExt.Supported &= + (ZE_CALL_NOCHECK( + zeDriverGetExtensionFunctionAddress, + (ZeDriver, "zeCommandListGetNextCommandIdWithKernelsExp", + reinterpret_cast( + &ZeMutableCmdListExt + .zexCommandListGetNextCommandIdWithKernelsExp))) == 0); } return UR_RESULT_SUCCESS; } diff --git a/source/adapters/level_zero/platform.hpp b/source/adapters/level_zero/platform.hpp index 413bb5c48a..468c602b10 100644 --- a/source/adapters/level_zero/platform.hpp +++ b/source/adapters/level_zero/platform.hpp @@ -107,5 +107,11 @@ struct ur_platform_handle_t_ : public _ur_platform { ze_result_t (*zexCommandListUpdateMutableCommandWaitEventsExp)( ze_command_list_handle_t, uint64_t, uint32_t, ze_event_handle_t *) = nullptr; + ze_result_t (*zexCommandListUpdateMutableCommandKernelsExp)( + ze_command_list_handle_t, uint32_t, uint64_t *, + ze_kernel_handle_t *) = nullptr; + ze_result_t (*zexCommandListGetNextCommandIdWithKernelsExp)( + ze_command_list_handle_t, const ze_mutable_command_id_exp_desc_t *, + uint32_t, ze_kernel_handle_t *, uint64_t *) = nullptr; } ZeMutableCmdListExt; }; diff --git a/source/adapters/level_zero/v2/command_list_cache.cpp b/source/adapters/level_zero/v2/command_list_cache.cpp index 9e585b80af..be4cb813fd 100644 --- a/source/adapters/level_zero/v2/command_list_cache.cpp +++ b/source/adapters/level_zero/v2/command_list_cache.cpp @@ -13,15 +13,6 @@ #include "../device.hpp" -typedef struct _zex_intel_queue_copy_operations_offload_hint_exp_desc_t { - ze_structure_type_t stype; - const void *pNext; - ze_bool_t copyOffloadEnabled; -} zex_intel_queue_copy_operations_offload_hint_exp_desc_t; - -#define ZEX_INTEL_STRUCTURE_TYPE_QUEUE_COPY_OPERATIONS_OFFLOAD_HINT_EXP_PROPERTIES \ - (ze_structure_type_t)0x0003001B - template <> ze_structure_type_t getZeStructureType() { diff --git a/source/adapters/native_cpu/adapter.cpp b/source/adapters/native_cpu/adapter.cpp index 2b5b95ccd0..727c3e3dba 100644 --- a/source/adapters/native_cpu/adapter.cpp +++ b/source/adapters/native_cpu/adapter.cpp @@ -57,6 +57,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urAdapterGetInfo(ur_adapter_handle_t, return ReturnValue(UR_ADAPTER_BACKEND_NATIVE_CPU); case UR_ADAPTER_INFO_REFERENCE_COUNT: return ReturnValue(Adapter.RefCount.load()); + case UR_ADAPTER_INFO_VERSION: + return ReturnValue(uint32_t{1}); default: return UR_RESULT_ERROR_INVALID_ENUMERATION; } diff --git a/source/adapters/native_cpu/device.cpp b/source/adapters/native_cpu/device.cpp index b7c454315f..69c8bfc784 100644 --- a/source/adapters/native_cpu/device.cpp +++ b/source/adapters/native_cpu/device.cpp @@ -366,6 +366,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, case UR_DEVICE_INFO_MEMORY_CLOCK_RATE: case UR_DEVICE_INFO_MEMORY_BUS_WIDTH: return UR_RESULT_ERROR_INVALID_VALUE; + case UR_DEVICE_INFO_2D_BLOCK_ARRAY_CAPABILITIES_EXP: + return ReturnValue( + static_cast(0)); case UR_DEVICE_INFO_ATOMIC_FENCE_ORDER_CAPABILITIES: { // Currently for Native CPU fences are implemented using OCK // builtins, so we have different capabilities than atomic operations diff --git a/source/adapters/native_cpu/ur_interface_loader.cpp b/source/adapters/native_cpu/ur_interface_loader.cpp index 94c6c4a03e..9717f020c3 100644 --- a/source/adapters/native_cpu/ur_interface_loader.cpp +++ b/source/adapters/native_cpu/ur_interface_loader.cpp @@ -172,6 +172,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueProcAddrTable( pDdiTable->pfnDeviceGlobalVariableWrite = urEnqueueDeviceGlobalVariableWrite; pDdiTable->pfnEventsWait = urEnqueueEventsWait; pDdiTable->pfnEventsWaitWithBarrier = urEnqueueEventsWaitWithBarrier; + pDdiTable->pfnEventsWaitWithBarrierExt = urEnqueueEventsWaitWithBarrierExt; pDdiTable->pfnKernelLaunch = urEnqueueKernelLaunch; pDdiTable->pfnMemBufferCopy = urEnqueueMemBufferCopy; pDdiTable->pfnMemBufferCopyRect = urEnqueueMemBufferCopyRect; diff --git a/source/adapters/opencl/adapter.cpp b/source/adapters/opencl/adapter.cpp index bf81f6bdaf..162bc59b6a 100644 --- a/source/adapters/opencl/adapter.cpp +++ b/source/adapters/opencl/adapter.cpp @@ -128,6 +128,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urAdapterGetInfo(ur_adapter_handle_t, return ReturnValue(UR_ADAPTER_BACKEND_OPENCL); case UR_ADAPTER_INFO_REFERENCE_COUNT: return ReturnValue(adapter->RefCount.load()); + case UR_ADAPTER_INFO_VERSION: + return ReturnValue(uint32_t{1}); default: return UR_RESULT_ERROR_INVALID_ENUMERATION; } diff --git a/source/adapters/opencl/device.cpp b/source/adapters/opencl/device.cpp index 70559eb52e..b33d637a84 100644 --- a/source/adapters/opencl/device.cpp +++ b/source/adapters/opencl/device.cpp @@ -1093,10 +1093,21 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, case UR_DEVICE_INFO_GLOBAL_MEM_FREE: case UR_DEVICE_INFO_MEMORY_CLOCK_RATE: case UR_DEVICE_INFO_MEMORY_BUS_WIDTH: - case UR_DEVICE_INFO_ASYNC_BARRIER: { + case UR_DEVICE_INFO_ASYNC_BARRIER: return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION; + case UR_DEVICE_INFO_2D_BLOCK_ARRAY_CAPABILITIES_EXP: { + bool Is2DBlockIOSupported = false; + if (cl_adapter::checkDeviceExtensions( + cl_adapter::cast(hDevice), + {"cl_intel_subgroup_2d_block_io"}, + Is2DBlockIOSupported) != UR_RESULT_SUCCESS || + !Is2DBlockIOSupported) { + return ReturnValue( + static_cast(0)); + } + return ReturnValue(UR_EXP_DEVICE_2D_BLOCK_ARRAY_CAPABILITY_FLAG_LOAD | + UR_EXP_DEVICE_2D_BLOCK_ARRAY_CAPABILITY_FLAG_STORE); } - case UR_DEVICE_INFO_COMMAND_BUFFER_SUPPORT_EXP: { cl_device_id Dev = cl_adapter::cast(hDevice); size_t ExtSize = 0; diff --git a/source/adapters/opencl/ur_interface_loader.cpp b/source/adapters/opencl/ur_interface_loader.cpp index cba90ee152..46d2bf6cdd 100644 --- a/source/adapters/opencl/ur_interface_loader.cpp +++ b/source/adapters/opencl/ur_interface_loader.cpp @@ -174,6 +174,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueProcAddrTable( pDdiTable->pfnDeviceGlobalVariableWrite = urEnqueueDeviceGlobalVariableWrite; pDdiTable->pfnEventsWait = urEnqueueEventsWait; pDdiTable->pfnEventsWaitWithBarrier = urEnqueueEventsWaitWithBarrier; + pDdiTable->pfnEventsWaitWithBarrierExt = urEnqueueEventsWaitWithBarrierExt; pDdiTable->pfnKernelLaunch = urEnqueueKernelLaunch; pDdiTable->pfnMemBufferCopy = urEnqueueMemBufferCopy; pDdiTable->pfnMemBufferCopyRect = urEnqueueMemBufferCopyRect; diff --git a/source/common/CMakeLists.txt b/source/common/CMakeLists.txt index df3fb95262..e24d987567 100644 --- a/source/common/CMakeLists.txt +++ b/source/common/CMakeLists.txt @@ -32,8 +32,11 @@ if (NOT DEFINED UMF_REPO) endif() if (NOT DEFINED UMF_TAG) - # main 28.10.2024: Merge pull request #832 ... - set(UMF_TAG 43e9af0f50b70ccb989f786243881035dd829203) + # special branch with cherry-picks for incoming pulldown + # contains UMF PRs: #866, #924, and #930 + # branch was based on commit: 3bae087c9a8c0cbed5bde40f0d5a2 + # umf-fixes-nov-pulldown: 25.11.2024: Disable libudev in hwloc builds + set(UMF_TAG a7b6152b7b095c88ddf34bc7d442eb4c2b3f74d6) endif() message(STATUS "Will fetch Unified Memory Framework from ${UMF_REPO}") diff --git a/source/common/umf_pools/disjoint_pool_config_parser.cpp b/source/common/umf_pools/disjoint_pool_config_parser.cpp index f1bb7cd40c..0e82072ae2 100644 --- a/source/common/umf_pools/disjoint_pool_config_parser.cpp +++ b/source/common/umf_pools/disjoint_pool_config_parser.cpp @@ -215,6 +215,8 @@ DisjointPoolAllConfigs parseDisjointPoolConfig(const std::string &config, } } + AllConfigs.EnableBuffers = EnableBuffers; + AllConfigs.limits = std::shared_ptr( umfDisjointPoolSharedLimitsCreate(MaxSize), umfDisjointPoolSharedLimitsDestroy); @@ -224,10 +226,6 @@ DisjointPoolAllConfigs parseDisjointPoolConfig(const std::string &config, Config.PoolTrace = trace; } - if (!EnableBuffers) { - return {}; - } - if (!trace) { return AllConfigs; } diff --git a/source/loader/layers/sanitizer/asan/asan_interceptor.cpp b/source/loader/layers/sanitizer/asan/asan_interceptor.cpp index 589e449869..271d846990 100644 --- a/source/loader/layers/sanitizer/asan/asan_interceptor.cpp +++ b/source/loader/layers/sanitizer/asan/asan_interceptor.cpp @@ -932,7 +932,8 @@ ContextInfo::~ContextInfo() { assert(Result == UR_RESULT_SUCCESS); // check memory leaks - if (getAsanInterceptor()->isNormalExit()) { + if (getAsanInterceptor()->getOptions().DetectLeaks && + getAsanInterceptor()->isNormalExit()) { std::vector AllocInfos = getAsanInterceptor()->findAllocInfoByContext(Handle); for (const auto &It : AllocInfos) { diff --git a/source/loader/layers/sanitizer/asan/asan_interceptor.hpp b/source/loader/layers/sanitizer/asan/asan_interceptor.hpp index d8dd11101c..926be1388e 100644 --- a/source/loader/layers/sanitizer/asan/asan_interceptor.hpp +++ b/source/loader/layers/sanitizer/asan/asan_interceptor.hpp @@ -310,6 +310,8 @@ class AsanInterceptor { ur_result_t registerSpirKernels(ur_program_handle_t Program); private: + // m_Options may be used in other places, place it at the top + AsanOptions m_Options; std::unordered_map> m_ContextMap; ur_shared_mutex m_ContextMapMutex; @@ -335,8 +337,6 @@ class AsanInterceptor { std::unique_ptr m_Quarantine; - AsanOptions m_Options; - std::unordered_set m_Adapters; ur_shared_mutex m_AdaptersMutex; diff --git a/source/loader/layers/sanitizer/asan/asan_options.cpp b/source/loader/layers/sanitizer/asan/asan_options.cpp index 5953a75c85..e889a4b539 100644 --- a/source/loader/layers/sanitizer/asan/asan_options.cpp +++ b/source/loader/layers/sanitizer/asan/asan_options.cpp @@ -88,6 +88,7 @@ AsanOptions::AsanOptions() { SetBoolOption("detect_locals", DetectLocals); SetBoolOption("detect_privates", DetectPrivates); SetBoolOption("print_stats", PrintStats); + SetBoolOption("detect_leaks", DetectLeaks); auto KV = OptionsEnvMap->find("quarantine_size_mb"); if (KV != OptionsEnvMap->end()) { diff --git a/source/loader/layers/sanitizer/asan/asan_options.hpp b/source/loader/layers/sanitizer/asan/asan_options.hpp index 1385fdf6e3..711232bba7 100644 --- a/source/loader/layers/sanitizer/asan/asan_options.hpp +++ b/source/loader/layers/sanitizer/asan/asan_options.hpp @@ -26,6 +26,7 @@ struct AsanOptions { bool DetectPrivates = true; bool PrintStats = false; bool DetectKernelArguments = true; + bool DetectLeaks = true; explicit AsanOptions(); }; diff --git a/source/loader/layers/validation/ur_valddi.cpp b/source/loader/layers/validation/ur_valddi.cpp index abf32dd82a..7f0a017fe9 100644 --- a/source/loader/layers/validation/ur_valddi.cpp +++ b/source/loader/layers/validation/ur_valddi.cpp @@ -182,7 +182,7 @@ __urdlllocal ur_result_t UR_APICALL urAdapterGetInfo( return UR_RESULT_ERROR_INVALID_NULL_POINTER; } - if (UR_ADAPTER_INFO_REFERENCE_COUNT < propName) { + if (UR_ADAPTER_INFO_VERSION < propName) { return UR_RESULT_ERROR_INVALID_ENUMERATION; } @@ -518,7 +518,7 @@ __urdlllocal ur_result_t UR_APICALL urDeviceGetInfo( return UR_RESULT_ERROR_INVALID_NULL_POINTER; } - if (UR_DEVICE_INFO_LOW_POWER_EVENTS_EXP < propName) { + if (UR_DEVICE_INFO_2D_BLOCK_ARRAY_CAPABILITIES_EXP < propName) { return UR_RESULT_ERROR_INVALID_ENUMERATION; } diff --git a/source/loader/loader.def.in b/source/loader/loader.def.in index b5c3bde6ea..a336da153d 100644 --- a/source/loader/loader.def.in +++ b/source/loader/loader.def.in @@ -312,6 +312,7 @@ EXPORTS urPrintExpCommandBufferUpdateMemobjArgDesc urPrintExpCommandBufferUpdatePointerArgDesc urPrintExpCommandBufferUpdateValueArgDesc + urPrintExpDevice_2dBlockArrayCapabilityFlags urPrintExpEnqueueExtFlags urPrintExpEnqueueExtProperties urPrintExpEnqueueNativeCommandFlags diff --git a/source/loader/loader.map.in b/source/loader/loader.map.in index 778a5da065..59a8a8d107 100644 --- a/source/loader/loader.map.in +++ b/source/loader/loader.map.in @@ -312,6 +312,7 @@ urPrintExpCommandBufferUpdateMemobjArgDesc; urPrintExpCommandBufferUpdatePointerArgDesc; urPrintExpCommandBufferUpdateValueArgDesc; + urPrintExpDevice_2dBlockArrayCapabilityFlags; urPrintExpEnqueueExtFlags; urPrintExpEnqueueExtProperties; urPrintExpEnqueueNativeCommandFlags; diff --git a/source/loader/ur_libapi.cpp b/source/loader/ur_libapi.cpp index 9b3196bbba..5cdfb3c805 100644 --- a/source/loader/ur_libapi.cpp +++ b/source/loader/ur_libapi.cpp @@ -451,7 +451,7 @@ ur_result_t UR_APICALL urAdapterGetLastError( /// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE /// + `NULL == hAdapter` /// - ::UR_RESULT_ERROR_INVALID_ENUMERATION -/// + `::UR_ADAPTER_INFO_REFERENCE_COUNT < propName` +/// + `::UR_ADAPTER_INFO_VERSION < propName` /// - ::UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION /// + If `propName` is not supported by the adapter. /// - ::UR_RESULT_ERROR_INVALID_SIZE @@ -870,7 +870,7 @@ ur_result_t UR_APICALL urDeviceGetSelected( /// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE /// + `NULL == hDevice` /// - ::UR_RESULT_ERROR_INVALID_ENUMERATION -/// + `::UR_DEVICE_INFO_LOW_POWER_EVENTS_EXP < propName` +/// + `::UR_DEVICE_INFO_2D_BLOCK_ARRAY_CAPABILITIES_EXP < propName` /// - ::UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION /// + If `propName` is not supported by the adapter. /// - ::UR_RESULT_ERROR_INVALID_SIZE diff --git a/source/loader/ur_print.cpp b/source/loader/ur_print.cpp index d8206edb3f..6b1cbfd5ee 100644 --- a/source/loader/ur_print.cpp +++ b/source/loader/ur_print.cpp @@ -879,6 +879,14 @@ ur_result_t urPrintUsmMigrationFlags(enum ur_usm_migration_flag_t value, return str_copy(&ss, buffer, buff_size, out_size); } +ur_result_t urPrintExpDevice_2dBlockArrayCapabilityFlags( + enum ur_exp_device_2d_block_array_capability_flag_t value, char *buffer, + const size_t buff_size, size_t *out_size) { + std::stringstream ss; + ss << value; + return str_copy(&ss, buffer, buff_size, out_size); +} + ur_result_t urPrintExpImageCopyFlags(enum ur_exp_image_copy_flag_t value, char *buffer, const size_t buff_size, size_t *out_size) { diff --git a/source/ur_api.cpp b/source/ur_api.cpp index 324f08aa52..914203df6e 100644 --- a/source/ur_api.cpp +++ b/source/ur_api.cpp @@ -410,7 +410,7 @@ ur_result_t UR_APICALL urAdapterGetLastError( /// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE /// + `NULL == hAdapter` /// - ::UR_RESULT_ERROR_INVALID_ENUMERATION -/// + `::UR_ADAPTER_INFO_REFERENCE_COUNT < propName` +/// + `::UR_ADAPTER_INFO_VERSION < propName` /// - ::UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION /// + If `propName` is not supported by the adapter. /// - ::UR_RESULT_ERROR_INVALID_SIZE @@ -771,7 +771,7 @@ ur_result_t UR_APICALL urDeviceGetSelected( /// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE /// + `NULL == hDevice` /// - ::UR_RESULT_ERROR_INVALID_ENUMERATION -/// + `::UR_DEVICE_INFO_LOW_POWER_EVENTS_EXP < propName` +/// + `::UR_DEVICE_INFO_2D_BLOCK_ARRAY_CAPABILITIES_EXP < propName` /// - ::UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION /// + If `propName` is not supported by the adapter. /// - ::UR_RESULT_ERROR_INVALID_SIZE diff --git a/test/adapters/level_zero/CMakeLists.txt b/test/adapters/level_zero/CMakeLists.txt index bfb02d37c2..8fe062b38b 100644 --- a/test/adapters/level_zero/CMakeLists.txt +++ b/test/adapters/level_zero/CMakeLists.txt @@ -29,6 +29,7 @@ if(UR_BUILD_ADAPTER_L0) target_link_libraries(test-adapter-level_zero PRIVATE LevelZeroLoader LevelZeroLoader-Headers + ComputeRuntimeLevelZero-Headers ) target_include_directories(test-adapter-level_zero PRIVATE diff --git a/test/adapters/level_zero/v2/CMakeLists.txt b/test/adapters/level_zero/v2/CMakeLists.txt index f53cf15256..df6b43c443 100644 --- a/test/adapters/level_zero/v2/CMakeLists.txt +++ b/test/adapters/level_zero/v2/CMakeLists.txt @@ -25,6 +25,7 @@ function(add_unittest name) ${PROJECT_NAME}::umf LevelZeroLoader LevelZeroLoader-Headers + ComputeRuntimeLevelZero-Headers ) endfunction() diff --git a/test/conformance/adapter/urAdapterGetInfo.cpp b/test/conformance/adapter/urAdapterGetInfo.cpp index 4dff3ce4dc..63c3cbfca1 100644 --- a/test/conformance/adapter/urAdapterGetInfo.cpp +++ b/test/conformance/adapter/urAdapterGetInfo.cpp @@ -20,12 +20,14 @@ struct urAdapterGetInfoTest : uur::runtime::urAdapterTest, std::unordered_map adapter_info_size_map = { {UR_ADAPTER_INFO_BACKEND, sizeof(ur_adapter_backend_t)}, + {UR_ADAPTER_INFO_VERSION, sizeof(uint32_t)}, {UR_ADAPTER_INFO_REFERENCE_COUNT, sizeof(uint32_t)}, }; INSTANTIATE_TEST_SUITE_P( urAdapterGetInfo, urAdapterGetInfoTest, - ::testing::Values(UR_ADAPTER_INFO_BACKEND, UR_ADAPTER_INFO_REFERENCE_COUNT), + ::testing::Values(UR_ADAPTER_INFO_BACKEND, UR_ADAPTER_INFO_VERSION, + UR_ADAPTER_INFO_REFERENCE_COUNT), [](const ::testing::TestParamInfo &info) { std::stringstream ss; ss << info.param; diff --git a/test/conformance/device/urDeviceGetInfo.cpp b/test/conformance/device/urDeviceGetInfo.cpp index e41cff97ed..23a2f7f237 100644 --- a/test/conformance/device/urDeviceGetInfo.cpp +++ b/test/conformance/device/urDeviceGetInfo.cpp @@ -115,7 +115,9 @@ static std::unordered_map device_info_size_map = { {UR_DEVICE_INFO_MAX_REGISTERS_PER_WORK_GROUP, sizeof(uint32_t)}, {UR_DEVICE_INFO_COMPONENT_DEVICES, sizeof(uint32_t)}, {UR_DEVICE_INFO_COMPOSITE_DEVICE, sizeof(ur_device_handle_t)}, - {UR_DEVICE_INFO_USM_POOL_SUPPORT, sizeof(ur_bool_t)}}; + {UR_DEVICE_INFO_USM_POOL_SUPPORT, sizeof(ur_bool_t)}, + {UR_DEVICE_INFO_2D_BLOCK_ARRAY_CAPABILITIES_EXP, + sizeof(ur_exp_device_2d_block_array_capability_flags_t)}}; struct urDeviceGetInfoTest : uur::urAllDevicesTest, ::testing::WithParamInterface { @@ -237,7 +239,8 @@ INSTANTIATE_TEST_SUITE_P( UR_DEVICE_INFO_MAX_REGISTERS_PER_WORK_GROUP, // UR_DEVICE_INFO_VIRTUAL_MEMORY_SUPPORT, // UR_DEVICE_INFO_KERNEL_SET_SPECIALIZATION_CONSTANTS, // - UR_DEVICE_INFO_USM_POOL_SUPPORT // + UR_DEVICE_INFO_USM_POOL_SUPPORT, // + UR_DEVICE_INFO_2D_BLOCK_ARRAY_CAPABILITIES_EXP // ), [](const ::testing::TestParamInfo &info) { std::stringstream ss; diff --git a/test/conformance/device_code/saxpy_usm_local_mem.cpp b/test/conformance/device_code/saxpy_usm_local_mem.cpp index 7ef17e59b5..c2bc3adc5e 100644 --- a/test/conformance/device_code/saxpy_usm_local_mem.cpp +++ b/test/conformance/device_code/saxpy_usm_local_mem.cpp @@ -15,15 +15,22 @@ int main() { uint32_t A = 42; sycl_queue.submit([&](sycl::handler &cgh) { - sycl::local_accessor local_mem(local_size, cgh); + sycl::local_accessor local_mem_A(local_size, cgh); + sycl::local_accessor local_mem_B(local_size * 2, cgh); + cgh.parallel_for( sycl::nd_range<1>{{array_size}, {local_size}}, [=](sycl::nd_item<1> itemId) { auto i = itemId.get_global_linear_id(); auto local_id = itemId.get_local_linear_id(); - local_mem[local_id] = i; - Z[i] = A * X[i] + Y[i] + local_mem[local_id] + - itemId.get_local_range(0); + + local_mem_A[local_id] = i; + local_mem_B[local_id * 2] = -i; + local_mem_B[(local_id * 2) + 1] = itemId.get_local_range(0); + + Z[i] = A * X[i] + Y[i] + local_mem_A[local_id] + + local_mem_B[local_id * 2] + + local_mem_B[(local_id * 2) + 1]; }); }); return 0; diff --git a/test/conformance/enqueue/enqueue_adapter_level_zero.match b/test/conformance/enqueue/enqueue_adapter_level_zero.match index 9394f1b0a0..4155859eaf 100644 --- a/test/conformance/enqueue/enqueue_adapter_level_zero.match +++ b/test/conformance/enqueue/enqueue_adapter_level_zero.match @@ -1,5 +1,7 @@ {{OPT}}urEnqueueEventsWaitTest.Success/* {{OPT}}urEnqueueKernelLaunchTest.InvalidKernelArgs/* +{{OPT}}urEnqueueKernelLaunchIncrementMultiDeviceMultiThreadTest.Success/UseEventsNoQueuePerThread +{{OPT}}urEnqueueKernelLaunchIncrementMultiDeviceMultiThreadTest.Success/NoUseEventsNoQueuePerThread {{OPT}}urEnqueueKernelLaunchKernelWgSizeTest.Success/* {{OPT}}urEnqueueKernelLaunchKernelSubGroupTest.Success/* {{OPT}}urEnqueueMemBufferCopyRectTestWithParam.Success/*__copy_2d_3d @@ -18,7 +20,7 @@ {{OPT}}urEnqueueMemImageReadTest.InvalidOrigin1D/* {{OPT}}urEnqueueMemImageReadTest.InvalidOrigin2D/* {{OPT}}urEnqueueMemImageReadTest.InvalidOrigin3D/* -{{OPT}}urEnqueueEventsWaitMultiDeviceMTTest/* +{{OPT}}urEnqueueEventsWaitMultiDeviceMTTest* {{OPT}}urEnqueueEventsWaitWithBarrierOrderingTest.SuccessEventDependencies/* {{OPT}}urEnqueueEventsWaitWithBarrierOrderingTest.SuccessEventDependenciesBarrierOnly/* {{OPT}}urEnqueueEventsWaitWithBarrierOrderingTest.SuccessEventDependenciesLaunchOnly/* diff --git a/test/conformance/enqueue/urEnqueueEventsWaitWithBarrier.cpp b/test/conformance/enqueue/urEnqueueEventsWaitWithBarrier.cpp index 8cf6401211..ca464b48dd 100644 --- a/test/conformance/enqueue/urEnqueueEventsWaitWithBarrier.cpp +++ b/test/conformance/enqueue/urEnqueueEventsWaitWithBarrier.cpp @@ -4,9 +4,30 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception #include -struct urEnqueueEventsWaitWithBarrierTest : uur::urMultiQueueTest { +enum class BarrierType { + Normal, + ExtLowPower, +}; + +std::ostream &operator<<(std::ostream &os, BarrierType barrierType) { + switch (barrierType) { + case BarrierType::Normal: + os << "Normal"; + break; + case BarrierType::ExtLowPower: + os << "ExtLowPower"; + break; + default: + os << "Unknown"; + break; + } + return os; +} + +struct urEnqueueEventsWaitWithBarrierTest + : uur::urMultiQueueTestWithParam { void SetUp() override { - UUR_RETURN_ON_FATAL_FAILURE(urMultiQueueTest::SetUp()); + UUR_RETURN_ON_FATAL_FAILURE(urMultiQueueTestWithParam::SetUp()); ASSERT_SUCCESS(urMemBufferCreate(context, UR_MEM_FLAG_WRITE_ONLY, size, nullptr, &src_buffer)); ASSERT_SUCCESS(urMemBufferCreate(context, UR_MEM_FLAG_READ_ONLY, size, @@ -24,7 +45,23 @@ struct urEnqueueEventsWaitWithBarrierTest : uur::urMultiQueueTest { if (dst_buffer) { EXPECT_SUCCESS(urMemRelease(dst_buffer)); } - urMultiQueueTest::TearDown(); + urMultiQueueTestWithParam::TearDown(); + } + + ur_result_t EnqueueBarrier(ur_queue_handle_t queue, uint32_t num_events, + const ur_event_handle_t *event_list, + ur_event_handle_t *wait_event) { + BarrierType barrier = getParam(); + if (barrier == BarrierType::ExtLowPower) { + struct ur_exp_enqueue_ext_properties_t props = { + UR_STRUCTURE_TYPE_EXP_ENQUEUE_EXT_PROPERTIES, nullptr, + UR_EXP_ENQUEUE_EXT_FLAG_LOW_POWER_EVENTS}; + return urEnqueueEventsWaitWithBarrierExt(queue, &props, num_events, + event_list, wait_event); + } + + return urEnqueueEventsWaitWithBarrier(queue, num_events, event_list, + wait_event); } const size_t count = 1024; @@ -34,7 +71,10 @@ struct urEnqueueEventsWaitWithBarrierTest : uur::urMultiQueueTest { std::vector input; }; -UUR_INSTANTIATE_DEVICE_TEST_SUITE_P(urEnqueueEventsWaitWithBarrierTest); +UUR_TEST_SUITE_P(urEnqueueEventsWaitWithBarrierTest, + ::testing::Values(BarrierType::Normal, + BarrierType::ExtLowPower), + uur::deviceTestWithParamPrinter); struct urEnqueueEventsWaitWithBarrierOrderingTest : uur::urProgramTest { void SetUp() override { @@ -67,8 +107,7 @@ TEST_P(urEnqueueEventsWaitWithBarrierTest, Success) { ur_event_handle_t waitEvent = nullptr; ASSERT_SUCCESS(urEnqueueMemBufferCopy(queue1, src_buffer, dst_buffer, 0, 0, size, 0, nullptr, &event1)); - EXPECT_SUCCESS( - urEnqueueEventsWaitWithBarrier(queue2, 1, &event1, &waitEvent)); + EXPECT_SUCCESS(EnqueueBarrier(queue2, 1, &event1, &waitEvent)); EXPECT_SUCCESS(urQueueFlush(queue2)); EXPECT_SUCCESS(urQueueFlush(queue1)); EXPECT_SUCCESS(urEventWait(1, &waitEvent)); @@ -86,8 +125,7 @@ TEST_P(urEnqueueEventsWaitWithBarrierTest, Success) { input.data(), 0, nullptr, nullptr)); EXPECT_SUCCESS(urEnqueueMemBufferCopy(queue2, src_buffer, dst_buffer, 0, 0, size, 0, nullptr, &event2)); - EXPECT_SUCCESS( - urEnqueueEventsWaitWithBarrier(queue1, 1, &event2, &waitEvent)); + EXPECT_SUCCESS(EnqueueBarrier(queue1, 1, &event2, &waitEvent)); EXPECT_SUCCESS(urQueueFlush(queue2)); EXPECT_SUCCESS(urQueueFlush(queue1)); EXPECT_SUCCESS(urEventWait(1, &waitEvent)); @@ -99,27 +137,23 @@ TEST_P(urEnqueueEventsWaitWithBarrierTest, Success) { } TEST_P(urEnqueueEventsWaitWithBarrierTest, InvalidNullHandleQueue) { - ASSERT_EQ_RESULT( - UR_RESULT_ERROR_INVALID_NULL_HANDLE, - urEnqueueEventsWaitWithBarrier(nullptr, 0, nullptr, nullptr)); + ASSERT_EQ_RESULT(UR_RESULT_ERROR_INVALID_NULL_HANDLE, + EnqueueBarrier(nullptr, 0, nullptr, nullptr)); } TEST_P(urEnqueueEventsWaitWithBarrierTest, InvalidNullPtrEventWaitList) { - ASSERT_EQ_RESULT( - urEnqueueEventsWaitWithBarrier(queue1, 1, nullptr, nullptr), - UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST); + ASSERT_EQ_RESULT(EnqueueBarrier(queue1, 1, nullptr, nullptr), + UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST); ur_event_handle_t validEvent; ASSERT_SUCCESS(urEnqueueEventsWait(queue1, 0, nullptr, &validEvent)); - ASSERT_EQ_RESULT( - urEnqueueEventsWaitWithBarrier(queue1, 0, &validEvent, nullptr), - UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST); + ASSERT_EQ_RESULT(EnqueueBarrier(queue1, 0, &validEvent, nullptr), + UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST); ur_event_handle_t inv_evt = nullptr; - ASSERT_EQ_RESULT( - urEnqueueEventsWaitWithBarrier(queue1, 1, &inv_evt, nullptr), - UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST); + ASSERT_EQ_RESULT(EnqueueBarrier(queue1, 1, &inv_evt, nullptr), + UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST); ASSERT_SUCCESS(urEventRelease(validEvent)); } diff --git a/test/conformance/exp_command_buffer/exp_command_buffer_adapter_cuda.match b/test/conformance/exp_command_buffer/exp_command_buffer_adapter_cuda.match deleted file mode 100644 index 40182b9125..0000000000 --- a/test/conformance/exp_command_buffer/exp_command_buffer_adapter_cuda.match +++ /dev/null @@ -1,11 +0,0 @@ -# Note: This file is only for use with cts_exe.py -# These cause SIGILL when built with -fsanitize=cfi on Nvidia -{{OPT}}urCommandBufferKernelHandleUpdateTest.Success/* -{{OPT}}urCommandBufferKernelHandleUpdateTest.UpdateAgain/* -{{OPT}}urCommandBufferKernelHandleUpdateTest.RestoreOriginalKernel/* -{{OPT}}urCommandBufferKernelHandleUpdateTest.KernelAlternativeNotRegistered/* -{{OPT}}urCommandBufferKernelHandleUpdateTest.RegisterInvalidKernelAlternative/* -{{OPT}}urCommandBufferValidUpdateParametersTest.UpdateDimensionsWithoutUpdatingKernel/* -{{OPT}}urCommandBufferValidUpdateParametersTest.UpdateOnlyLocalWorkSize/* -{{OPT}}urCommandBufferValidUpdateParametersTest.SuccessNullptrHandle/* -{{OPT}}KernelCommandEventSyncUpdateTest.TwoWaitEvents/* diff --git a/test/conformance/exp_command_buffer/exp_command_buffer_adapter_hip.match b/test/conformance/exp_command_buffer/exp_command_buffer_adapter_hip.match deleted file mode 100644 index da8d6dee07..0000000000 --- a/test/conformance/exp_command_buffer/exp_command_buffer_adapter_hip.match +++ /dev/null @@ -1,10 +0,0 @@ -# Note: This file is only for use with cts_exe.py -# These cause SIGILL when built with -fsanitize=cfi on AMD -{{OPT}}urCommandBufferKernelHandleUpdateTest.Success/* -{{OPT}}urCommandBufferKernelHandleUpdateTest.UpdateAgain/* -{{OPT}}urCommandBufferKernelHandleUpdateTest.RestoreOriginalKernel/* -{{OPT}}urCommandBufferKernelHandleUpdateTest.KernelAlternativeNotRegistered/* -{{OPT}}urCommandBufferKernelHandleUpdateTest.RegisterInvalidKernelAlternative/* -{{OPT}}urCommandBufferValidUpdateParametersTest.UpdateDimensionsWithoutUpdatingKernel/* -{{OPT}}urCommandBufferValidUpdateParametersTest.UpdateOnlyLocalWorkSize/* -{{OPT}}urCommandBufferValidUpdateParametersTest.SuccessNullptrHandle/* diff --git a/test/conformance/exp_command_buffer/exp_command_buffer_adapter_native_cpu.match b/test/conformance/exp_command_buffer/exp_command_buffer_adapter_native_cpu.match index c6fe7ad962..3588eaea82 100644 --- a/test/conformance/exp_command_buffer/exp_command_buffer_adapter_native_cpu.match +++ b/test/conformance/exp_command_buffer/exp_command_buffer_adapter_native_cpu.match @@ -37,7 +37,11 @@ {{OPT}}KernelCommandEventSyncUpdateTest.TwoWaitEvents/* {{OPT}}KernelCommandEventSyncUpdateTest.InvalidWaitUpdate/* {{OPT}}KernelCommandEventSyncUpdateTest.InvalidSignalUpdate/* -{{OPT}}LocalMemoryUpdateTest.UpdateParameters/* -{{OPT}}LocalMemoryUpdateTest.UpdateParametersAndLocalSize/* +{{OPT}}LocalMemoryUpdateTest.UpdateParametersSameLocalSize/* +{{OPT}}LocalMemoryUpdateTest.UpdateLocalOnly/* +{{OPT}}LocalMemoryUpdateTest.UpdateParametersEmptyLocalSize/* +{{OPT}}LocalMemoryUpdateTest.UpdateParametersSmallerLocalSize/* +{{OPT}}LocalMemoryUpdateTest.UpdateParametersLargerLocalSize/* +{{OPT}}LocalMemoryUpdateTest.UpdateParametersPartialLocalSize/* {{OPT}}LocalMemoryMultiUpdateTest.UpdateParameters/* {{OPT}}LocalMemoryMultiUpdateTest.UpdateWithoutBlocking/* diff --git a/test/conformance/exp_command_buffer/update/local_memory_update.cpp b/test/conformance/exp_command_buffer/update/local_memory_update.cpp index c295556fdb..c467c9783a 100644 --- a/test/conformance/exp_command_buffer/update/local_memory_update.cpp +++ b/test/conformance/exp_command_buffer/update/local_memory_update.cpp @@ -8,8 +8,7 @@ #include // Test that updating a command-buffer with a single kernel command -// taking a local memory argument works correctly. - +// taking local memory arguments works correctly. struct LocalMemoryUpdateTestBase : uur::command_buffer::urUpdatableCommandBufferExpExecutionTest { virtual void SetUp() override { @@ -17,7 +16,13 @@ struct LocalMemoryUpdateTestBase UUR_RETURN_ON_FATAL_FAILURE( urUpdatableCommandBufferExpExecutionTest::SetUp()); - // HIP has extra args for local memory so we define an offset for arg indices here for updating + if (backend == UR_PLATFORM_BACKEND_LEVEL_ZERO) { + GTEST_SKIP() + << "Local memory argument update not supported on Level Zero."; + } + + // HIP has extra args for local memory so we define an offset for arg + // indices here for updating hip_arg_offset = backend == UR_PLATFORM_BACKEND_HIP ? 3 : 0; ur_device_usm_access_capability_flags_t shared_usm_flags; ASSERT_SUCCESS( @@ -38,33 +43,48 @@ struct LocalMemoryUpdateTestBase std::memcpy(shared_ptr, pattern.data(), allocation_size); } size_t current_index = 0; - // Index 0 is local_mem arg + // Index 0 is local_mem_a arg ASSERT_SUCCESS(urKernelSetArgLocal(kernel, current_index++, - local_mem_size, nullptr)); + local_mem_a_size, nullptr)); + + // Hip has extra args for local mem at index 1-3 + if (backend == UR_PLATFORM_BACKEND_HIP) { + ASSERT_SUCCESS(urKernelSetArgValue(kernel, current_index++, + sizeof(hip_local_offset), + nullptr, &hip_local_offset)); + ASSERT_SUCCESS(urKernelSetArgValue(kernel, current_index++, + sizeof(hip_local_offset), + nullptr, &hip_local_offset)); + ASSERT_SUCCESS(urKernelSetArgValue(kernel, current_index++, + sizeof(hip_local_offset), + nullptr, &hip_local_offset)); + } - //Hip has extr args for local mem at index 1-3 + // Index 1 is local_mem_b arg + ASSERT_SUCCESS(urKernelSetArgLocal(kernel, current_index++, + local_mem_b_size, nullptr)); if (backend == UR_PLATFORM_BACKEND_HIP) { ASSERT_SUCCESS(urKernelSetArgValue(kernel, current_index++, - sizeof(local_size), nullptr, - &local_size)); + sizeof(hip_local_offset), + nullptr, &hip_local_offset)); ASSERT_SUCCESS(urKernelSetArgValue(kernel, current_index++, - sizeof(local_size), nullptr, - &local_size)); + sizeof(hip_local_offset), + nullptr, &hip_local_offset)); ASSERT_SUCCESS(urKernelSetArgValue(kernel, current_index++, - sizeof(local_size), nullptr, - &local_size)); + sizeof(hip_local_offset), + nullptr, &hip_local_offset)); } - // Index 1 is output + // Index 2 is output ASSERT_SUCCESS(urKernelSetArgPointer(kernel, current_index++, nullptr, shared_ptrs[0])); - // Index 2 is A + // Index 3 is A ASSERT_SUCCESS(urKernelSetArgValue(kernel, current_index++, sizeof(A), nullptr, &A)); - // Index 3 is X + // Index 4 is X ASSERT_SUCCESS(urKernelSetArgPointer(kernel, current_index++, nullptr, shared_ptrs[1])); - // Index 4 is Y + // Index 5 is Y ASSERT_SUCCESS(urKernelSetArgPointer(kernel, current_index++, nullptr, shared_ptrs[2])); } @@ -72,7 +92,7 @@ struct LocalMemoryUpdateTestBase void Validate(uint32_t *output, uint32_t *X, uint32_t *Y, uint32_t A, size_t length, size_t local_size) { for (size_t i = 0; i < length; i++) { - uint32_t result = A * X[i] + Y[i] + i + local_size; + uint32_t result = A * X[i] + Y[i] + local_size; ASSERT_EQ(result, output[i]); } } @@ -89,7 +109,8 @@ struct LocalMemoryUpdateTestBase } static constexpr size_t local_size = 4; - static constexpr size_t local_mem_size = local_size * sizeof(uint32_t); + static constexpr size_t local_mem_a_size = local_size * sizeof(uint32_t); + static constexpr size_t local_mem_b_size = local_mem_a_size * 2; static constexpr size_t global_size = 16; static constexpr size_t global_offset = 0; static constexpr size_t n_dimensions = 1; @@ -98,6 +119,7 @@ struct LocalMemoryUpdateTestBase nullptr}; uint32_t hip_arg_offset = 0; + static constexpr uint64_t hip_local_offset = 0; }; struct LocalMemoryUpdateTest : LocalMemoryUpdateTestBase { @@ -127,7 +149,9 @@ struct LocalMemoryUpdateTest : LocalMemoryUpdateTestBase { UUR_INSTANTIATE_DEVICE_TEST_SUITE_P(LocalMemoryUpdateTest); -TEST_P(LocalMemoryUpdateTest, UpdateParameters) { +// Test updating A,X,Y parameters to new values and local memory parameters +// to original values. +TEST_P(LocalMemoryUpdateTest, UpdateParametersSameLocalSize) { // Run command-buffer prior to update an verify output ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0, nullptr, nullptr)); @@ -139,63 +163,218 @@ TEST_P(LocalMemoryUpdateTest, UpdateParameters) { Validate(output, X, Y, A, global_size, local_size); // Update inputs - ur_exp_command_buffer_update_pointer_arg_desc_t new_input_descs[2]; - ur_exp_command_buffer_update_value_arg_desc_t new_value_descs[2]; + std::array + new_input_descs; + std::array + new_value_descs; - // New local_mem at index 0 + // New local_mem_a at index 0 new_value_descs[0] = { UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype nullptr, // pNext 0, // argIndex - local_mem_size, // argSize + local_mem_a_size, // argSize + nullptr, // pProperties + nullptr, // hArgValue + }; + + // New local_mem_b at index 1 + new_value_descs[1] = { + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype + nullptr, // pNext + 1 + hip_arg_offset, // argIndex + local_mem_b_size, // argSize nullptr, // pProperties nullptr, // hArgValue }; - // New A at index 2 + // New A at index 3 uint32_t new_A = 33; + new_value_descs[2] = { + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype + nullptr, // pNext + 3 + (2 * hip_arg_offset), // argIndex + sizeof(new_A), // argSize + nullptr, // pProperties + &new_A, // hArgValue + }; + + // New X at index 4 + new_input_descs[0] = { + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_POINTER_ARG_DESC, // stype + nullptr, // pNext + 4 + (2 * hip_arg_offset), // argIndex + nullptr, // pProperties + &shared_ptrs[3], // pArgValue + }; + + // New Y at index 5 + new_input_descs[1] = { + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_POINTER_ARG_DESC, // stype + nullptr, // pNext + 5 + (2 * hip_arg_offset), // argIndex + nullptr, // pProperties + &shared_ptrs[4], // pArgValue + }; + + // Update kernel inputs + ur_exp_command_buffer_update_kernel_launch_desc_t update_desc = { + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_KERNEL_LAUNCH_DESC, // stype + nullptr, // pNext + kernel, // hNewKernel + 0, // numNewMemObjArgs + new_input_descs.size(), // numNewPointerArgs + new_value_descs.size(), // numNewValueArgs + n_dimensions, // newWorkDim + nullptr, // pNewMemObjArgList + new_input_descs.data(), // pNewPointerArgList + new_value_descs.data(), // pNewValueArgList + nullptr, // pNewGlobalWorkOffset + nullptr, // pNewGlobalWorkSize + nullptr, // pNewLocalWorkSize + }; + + // Update kernel and enqueue command-buffer again + ASSERT_SUCCESS( + urCommandBufferUpdateKernelLaunchExp(command_handle, &update_desc)); + ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0, + nullptr, nullptr)); + ASSERT_SUCCESS(urQueueFinish(queue)); + + // Verify that update occurred correctly + uint32_t *new_output = (uint32_t *)shared_ptrs[0]; + uint32_t *new_X = (uint32_t *)shared_ptrs[3]; + uint32_t *new_Y = (uint32_t *)shared_ptrs[4]; + Validate(new_output, new_X, new_Y, new_A, global_size, local_size); +} + +// Test only passing local memory parameters to update with the original values. +TEST_P(LocalMemoryUpdateTest, UpdateLocalOnly) { + // Run command-buffer prior to update an verify output + ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0, + nullptr, nullptr)); + ASSERT_SUCCESS(urQueueFinish(queue)); + + uint32_t *output = (uint32_t *)shared_ptrs[0]; + uint32_t *X = (uint32_t *)shared_ptrs[1]; + uint32_t *Y = (uint32_t *)shared_ptrs[2]; + Validate(output, X, Y, A, global_size, local_size); + + // Update inputs + std::array + new_value_descs; + + // New local_mem_a at index 0 + new_value_descs[0] = { + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype + nullptr, // pNext + 0, // argIndex + local_mem_a_size, // argSize + nullptr, // pProperties + nullptr, // hArgValue + }; + + // New local_mem_b at index 1 new_value_descs[1] = { UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype nullptr, // pNext - 2 + hip_arg_offset, // argIndex + 1 + hip_arg_offset, // argIndex + local_mem_b_size, // argSize + nullptr, // pProperties + nullptr, // hArgValue + }; + + // Update kernel inputs + ur_exp_command_buffer_update_kernel_launch_desc_t update_desc = { + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_KERNEL_LAUNCH_DESC, // stype + nullptr, // pNext + kernel, // hNewKernel + 0, // numNewMemObjArgs + 0, // numNewPointerArgs + new_value_descs.size(), // numNewValueArgs + n_dimensions, // newWorkDim + nullptr, // pNewMemObjArgList + nullptr, // pNewPointerArgList + new_value_descs.data(), // pNewValueArgList + nullptr, // pNewGlobalWorkOffset + nullptr, // pNewGlobalWorkSize + nullptr, // pNewLocalWorkSize + }; + + // Update kernel and enqueue command-buffer again + ASSERT_SUCCESS( + urCommandBufferUpdateKernelLaunchExp(command_handle, &update_desc)); + ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0, + nullptr, nullptr)); + ASSERT_SUCCESS(urQueueFinish(queue)); + + // Verify that update occurred correctly + Validate(output, X, Y, A, global_size, local_size); +} + +// Test updating A,X,Y parameters to new values and omitting local memory parameters +// from the update. +TEST_P(LocalMemoryUpdateTest, UpdateParametersEmptyLocalSize) { + // Run command-buffer prior to update and verify output + ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0, + nullptr, nullptr)); + ASSERT_SUCCESS(urQueueFinish(queue)); + + uint32_t *output = (uint32_t *)shared_ptrs[0]; + uint32_t *X = (uint32_t *)shared_ptrs[1]; + uint32_t *Y = (uint32_t *)shared_ptrs[2]; + Validate(output, X, Y, A, global_size, local_size); + + // Update inputs + std::array + new_input_descs; + std::array + new_value_descs; + + // New A at index 3 + uint32_t new_A = 33; + new_value_descs[0] = { + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype + nullptr, // pNext + 3 + (2 * hip_arg_offset), // argIndex sizeof(new_A), // argSize nullptr, // pProperties &new_A, // hArgValue }; - // New X at index 3 + // New X at index 4 new_input_descs[0] = { UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_POINTER_ARG_DESC, // stype nullptr, // pNext - 3 + hip_arg_offset, // argIndex - nullptr, // pProperties - &shared_ptrs[3], // pArgValue + 4 + (2 * hip_arg_offset), // argIndex + nullptr, // pProperties + &shared_ptrs[3], // pArgValue }; - // New Y at index 4 + // New Y at index 5 new_input_descs[1] = { UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_POINTER_ARG_DESC, // stype nullptr, // pNext - 4 + hip_arg_offset, // argIndex - nullptr, // pProperties - &shared_ptrs[4], // pArgValue + 5 + (2 * hip_arg_offset), // argIndex + nullptr, // pProperties + &shared_ptrs[4], // pArgValue }; // Update kernel inputs ur_exp_command_buffer_update_kernel_launch_desc_t update_desc = { UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_KERNEL_LAUNCH_DESC, // stype nullptr, // pNext - kernel, // hNewKernel - 0, // numNewMemObjArgs - 2, // numNewPointerArgs - 2, // numNewValueArgs - n_dimensions, // newWorkDim - nullptr, // pNewMemObjArgList - new_input_descs, // pNewPointerArgList - new_value_descs, // pNewValueArgList - nullptr, // pNewGlobalWorkOffset - nullptr, // pNewGlobalWorkSize - nullptr, // pNewLocalWorkSize + kernel, // hNewKernel + 0, // numNewMemObjArgs + new_input_descs.size(), // numNewPointerArgs + new_value_descs.size(), // numNewValueArgs + n_dimensions, // newWorkDim + nullptr, // pNewMemObjArgList + new_input_descs.data(), // pNewPointerArgList + new_value_descs.data(), // pNewValueArgList + nullptr, // pNewGlobalWorkOffset + nullptr, // pNewGlobalWorkSize + nullptr, // pNewLocalWorkSize }; // Update kernel and enqueue command-buffer again @@ -212,7 +391,9 @@ TEST_P(LocalMemoryUpdateTest, UpdateParameters) { Validate(new_output, new_X, new_Y, new_A, global_size, local_size); } -TEST_P(LocalMemoryUpdateTest, UpdateParametersAndLocalSize) { +// Test updating A,X,Y parameters to new values and local memory parameters +// to new smaller values. +TEST_P(LocalMemoryUpdateTest, UpdateParametersSmallerLocalSize) { // Run command-buffer prior to update an verify output ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0, nullptr, nullptr)); @@ -228,14 +409,14 @@ TEST_P(LocalMemoryUpdateTest, UpdateParametersAndLocalSize) { std::vector new_value_descs{}; - size_t new_local_size = local_size * 2; - size_t new_local_mem_size = new_local_size * sizeof(uint32_t); - // New local_mem at index 0 + size_t new_local_size = 2; + size_t new_local_mem_a_size = new_local_size * sizeof(uint32_t); + // New local_mem_a at index 0 new_value_descs.push_back({ UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype nullptr, // pNext 0, // argIndex - new_local_mem_size, // argSize + new_local_mem_a_size, // argSize nullptr, // pProperties nullptr, // hArgValue }); @@ -244,56 +425,94 @@ TEST_P(LocalMemoryUpdateTest, UpdateParametersAndLocalSize) { new_value_descs.push_back({ UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype nullptr, // pNext - 1, // argIndex - sizeof(new_local_size), // argSize - nullptr, // pProperties - &new_local_size, // hArgValue + 1, // argIndex + sizeof(hip_local_offset), // argSize + nullptr, // pProperties + &hip_local_offset, // hArgValue }); new_value_descs.push_back({ UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype nullptr, // pNext - 2, // argIndex - sizeof(new_local_size), // argSize - nullptr, // pProperties - &new_local_size, // hArgValue + 2, // argIndex + sizeof(hip_local_offset), // argSize + nullptr, // pProperties + &hip_local_offset, // hArgValue }); new_value_descs.push_back({ UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype nullptr, // pNext - 3, // argIndex - sizeof(new_local_size), // argSize - nullptr, // pProperties - &new_local_size, // hArgValue + 3, // argIndex + sizeof(hip_local_offset), // argSize + nullptr, // pProperties + &hip_local_offset, // hArgValue }); } - // New A at index 2 + // New local_mem_b at index 1 + size_t new_local_mem_b_size = new_local_size * sizeof(uint32_t) * 2; + new_value_descs.push_back({ + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype + nullptr, // pNext + 1 + hip_arg_offset, // argIndex + new_local_mem_b_size, // argSize + nullptr, // pProperties + nullptr, // hArgValue + }); + + if (backend == UR_PLATFORM_BACKEND_HIP) { + new_value_descs.push_back({ + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype + nullptr, // pNext + 5, // argIndex + sizeof(hip_local_offset), // argSize + nullptr, // pProperties + &hip_local_offset, // hArgValue + }); + new_value_descs.push_back({ + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype + nullptr, // pNext + 6, // argIndex + sizeof(hip_local_offset), // argSize + nullptr, // pProperties + &hip_local_offset, // hArgValue + }); + new_value_descs.push_back({ + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype + nullptr, // pNext + 7, // argIndex + sizeof(hip_local_offset), // argSize + nullptr, // pProperties + &hip_local_offset, // hArgValue + }); + } + + // New A at index 3 uint32_t new_A = 33; new_value_descs.push_back({ UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype nullptr, // pNext - 2 + hip_arg_offset, // argIndex + 3 + (2 * hip_arg_offset), // argIndex sizeof(new_A), // argSize nullptr, // pProperties &new_A, // hArgValue }); - // New X at index 3 + // New X at index 4 new_input_descs[0] = { UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_POINTER_ARG_DESC, // stype nullptr, // pNext - 3 + hip_arg_offset, // argIndex - nullptr, // pProperties - &shared_ptrs[3], // pArgValue + 4 + (2 * hip_arg_offset), // argIndex + nullptr, // pProperties + &shared_ptrs[3], // pArgValue }; - // New Y at index 4 + // New Y at index 5 new_input_descs[1] = { UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_POINTER_ARG_DESC, // stype nullptr, // pNext - 4 + hip_arg_offset, // argIndex - nullptr, // pProperties - &shared_ptrs[4], // pArgValue + 5 + (2 * hip_arg_offset), // argIndex + nullptr, // pProperties + &shared_ptrs[4], // pArgValue }; // Update kernel inputs @@ -327,16 +546,345 @@ TEST_P(LocalMemoryUpdateTest, UpdateParametersAndLocalSize) { Validate(new_output, new_X, new_Y, new_A, global_size, new_local_size); } +// Test updating A,X,Y parameters to new values and local memory parameters +// to new larger values. +TEST_P(LocalMemoryUpdateTest, UpdateParametersLargerLocalSize) { + // Run command-buffer prior to update and verify output + ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0, + nullptr, nullptr)); + ASSERT_SUCCESS(urQueueFinish(queue)); + + uint32_t *output = (uint32_t *)shared_ptrs[0]; + uint32_t *X = (uint32_t *)shared_ptrs[1]; + uint32_t *Y = (uint32_t *)shared_ptrs[2]; + Validate(output, X, Y, A, global_size, local_size); + + // Update inputs + ur_exp_command_buffer_update_pointer_arg_desc_t new_input_descs[2]; + std::vector + new_value_descs{}; + + size_t new_local_size = local_size * 4; + size_t new_local_mem_a_size = new_local_size * sizeof(uint32_t); + // New local_mem_a at index 0 + new_value_descs.push_back({ + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype + nullptr, // pNext + 0, // argIndex + new_local_mem_a_size, // argSize + nullptr, // pProperties + nullptr, // hArgValue + }); + + if (backend == UR_PLATFORM_BACKEND_HIP) { + new_value_descs.push_back({ + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype + nullptr, // pNext + 1, // argIndex + sizeof(hip_local_offset), // argSize + nullptr, // pProperties + &hip_local_offset, // hArgValue + }); + new_value_descs.push_back({ + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype + nullptr, // pNext + 2, // argIndex + sizeof(hip_local_offset), // argSize + nullptr, // pProperties + &hip_local_offset, // hArgValue + }); + new_value_descs.push_back({ + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype + nullptr, // pNext + 3, // argIndex + sizeof(hip_local_offset), // argSize + nullptr, // pProperties + &hip_local_offset, // hArgValue + }); + } + + // New local_mem_b at index 1 + size_t new_local_mem_b_size = new_local_size * sizeof(uint32_t) * 2; + new_value_descs.push_back({ + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype + nullptr, // pNext + 1 + hip_arg_offset, // argIndex + new_local_mem_b_size, // argSize + nullptr, // pProperties + nullptr, // hArgValue + }); + + if (backend == UR_PLATFORM_BACKEND_HIP) { + new_value_descs.push_back({ + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype + nullptr, // pNext + 5, // argIndex + sizeof(hip_local_offset), // argSize + nullptr, // pProperties + &hip_local_offset, // hArgValue + }); + new_value_descs.push_back({ + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype + nullptr, // pNext + 6, // argIndex + sizeof(hip_local_offset), // argSize + nullptr, // pProperties + &hip_local_offset, // hArgValue + }); + new_value_descs.push_back({ + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype + nullptr, // pNext + 7, // argIndex + sizeof(hip_local_offset), // argSize + nullptr, // pProperties + &hip_local_offset, // hArgValue + }); + } + + // New A at index 3 + uint32_t new_A = 33; + new_value_descs.push_back({ + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype + nullptr, // pNext + 3 + (2 * hip_arg_offset), // argIndex + sizeof(new_A), // argSize + nullptr, // pProperties + &new_A, // hArgValue + }); + + // New X at index 4 + new_input_descs[0] = { + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_POINTER_ARG_DESC, // stype + nullptr, // pNext + 4 + (2 * hip_arg_offset), // argIndex + nullptr, // pProperties + &shared_ptrs[3], // pArgValue + }; + + // New Y at index 5 + new_input_descs[1] = { + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_POINTER_ARG_DESC, // stype + nullptr, // pNext + 5 + (2 * hip_arg_offset), // argIndex + nullptr, // pProperties + &shared_ptrs[4], // pArgValue + }; + + // Update kernel inputs + ur_exp_command_buffer_update_kernel_launch_desc_t update_desc = { + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_KERNEL_LAUNCH_DESC, // stype + nullptr, // pNext + kernel, // hNewKernel + 0, // numNewMemObjArgs + 2, // numNewPointerArgs + static_cast(new_value_descs.size()), // numNewValueArgs + n_dimensions, // newWorkDim + nullptr, // pNewMemObjArgList + new_input_descs, // pNewPointerArgList + new_value_descs.data(), // pNewValueArgList + nullptr, // pNewGlobalWorkOffset + nullptr, // pNewGlobalWorkSize + &new_local_size, // pNewLocalWorkSize + }; + + // Update kernel and enqueue command-buffer again + ASSERT_SUCCESS( + urCommandBufferUpdateKernelLaunchExp(command_handle, &update_desc)); + ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0, + nullptr, nullptr)); + ASSERT_SUCCESS(urQueueFinish(queue)); + + // Verify that update occurred correctly + uint32_t *new_output = (uint32_t *)shared_ptrs[0]; + uint32_t *new_X = (uint32_t *)shared_ptrs[3]; + uint32_t *new_Y = (uint32_t *)shared_ptrs[4]; + Validate(new_output, new_X, new_Y, new_A, global_size, new_local_size); +} + +// Test updating A,X,Y parameters to new values and only one of the local memory +// parameters, which is set to a new value. Then a separate update call for +// the other local memory argument. +TEST_P(LocalMemoryUpdateTest, UpdateParametersPartialLocalSize) { + // Run command-buffer prior to update and verify output + ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0, + nullptr, nullptr)); + ASSERT_SUCCESS(urQueueFinish(queue)); + + uint32_t *output = (uint32_t *)shared_ptrs[0]; + uint32_t *X = (uint32_t *)shared_ptrs[1]; + uint32_t *Y = (uint32_t *)shared_ptrs[2]; + Validate(output, X, Y, A, global_size, local_size); + + // Update inputs + ur_exp_command_buffer_update_pointer_arg_desc_t new_input_descs[2]; + std::vector + new_value_descs{}; + + size_t new_local_size = local_size * 4; + size_t new_local_mem_a_size = new_local_size * sizeof(uint32_t); + // New local_mem_a at index 0 + new_value_descs.push_back({ + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype + nullptr, // pNext + 0, // argIndex + new_local_mem_a_size, // argSize + nullptr, // pProperties + nullptr, // hArgValue + }); + + if (backend == UR_PLATFORM_BACKEND_HIP) { + new_value_descs.push_back({ + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype + nullptr, // pNext + 1, // argIndex + sizeof(hip_local_offset), // argSize + nullptr, // pProperties + &hip_local_offset, // hArgValue + }); + new_value_descs.push_back({ + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype + nullptr, // pNext + 2, // argIndex + sizeof(hip_local_offset), // argSize + nullptr, // pProperties + &hip_local_offset, // hArgValue + }); + new_value_descs.push_back({ + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype + nullptr, // pNext + 3, // argIndex + sizeof(hip_local_offset), // argSize + nullptr, // pProperties + &hip_local_offset, // hArgValue + }); + } + + // New A at index 3 + uint32_t new_A = 33; + new_value_descs.push_back({ + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype + nullptr, // pNext + 3 + (2 * hip_arg_offset), // argIndex + sizeof(new_A), // argSize + nullptr, // pProperties + &new_A, // hArgValue + }); + + // New X at index 4 + new_input_descs[0] = { + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_POINTER_ARG_DESC, // stype + nullptr, // pNext + 4 + (2 * hip_arg_offset), // argIndex + nullptr, // pProperties + &shared_ptrs[3], // pArgValue + }; + + // New Y at index 5 + new_input_descs[1] = { + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_POINTER_ARG_DESC, // stype + nullptr, // pNext + 5 + (2 * hip_arg_offset), // argIndex + nullptr, // pProperties + &shared_ptrs[4], // pArgValue + }; + + // Update kernel inputs + ur_exp_command_buffer_update_kernel_launch_desc_t update_desc = { + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_KERNEL_LAUNCH_DESC, // stype + nullptr, // pNext + kernel, // hNewKernel + 0, // numNewMemObjArgs + 2, // numNewPointerArgs + static_cast(new_value_descs.size()), // numNewValueArgs + n_dimensions, // newWorkDim + nullptr, // pNewMemObjArgList + new_input_descs, // pNewPointerArgList + new_value_descs.data(), // pNewValueArgList + nullptr, // pNewGlobalWorkOffset + nullptr, // pNewGlobalWorkSize + &new_local_size, // pNewLocalWorkSize + }; + + // Update kernel and enqueue command-buffer again + ASSERT_SUCCESS( + urCommandBufferUpdateKernelLaunchExp(command_handle, &update_desc)); + + std::vector + second_update_value_args{}; + + size_t new_local_mem_b_size = new_local_size * sizeof(uint32_t) * 2; + // New local_mem_b at index 1 + second_update_value_args.push_back({ + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype + nullptr, // pNext + 1 + hip_arg_offset, // argIndex + new_local_mem_b_size, // argSize + nullptr, // pProperties + nullptr, // hArgValue + }); + + if (backend == UR_PLATFORM_BACKEND_HIP) { + second_update_value_args.push_back({ + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype + nullptr, // pNext + 5, // argIndex + sizeof(hip_local_offset), // argSize + nullptr, // pProperties + &hip_local_offset, // hArgValue + }); + second_update_value_args.push_back({ + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype + nullptr, // pNext + 6, // argIndex + sizeof(hip_local_offset), // argSize + nullptr, // pProperties + &hip_local_offset, // hArgValue + }); + second_update_value_args.push_back({ + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype + nullptr, // pNext + 7, // argIndex + sizeof(hip_local_offset), // argSize + nullptr, // pProperties + &hip_local_offset, // hArgValue + }); + } + + ur_exp_command_buffer_update_kernel_launch_desc_t second_update_desc = { + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_KERNEL_LAUNCH_DESC, // stype + nullptr, // pNext + kernel, // hNewKernel + 0, // numNewMemObjArgs + 0, // numNewPointerArgs + static_cast( + second_update_value_args.size()), // numNewValueArgs + n_dimensions, // newWorkDim + nullptr, // pNewMemObjArgList + nullptr, // pNewPointerArgList + second_update_value_args.data(), // pNewValueArgList + nullptr, // pNewGlobalWorkOffset + nullptr, // pNewGlobalWorkSize + nullptr, // pNewLocalWorkSize + }; + ASSERT_SUCCESS(urCommandBufferUpdateKernelLaunchExp(command_handle, + &second_update_desc)); + + ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0, + nullptr, nullptr)); + ASSERT_SUCCESS(urQueueFinish(queue)); + + // Verify that update occurred correctly + uint32_t *new_output = (uint32_t *)shared_ptrs[0]; + uint32_t *new_X = (uint32_t *)shared_ptrs[3]; + uint32_t *new_Y = (uint32_t *)shared_ptrs[4]; + Validate(new_output, new_X, new_Y, new_A, global_size, new_local_size); +} + struct LocalMemoryMultiUpdateTest : LocalMemoryUpdateTestBase { void SetUp() override { UUR_RETURN_ON_FATAL_FAILURE(LocalMemoryUpdateTestBase::SetUp()); - // Append kernel command to command-buffer and close command-buffer for (unsigned node = 0; node < nodes; node++) { - // We need to set the local memory arg each time because it is - // cleared in the kernel handle after being used. - ASSERT_SUCCESS( - urKernelSetArgLocal(kernel, 0, local_mem_size, nullptr)); ASSERT_SUCCESS(urCommandBufferAppendKernelLaunchExp( updatable_cmd_buf_handle, kernel, n_dimensions, &global_offset, &global_size, &local_size, 0, nullptr, 0, nullptr, 0, nullptr, @@ -363,6 +911,8 @@ struct LocalMemoryMultiUpdateTest : LocalMemoryUpdateTestBase { UUR_INSTANTIATE_DEVICE_TEST_SUITE_P(LocalMemoryMultiUpdateTest); +// Test updating A,X,Y parameters to new values and local memory parameters +// to original values. TEST_P(LocalMemoryMultiUpdateTest, UpdateParameters) { // Run command-buffer prior to update an verify output ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0, @@ -375,63 +925,75 @@ TEST_P(LocalMemoryMultiUpdateTest, UpdateParameters) { Validate(output, X, Y, A, global_size, local_size); // Update inputs - ur_exp_command_buffer_update_pointer_arg_desc_t new_input_descs[2]; - ur_exp_command_buffer_update_value_arg_desc_t new_value_descs[2]; + std::array + new_input_descs; + std::array + new_value_descs; - // New local_mem at index 0 + // New local_mem_a at index 0 new_value_descs[0] = { UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype nullptr, // pNext 0, // argIndex - local_mem_size, // argSize + local_mem_a_size, // argSize nullptr, // pProperties nullptr, // hArgValue }; - // New A at index 2 - uint32_t new_A = 33; + // New local_mem_b at index 1 new_value_descs[1] = { UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype nullptr, // pNext - 2 + hip_arg_offset, // argIndex + 1 + hip_arg_offset, // argIndex + local_mem_b_size, // argSize + nullptr, // pProperties + nullptr, // hArgValue + }; + + // New A at index 3 + uint32_t new_A = 33; + new_value_descs[2] = { + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype + nullptr, // pNext + 3 + (2 * hip_arg_offset), // argIndex sizeof(new_A), // argSize nullptr, // pProperties &new_A, // hArgValue }; - // New X at index 3 + // New X at index 4 new_input_descs[0] = { UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_POINTER_ARG_DESC, // stype nullptr, // pNext - 3 + hip_arg_offset, // argIndex - nullptr, // pProperties - &shared_ptrs[3], // pArgValue + 4 + (2 * hip_arg_offset), // argIndex + nullptr, // pProperties + &shared_ptrs[3], // pArgValue }; - // New Y at index 4 + // New Y at index 5 new_input_descs[1] = { UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_POINTER_ARG_DESC, // stype nullptr, // pNext - 4 + hip_arg_offset, // argIndex - nullptr, // pProperties - &shared_ptrs[4], // pArgValue + 5 + (2 * hip_arg_offset), // argIndex + nullptr, // pProperties + &shared_ptrs[4], // pArgValue }; // Update kernel inputs ur_exp_command_buffer_update_kernel_launch_desc_t update_desc = { UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_KERNEL_LAUNCH_DESC, // stype nullptr, // pNext - kernel, // hNewKernel - 0, // numNewMemObjArgs - 2, // numNewPointerArgs - 2, // numNewValueArgs - n_dimensions, // newWorkDim - nullptr, // pNewMemObjArgList - new_input_descs, // pNewPointerArgList - new_value_descs, // pNewValueArgList - nullptr, // pNewGlobalWorkOffset - nullptr, // pNewGlobalWorkSize - nullptr, // pNewLocalWorkSize + kernel, // hNewKernel + 0, // numNewMemObjArgs + new_input_descs.size(), // numNewPointerArgs + new_value_descs.size(), // numNewValueArgs + n_dimensions, // newWorkDim + nullptr, // pNewMemObjArgList + new_input_descs.data(), // pNewPointerArgList + new_value_descs.data(), // pNewValueArgList + nullptr, // pNewGlobalWorkOffset + nullptr, // pNewGlobalWorkSize + nullptr, // pNewLocalWorkSize }; // Update kernel and enqueue command-buffer again @@ -450,65 +1012,79 @@ TEST_P(LocalMemoryMultiUpdateTest, UpdateParameters) { Validate(new_output, new_X, new_Y, new_A, global_size, local_size); } +// Test updating A,X,Y parameters to new values and local memory parameters +// to original values, but without doing a blocking wait. TEST_P(LocalMemoryMultiUpdateTest, UpdateWithoutBlocking) { // Update inputs - ur_exp_command_buffer_update_pointer_arg_desc_t new_input_descs[2]; - ur_exp_command_buffer_update_value_arg_desc_t new_value_descs[2]; + std::array + new_input_descs; + std::array + new_value_descs; - // New local_mem at index 0 + // New local_mem_a at index 0 new_value_descs[0] = { UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype nullptr, // pNext 0, // argIndex - local_mem_size, // argSize + local_mem_a_size, // argSize nullptr, // pProperties nullptr, // hArgValue }; - // New A at index 2 - uint32_t new_A = 33; + // New local_mem_a at index 1 new_value_descs[1] = { UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype nullptr, // pNext - 2 + hip_arg_offset, // argIndex + 1 + hip_arg_offset, // argIndex + local_mem_b_size, // argSize + nullptr, // pProperties + nullptr, // hArgValue + }; + + // New A at index 3 + uint32_t new_A = 33; + new_value_descs[2] = { + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype + nullptr, // pNext + 3 + (2 * hip_arg_offset), // argIndex sizeof(new_A), // argSize nullptr, // pProperties &new_A, // hArgValue }; - // New X at index 3 + // New X at index 4 new_input_descs[0] = { UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_POINTER_ARG_DESC, // stype nullptr, // pNext - 3 + hip_arg_offset, // argIndex - nullptr, // pProperties - &shared_ptrs[3], // pArgValue + 4 + (2 * hip_arg_offset), // argIndex + nullptr, // pProperties + &shared_ptrs[3], // pArgValue }; - // New Y at index 4 + // New Y at index 5 new_input_descs[1] = { UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_POINTER_ARG_DESC, // stype nullptr, // pNext - 4 + hip_arg_offset, // argIndex - nullptr, // pProperties - &shared_ptrs[4], // pArgValue + 5 + (2 * hip_arg_offset), // argIndex + nullptr, // pProperties + &shared_ptrs[4], // pArgValue }; // Update kernel inputs ur_exp_command_buffer_update_kernel_launch_desc_t update_desc = { UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_KERNEL_LAUNCH_DESC, // stype nullptr, // pNext - kernel, // hNewKernel - 0, // numNewMemObjArgs - 2, // numNewPointerArgs - 2, // numNewValueArgs - n_dimensions, // newWorkDim - nullptr, // pNewMemObjArgList - new_input_descs, // pNewPointerArgList - new_value_descs, // pNewValueArgList - nullptr, // pNewGlobalWorkOffset - nullptr, // pNewGlobalWorkSize - nullptr, // pNewLocalWorkSize + kernel, // hNewKernel + 0, // numNewMemObjArgs + new_input_descs.size(), // numNewPointerArgs + new_value_descs.size(), // numNewValueArgs + n_dimensions, // newWorkDim + nullptr, // pNewMemObjArgList + new_input_descs.data(), // pNewPointerArgList + new_value_descs.data(), // pNewValueArgList + nullptr, // pNewGlobalWorkOffset + nullptr, // pNewGlobalWorkSize + nullptr, // pNewLocalWorkSize }; // Enqueue without calling urQueueFinish after ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0, diff --git a/test/conformance/kernel/kernel_adapter_native_cpu.match b/test/conformance/kernel/kernel_adapter_native_cpu.match index 7ca10ec3d2..bd5333c609 100644 --- a/test/conformance/kernel/kernel_adapter_native_cpu.match +++ b/test/conformance/kernel/kernel_adapter_native_cpu.match @@ -38,6 +38,9 @@ urKernelRetainTest.InvalidNullHandleKernel/* urKernelSetArgLocalTest.Success/* urKernelSetArgLocalTest.InvalidNullHandleKernel/* urKernelSetArgLocalTest.InvalidKernelArgumentIndex/* +urKernelSetArgLocalMultiTest.Basic/* +urKernelSetArgLocalMultiTest.ReLaunch/* +urKernelSetArgLocalMultiTest.Overwrite/* urKernelSetArgMemObjTest.Success/* urKernelSetArgMemObjTest.InvalidNullHandleKernel/* urKernelSetArgMemObjTest.InvalidKernelArgumentIndex/* diff --git a/test/conformance/kernel/urKernelSetArgLocal.cpp b/test/conformance/kernel/urKernelSetArgLocal.cpp index 1d3789bf3a..380085bd16 100644 --- a/test/conformance/kernel/urKernelSetArgLocal.cpp +++ b/test/conformance/kernel/urKernelSetArgLocal.cpp @@ -3,6 +3,7 @@ // See LICENSE.TXT // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +#include #include struct urKernelSetArgLocalTest : uur::urKernelTest { @@ -32,3 +33,203 @@ TEST_P(urKernelSetArgLocalTest, InvalidKernelArgumentIndex) { urKernelSetArgLocal(kernel, num_kernel_args + 1, local_mem_size, nullptr)); } + +// Test launching kernels with multiple local arguments return the expected +// outputs +struct urKernelSetArgLocalMultiTest : uur::urKernelExecutionTest { + void SetUp() override { + program_name = "saxpy_usm_local_mem"; + UUR_RETURN_ON_FATAL_FAILURE(urKernelExecutionTest::SetUp()); + + ASSERT_SUCCESS(urPlatformGetInfo(platform, UR_PLATFORM_INFO_BACKEND, + sizeof(backend), &backend, nullptr)); + + // HIP has extra args for local memory so we define an offset for arg indices here for updating + hip_arg_offset = backend == UR_PLATFORM_BACKEND_HIP ? 3 : 0; + ur_device_usm_access_capability_flags_t shared_usm_flags; + ASSERT_SUCCESS( + uur::GetDeviceUSMSingleSharedSupport(device, shared_usm_flags)); + if (!(shared_usm_flags & UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ACCESS)) { + GTEST_SKIP() << "Shared USM is not supported."; + } + + const size_t allocation_size = + sizeof(uint32_t) * global_size * local_size; + for (auto &shared_ptr : shared_ptrs) { + ASSERT_SUCCESS(urUSMSharedAlloc(context, device, nullptr, nullptr, + allocation_size, &shared_ptr)); + ASSERT_NE(shared_ptr, nullptr); + + std::vector pattern(allocation_size); + uur::generateMemFillPattern(pattern); + std::memcpy(shared_ptr, pattern.data(), allocation_size); + } + size_t current_index = 0; + // Index 0 is local_mem_a arg + ASSERT_SUCCESS(urKernelSetArgLocal(kernel, current_index++, + local_mem_a_size, nullptr)); + + // Hip has extra args for local mem at index 1-3 + if (backend == UR_PLATFORM_BACKEND_HIP) { + ASSERT_SUCCESS(urKernelSetArgValue(kernel, current_index++, + sizeof(hip_local_offset), + nullptr, &hip_local_offset)); + ASSERT_SUCCESS(urKernelSetArgValue(kernel, current_index++, + sizeof(hip_local_offset), + nullptr, &hip_local_offset)); + ASSERT_SUCCESS(urKernelSetArgValue(kernel, current_index++, + sizeof(hip_local_offset), + nullptr, &hip_local_offset)); + } + + // Index 1 is local_mem_b arg + ASSERT_SUCCESS(urKernelSetArgLocal(kernel, current_index++, + local_mem_b_size, nullptr)); + if (backend == UR_PLATFORM_BACKEND_HIP) { + ASSERT_SUCCESS(urKernelSetArgValue(kernel, current_index++, + sizeof(hip_local_offset), + nullptr, &hip_local_offset)); + ASSERT_SUCCESS(urKernelSetArgValue(kernel, current_index++, + sizeof(hip_local_offset), + nullptr, &hip_local_offset)); + ASSERT_SUCCESS(urKernelSetArgValue(kernel, current_index++, + sizeof(hip_local_offset), + nullptr, &hip_local_offset)); + } + + // Index 2 is output + ASSERT_SUCCESS(urKernelSetArgPointer(kernel, current_index++, nullptr, + shared_ptrs[0])); + // Index 3 is A + ASSERT_SUCCESS(urKernelSetArgValue(kernel, current_index++, sizeof(A), + nullptr, &A)); + // Index 4 is X + ASSERT_SUCCESS(urKernelSetArgPointer(kernel, current_index++, nullptr, + shared_ptrs[1])); + // Index 5 is Y + ASSERT_SUCCESS(urKernelSetArgPointer(kernel, current_index++, nullptr, + shared_ptrs[2])); + } + + void Validate(uint32_t *output, uint32_t *X, uint32_t *Y, uint32_t A, + size_t length, size_t local_size) { + for (size_t i = 0; i < length; i++) { + uint32_t result = A * X[i] + Y[i] + local_size; + ASSERT_EQ(result, output[i]); + } + } + + virtual void TearDown() override { + for (auto &shared_ptr : shared_ptrs) { + if (shared_ptr) { + EXPECT_SUCCESS(urUSMFree(context, shared_ptr)); + } + } + + UUR_RETURN_ON_FATAL_FAILURE(urKernelExecutionTest::TearDown()); + } + + static constexpr size_t local_size = 4; + static constexpr size_t local_mem_a_size = local_size * sizeof(uint32_t); + static constexpr size_t local_mem_b_size = local_mem_a_size * 2; + static constexpr size_t global_size = 16; + static constexpr size_t global_offset = 0; + static constexpr size_t n_dimensions = 1; + static constexpr uint32_t A = 42; + std::array shared_ptrs = {nullptr, nullptr, nullptr, nullptr, + nullptr}; + + uint32_t hip_arg_offset = 0; + static constexpr uint64_t hip_local_offset = 0; + ur_platform_backend_t backend{}; +}; +UUR_INSTANTIATE_KERNEL_TEST_SUITE_P(urKernelSetArgLocalMultiTest); + +TEST_P(urKernelSetArgLocalMultiTest, Basic) { + ASSERT_SUCCESS(urEnqueueKernelLaunch(queue, kernel, n_dimensions, + &global_offset, &global_size, + &local_size, 0, nullptr, nullptr)); + ASSERT_SUCCESS(urQueueFinish(queue)); + + uint32_t *output = (uint32_t *)shared_ptrs[0]; + uint32_t *X = (uint32_t *)shared_ptrs[1]; + uint32_t *Y = (uint32_t *)shared_ptrs[2]; + Validate(output, X, Y, A, global_size, local_size); +} + +TEST_P(urKernelSetArgLocalMultiTest, ReLaunch) { + ASSERT_SUCCESS(urEnqueueKernelLaunch(queue, kernel, n_dimensions, + &global_offset, &global_size, + &local_size, 0, nullptr, nullptr)); + ASSERT_SUCCESS(urQueueFinish(queue)); + + uint32_t *output = (uint32_t *)shared_ptrs[0]; + uint32_t *X = (uint32_t *)shared_ptrs[1]; + uint32_t *Y = (uint32_t *)shared_ptrs[2]; + Validate(output, X, Y, A, global_size, local_size); + + // Relaunch with new arguments + ASSERT_SUCCESS(urEnqueueKernelLaunch(queue, kernel, n_dimensions, + &global_offset, &global_size, + &local_size, 0, nullptr, nullptr)); + ASSERT_SUCCESS(urQueueFinish(queue)); + uint32_t *new_output = (uint32_t *)shared_ptrs[0]; + uint32_t *new_X = (uint32_t *)shared_ptrs[3]; + uint32_t *new_Y = (uint32_t *)shared_ptrs[4]; + Validate(new_output, new_X, new_Y, A, global_size, local_size); +} + +// Overwrite local args to a larger value, then reset back to original +TEST_P(urKernelSetArgLocalMultiTest, Overwrite) { + ASSERT_SUCCESS(urEnqueueKernelLaunch(queue, kernel, n_dimensions, + &global_offset, &global_size, + &local_size, 0, nullptr, nullptr)); + ASSERT_SUCCESS(urQueueFinish(queue)); + + uint32_t *output = (uint32_t *)shared_ptrs[0]; + uint32_t *X = (uint32_t *)shared_ptrs[1]; + uint32_t *Y = (uint32_t *)shared_ptrs[2]; + Validate(output, X, Y, A, global_size, local_size); + + size_t new_local_size = 2; + size_t new_local_mem_a_size = new_local_size * sizeof(uint32_t); + size_t new_local_mem_b_size = new_local_size * sizeof(uint32_t) * 2; + size_t current_index = 0; + ASSERT_SUCCESS(urKernelSetArgLocal(kernel, current_index++, + new_local_mem_a_size, nullptr)); + + // Hip has extra args for local mem at index 1-3 + if (backend == UR_PLATFORM_BACKEND_HIP) { + ASSERT_SUCCESS(urKernelSetArgValue(kernel, current_index++, + sizeof(hip_local_offset), nullptr, + &hip_local_offset)); + ASSERT_SUCCESS(urKernelSetArgValue(kernel, current_index++, + sizeof(hip_local_offset), nullptr, + &hip_local_offset)); + ASSERT_SUCCESS(urKernelSetArgValue(kernel, current_index++, + sizeof(hip_local_offset), nullptr, + &hip_local_offset)); + } + + // Index 1 is local_mem_b arg + ASSERT_SUCCESS(urKernelSetArgLocal(kernel, current_index++, + new_local_mem_b_size, nullptr)); + if (backend == UR_PLATFORM_BACKEND_HIP) { + ASSERT_SUCCESS(urKernelSetArgValue(kernel, current_index++, + sizeof(hip_local_offset), nullptr, + &hip_local_offset)); + ASSERT_SUCCESS(urKernelSetArgValue(kernel, current_index++, + sizeof(hip_local_offset), nullptr, + &hip_local_offset)); + ASSERT_SUCCESS(urKernelSetArgValue(kernel, current_index++, + sizeof(hip_local_offset), nullptr, + &hip_local_offset)); + } + + ASSERT_SUCCESS(urEnqueueKernelLaunch(queue, kernel, n_dimensions, + &global_offset, &global_size, + &new_local_size, 0, nullptr, nullptr)); + ASSERT_SUCCESS(urQueueFinish(queue)); + + Validate(output, X, Y, A, global_size, new_local_size); +} diff --git a/test/conformance/queue/urQueueCreate.cpp b/test/conformance/queue/urQueueCreate.cpp index ad0957d747..8b26784ef5 100644 --- a/test/conformance/queue/urQueueCreate.cpp +++ b/test/conformance/queue/urQueueCreate.cpp @@ -2,6 +2,7 @@ // Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. // See LICENSE.TXT // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +#include "ur_api.h" #include "uur/raii.h" #include @@ -34,7 +35,8 @@ UUR_TEST_SUITE_P(urQueueCreateWithParamTest, UR_QUEUE_FLAG_SUBMISSION_BATCHED, UR_QUEUE_FLAG_SUBMISSION_IMMEDIATE, UR_QUEUE_FLAG_USE_DEFAULT_STREAM, - UR_QUEUE_FLAG_SYNC_WITH_DEFAULT_STREAM), + UR_QUEUE_FLAG_SYNC_WITH_DEFAULT_STREAM, + UR_QUEUE_FLAG_LOW_POWER_EVENTS_EXP), uur::deviceTestWithParamPrinter); TEST_P(urQueueCreateWithParamTest, SuccessWithProperties) { diff --git a/test/conformance/testing/include/uur/fixtures.h b/test/conformance/testing/include/uur/fixtures.h index 436e7821a9..1900568292 100644 --- a/test/conformance/testing/include/uur/fixtures.h +++ b/test/conformance/testing/include/uur/fixtures.h @@ -519,6 +519,28 @@ struct urMultiQueueTest : urContextTest { ur_queue_handle_t queue2 = nullptr; }; +template +struct urMultiQueueTestWithParam : urContextTestWithParam { + void SetUp() override { + UUR_RETURN_ON_FATAL_FAILURE(urContextTestWithParam::SetUp()); + ASSERT_SUCCESS(urQueueCreate(this->context, this->device, 0, &queue1)); + ASSERT_SUCCESS(urQueueCreate(this->context, this->device, 0, &queue2)); + } + + void TearDown() override { + if (queue1 != nullptr) { + EXPECT_SUCCESS(urQueueRelease(queue1)); + } + if (queue2 != nullptr) { + EXPECT_SUCCESS(urQueueRelease(queue2)); + } + UUR_RETURN_ON_FATAL_FAILURE(urContextTestWithParam::TearDown()); + } + + ur_queue_handle_t queue1 = nullptr; + ur_queue_handle_t queue2 = nullptr; +}; + template struct urMultiDeviceContextTestTemplate : urPlatformTest { void SetUp() override { diff --git a/tools/urinfo/urinfo.hpp b/tools/urinfo/urinfo.hpp index ee7fe52834..813ca34da1 100644 --- a/tools/urinfo/urinfo.hpp +++ b/tools/urinfo/urinfo.hpp @@ -28,6 +28,8 @@ inline void printAdapterInfos(ur_adapter_handle_t hAdapter, std::string_view prefix = " ") { std::cout << prefix; printAdapterInfo(hAdapter, UR_ADAPTER_INFO_BACKEND); + std::cout << prefix; + printAdapterInfo(hAdapter, UR_ADAPTER_INFO_VERSION); } inline void printPlatformInfos(ur_platform_handle_t hPlatform, @@ -420,5 +422,8 @@ inline void printDeviceInfos(ur_device_handle_t hDevice, hDevice, UR_DEVICE_INFO_ENQUEUE_NATIVE_COMMAND_SUPPORT_EXP); std::cout << prefix; printDeviceInfo(hDevice, UR_DEVICE_INFO_LOW_POWER_EVENTS_EXP); + std::cout << prefix; + printDeviceInfo( + hDevice, UR_DEVICE_INFO_2D_BLOCK_ARRAY_CAPABILITIES_EXP); } } // namespace urinfo