diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 5b7f9271ab034..196fffa19c569 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -41,11 +41,11 @@ sycl/include/sycl/detail/ur.hpp @intel/unified-runtime-reviewers sycl/source/detail/posix_ur.cpp @intel/unified-runtime-reviewers sycl/source/detail/ur.cpp @intel/unified-runtime-reviewers sycl/source/detail/windows_ur.cpp @intel/unified-runtime-reviewers -sycl/test-e2e/Plugin/ @intel/unified-runtime-reviewers +sycl/test-e2e/Adapters/ @intel/unified-runtime-reviewers # Win Proxy Loader -sycl/pi_win_proxy_loader @intel/llvm-reviewers-runtime -sycl/test-e2e/Plugin/dll-detach-order.cpp @intel/llvm-reviewers-runtime +sycl/ur_win_proxy_loader @intel/llvm-reviewers-runtime +sycl/test-e2e/Adapters/dll-detach-order.cpp @intel/llvm-reviewers-runtime # CUDA specific runtime implementations sycl/include/sycl/ext/oneapi/experimental/cuda/ @intel/llvm-reviewers-cuda diff --git a/.github/workflows/sycl-linux-precommit.yml b/.github/workflows/sycl-linux-precommit.yml index fcb8852844cbb..231d49b0500c5 100644 --- a/.github/workflows/sycl-linux-precommit.yml +++ b/.github/workflows/sycl-linux-precommit.yml @@ -80,6 +80,12 @@ jobs: image: ghcr.io/intel/llvm/ubuntu2204_build:latest image_options: -u 1001 --gpus all --cap-add SYS_ADMIN target_devices: ext_oneapi_cuda:gpu + - name: AMD/HIP + runner: '["Linux", "amdgpu"]' + image: ghcr.io/intel/llvm/ubuntu2204_build:latest + image_options: -u 1001 --device=/dev/dri --device=/dev/kfd + target_devices: ext_oneapi_hip:gpu + reset_intel_gpu: false - name: Intel runner: '["Linux", "gen12"]' image: ghcr.io/intel/llvm/ubuntu2204_intel_drivers:latest diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 1954d8c8743b4..88207ccdbde96 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -58,7 +58,7 @@ To contribute: - [The seven rules of a great Git commit message](https://cbea.ms/git-commit) are recommended read and follow. 
- To a reasonable extent, title tags can be used to signify the component - changed, e.g.: `[PI]`, `[CUDA]`, `[Doc]`. + changed, e.g.: `[UR]`, `[CUDA]`, `[Doc]`. - Create a pull request (PR) for your changes following [Creating a pull request instructions](https://help.github.com/articles/creating-a-pull-request/). - Make sure PR has a good description explaining all of the changes made, diff --git a/buildbot/configure.py b/buildbot/configure.py index 5a7fdd003a1de..b77a7cb2d4c3e 100644 --- a/buildbot/configure.py +++ b/buildbot/configure.py @@ -69,7 +69,7 @@ def do_configure(args): if sys.platform != "darwin": sycl_enabled_backends.append("level_zero") - # lld is needed on Windows or for the HIP plugin on AMD + # lld is needed on Windows or for the HIP adapter on AMD if platform.system() == "Windows" or (args.hip and args.hip_platform == "AMD"): llvm_enable_projects += ";lld" @@ -152,8 +152,8 @@ def do_configure(args): libclc_targets_to_build += libclc_nvidia_target_names libclc_gen_remangled_variants = "ON" - if args.enable_plugin: - sycl_enabled_backends += args.enable_plugin + if args.enable_backends: + sycl_enabled_backends += args.enable_backends if args.disable_preview_lib: sycl_preview_lib = "OFF" @@ -374,7 +374,9 @@ def main(): parser.add_argument( "--ci-defaults", action="store_true", help="Enable default CI parameters" ) - parser.add_argument("--enable-plugin", action="append", help="Enable SYCL plugin") + parser.add_argument( + "--enable-backends", action="append", help="Enable SYCL backend" + ) parser.add_argument( "--disable-preview-lib", action="store_true", diff --git a/clang/lib/Driver/OffloadBundler.cpp b/clang/lib/Driver/OffloadBundler.cpp index d026805d22f4a..1839d52de7a89 100644 --- a/clang/lib/Driver/OffloadBundler.cpp +++ b/clang/lib/Driver/OffloadBundler.cpp @@ -687,12 +687,12 @@ class ObjectFileHandler final : public FileHandler { if (Error Err = Symbol.printName(NameOS)) return std::move(Err); - // If we are dealing with a bitcode file do 
not add special globals - // llvm.used and llvm.compiler.used and __AsanDeviceGlobalMetadata to + // If we are dealing with a bitcode file do not add special globals to // the list of defined symbols. if (SF->isIR() && (Name == "llvm.used" || Name == "llvm.compiler.used" || - Name == "__AsanDeviceGlobalMetadata")) + Name == "__AsanDeviceGlobalMetadata" || + Name == "__AsanKernelMetadata")) continue; // Add symbol name with the target prefix to the buffer. diff --git a/clang/lib/Driver/SanitizerArgs.cpp b/clang/lib/Driver/SanitizerArgs.cpp index f0ec70bc1c116..788d6682e26c3 100644 --- a/clang/lib/Driver/SanitizerArgs.cpp +++ b/clang/lib/Driver/SanitizerArgs.cpp @@ -1197,6 +1197,9 @@ void SanitizerArgs::addArgs(const ToolChain &TC, const llvm::opt::ArgList &Args, CmdArgs.push_back("-mllvm"); CmdArgs.push_back("-asan-mapping-scale=4"); + + addSpecialCaseListOpt(Args, CmdArgs, + "-fsanitize-ignorelist=", UserIgnorelistFiles); } return; } diff --git a/devops/dependencies-igc-dev.json b/devops/dependencies-igc-dev.json index 7b692d6f63a0b..37cb1ee1d97f7 100644 --- a/devops/dependencies-igc-dev.json +++ b/devops/dependencies-igc-dev.json @@ -1,10 +1,10 @@ { "linux": { "igc_dev": { - "github_tag": "igc-dev-ac93a93", - "version": "ac93a93", - "updated_at": "2024-11-21T02:09:35Z", - "url": "https://api.github.com/repos/intel/intel-graphics-compiler/actions/artifacts/2216471673/zip", + "github_tag": "igc-dev-7dad678", + "version": "7dad678", + "updated_at": "2024-11-24T10:48:51Z", + "url": "https://api.github.com/repos/intel/intel-graphics-compiler/actions/artifacts/2229466354/zip", "root": "{DEPS_ROOT}/opencl/runtime/linux/oclgpu" } } diff --git a/libdevice/sanitizer/asan_rtl.cpp b/libdevice/sanitizer/asan_rtl.cpp index 8908f69a51ea0..b6ee9fdd5b90c 100644 --- a/libdevice/sanitizer/asan_rtl.cpp +++ b/libdevice/sanitizer/asan_rtl.cpp @@ -665,11 +665,16 @@ constexpr size_t AlignMask(size_t n) { return n - 1; } /// /// ASAN Load/Store Report Built-ins /// +/// NOTE: +/// if 
__AsanLaunchInfo equals 0, the sanitizer is disabled for this launch +/// #define ASAN_REPORT_ERROR_BASE(type, is_write, size, as) \ DEVICE_EXTERN_C_NOINLINE void __asan_##type##size##_as##as( \ uptr addr, const char __SYCL_CONSTANT__ *file, uint32_t line, \ const char __SYCL_CONSTANT__ *func) { \ + if (!__AsanLaunchInfo) \ + return; \ if (addr & AlignMask(size)) { \ __asan_report_misalign_error(addr, as, size, is_write, addr, file, line, \ func); \ @@ -682,6 +687,8 @@ constexpr size_t AlignMask(size_t n) { return n - 1; } DEVICE_EXTERN_C_NOINLINE void __asan_##type##size##_as##as##_noabort( \ uptr addr, const char __SYCL_CONSTANT__ *file, uint32_t line, \ const char __SYCL_CONSTANT__ *func) { \ + if (!__AsanLaunchInfo) \ + return; \ if (addr & AlignMask(size)) { \ __asan_report_misalign_error(addr, as, size, is_write, addr, file, line, \ func, true); \ @@ -714,6 +721,8 @@ ASAN_REPORT_ERROR(store, true, 16) DEVICE_EXTERN_C_NOINLINE void __asan_##type##N_as##as( \ uptr addr, size_t size, const char __SYCL_CONSTANT__ *file, \ uint32_t line, const char __SYCL_CONSTANT__ *func) { \ + if (!__AsanLaunchInfo) \ + return; \ if (auto poisoned_addr = __asan_region_is_poisoned(addr, as, size)) { \ __asan_report_access_error(addr, as, size, is_write, poisoned_addr, \ file, line, func); \ @@ -722,6 +731,8 @@ ASAN_REPORT_ERROR(store, true, 16) DEVICE_EXTERN_C_NOINLINE void __asan_##type##N_as##as##_noabort( \ uptr addr, size_t size, const char __SYCL_CONSTANT__ *file, \ uint32_t line, const char __SYCL_CONSTANT__ *func) { \ + if (!__AsanLaunchInfo) \ + return; \ if (auto poisoned_addr = __asan_region_is_poisoned(addr, as, size)) { \ __asan_report_access_error(addr, as, size, is_write, poisoned_addr, \ file, line, func, true); \ @@ -743,6 +754,9 @@ ASAN_REPORT_ERROR_N(store, true) /// DEVICE_EXTERN_C_NOINLINE uptr __asan_mem_to_shadow(uptr ptr, uint32_t as) { + if (!__AsanLaunchInfo) + return 0; + return MemToShadow(ptr, as); } @@ -756,6 +770,9 @@ static __SYCL_CONSTANT__ const 
char __mem_set_shadow_local[] = DEVICE_EXTERN_C_NOINLINE void __asan_set_shadow_static_local(uptr ptr, size_t size, size_t size_with_redzone) { + if (!__AsanLaunchInfo) + return; + // Since ptr is aligned to ASAN_SHADOW_GRANULARITY, // if size != aligned_size, then the buffer tail of ptr is not aligned uptr aligned_size = RoundUpTo(size, ASAN_SHADOW_GRANULARITY); @@ -795,6 +812,9 @@ static __SYCL_CONSTANT__ const char __mem_unpoison_shadow_static_local_end[] = DEVICE_EXTERN_C_NOINLINE void __asan_unpoison_shadow_static_local(uptr ptr, size_t size, size_t size_with_redzone) { + if (!__AsanLaunchInfo) + return; + ASAN_DEBUG(__spirv_ocl_printf(__mem_unpoison_shadow_static_local_begin)); auto shadow_begin = MemToShadow(ptr + size, ADDRESS_SPACE_LOCAL); @@ -828,6 +848,9 @@ static __SYCL_CONSTANT__ const char __mem_report_arg_count_incorrect[] = DEVICE_EXTERN_C_NOINLINE void __asan_set_shadow_dynamic_local(uptr ptr, uint32_t num_args) { + if (!__AsanLaunchInfo) + return; + ASAN_DEBUG(__spirv_ocl_printf(__mem_set_shadow_dynamic_local_begin)); auto *launch_info = (__SYCL_GLOBAL__ const LaunchInfo *)__AsanLaunchInfo; @@ -859,6 +882,9 @@ static __SYCL_CONSTANT__ const char __mem_unpoison_shadow_dynamic_local_end[] = DEVICE_EXTERN_C_NOINLINE void __asan_unpoison_shadow_dynamic_local(uptr ptr, uint32_t num_args) { + if (!__AsanLaunchInfo) + return; + ASAN_DEBUG(__spirv_ocl_printf(__mem_unpoison_shadow_dynamic_local_begin)); auto *launch_info = (__SYCL_GLOBAL__ const LaunchInfo *)__AsanLaunchInfo; @@ -895,6 +921,9 @@ static __SYCL_CONSTANT__ const char __mem_set_shadow_private[] = DEVICE_EXTERN_C_NOINLINE void __asan_set_shadow_private(uptr begin, uptr size, char val) { + if (!__AsanLaunchInfo) + return; + ASAN_DEBUG(__spirv_ocl_printf(__mem_set_shadow_private_begin)); auto *launch_info = (__SYCL_GLOBAL__ const LaunchInfo *)__AsanLaunchInfo; diff --git a/llvm/include/llvm/SYCLLowerIR/AsanKernelMetadata.h b/llvm/include/llvm/SYCLLowerIR/AsanKernelMetadata.h new file mode 100644 
index 0000000000000..56286927395b9 --- /dev/null +++ b/llvm/include/llvm/SYCLLowerIR/AsanKernelMetadata.h @@ -0,0 +1,23 @@ +//===-- AsanKernelMetadata.h - fix kernel metadata for sanitizer -----===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// This pass fixes attributes and metadata of the global variable +// "__AsanKernelMetadata" +//===----------------------------------------------------------------------===// + +#pragma once + +#include "llvm/IR/PassManager.h" + +namespace llvm { + +class AsanKernelMetadataPass : public PassInfoMixin<AsanKernelMetadataPass> { +public: + PreservedAnalyses run(Module &M, ModuleAnalysisManager &MAM); +}; + +} // namespace llvm diff --git a/llvm/lib/SYCLLowerIR/AsanKernelMetadata.cpp b/llvm/lib/SYCLLowerIR/AsanKernelMetadata.cpp new file mode 100644 index 0000000000000..2edaff49c471d --- /dev/null +++ b/llvm/lib/SYCLLowerIR/AsanKernelMetadata.cpp @@ -0,0 +1,63 @@ +//===-- AsanKernelMetadata.cpp - fix kernel metadata for sanitizer ---===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// This pass fixes attributes and metadata of the global variable +// "__AsanKernelMetadata". +// We treat "__AsanKernelMetadata" as a device global variable, so that it can +// be read by the runtime. +// "spirv.Decorations" is removed by llvm-link, so we add it here again. 
+//===----------------------------------------------------------------------===// + +#include "llvm/SYCLLowerIR/AsanKernelMetadata.h" + +#include "llvm/IR/IRBuilder.h" + +#define DEBUG_TYPE "AsanKernelMetadata" + +using namespace llvm; + +namespace llvm { + +constexpr StringRef SPIRV_DECOR_MD_KIND = "spirv.Decorations"; +constexpr uint32_t SPIRV_HOST_ACCESS_DECOR = 6147; + +PreservedAnalyses AsanKernelMetadataPass::run(Module &M, + ModuleAnalysisManager &MAM) { + auto *KernelMetadata = M.getNamedGlobal("__AsanKernelMetadata"); + if (!KernelMetadata) { + return PreservedAnalyses::all(); + } + + auto &DL = M.getDataLayout(); + auto &Ctx = M.getContext(); + + // Fix attributes + KernelMetadata->addAttribute( + "sycl-device-global-size", + std::to_string(DL.getTypeAllocSize(KernelMetadata->getValueType()))); + + // Fix metadata + unsigned MDKindID = Ctx.getMDKindID(SPIRV_DECOR_MD_KIND); + + SmallVector MDOps; + + SmallVector MD; + auto *Ty = Type::getInt32Ty(Ctx); + MD.push_back(ConstantAsMetadata::get( + Constant::getIntegerValue(Ty, APInt(32, SPIRV_HOST_ACCESS_DECOR)))); + MD.push_back( + ConstantAsMetadata::get(Constant::getIntegerValue(Ty, APInt(32, 0)))); + MD.push_back(MDString::get(Ctx, "_Z20__AsanKernelMetadata")); + + MDOps.push_back(MDNode::get(Ctx, MD)); + + KernelMetadata->addMetadata(MDKindID, *MDNode::get(Ctx, MDOps)); + + return PreservedAnalyses::none(); +} + +} // namespace llvm diff --git a/llvm/lib/SYCLLowerIR/CMakeLists.txt b/llvm/lib/SYCLLowerIR/CMakeLists.txt index 9f0b7fe7e43b0..fc254a4809bb5 100644 --- a/llvm/lib/SYCLLowerIR/CMakeLists.txt +++ b/llvm/lib/SYCLLowerIR/CMakeLists.txt @@ -69,6 +69,7 @@ add_llvm_component_library(LLVMSYCLLowerIR SYCLPropagateJointMatrixUsage.cpp SYCLVirtualFunctionsAnalysis.cpp SYCLUtils.cpp + AsanKernelMetadata.cpp LocalAccessorToSharedMemory.cpp GlobalOffset.cpp diff --git a/llvm/lib/SYCLLowerIR/ComputeModuleRuntimeInfo.cpp b/llvm/lib/SYCLLowerIR/ComputeModuleRuntimeInfo.cpp index fd79ac3cd9a31..c61728bc511fb 
100644 --- a/llvm/lib/SYCLLowerIR/ComputeModuleRuntimeInfo.cpp +++ b/llvm/lib/SYCLLowerIR/ComputeModuleRuntimeInfo.cpp @@ -47,16 +47,7 @@ getSYCLESIMDSplitStatusFromMetadata(const Module &M) { } // namespace bool isModuleUsingAsan(const Module &M) { - for (const auto &F : M) { - if (F.getCallingConv() != CallingConv::SPIR_KERNEL) - continue; - if (F.arg_size() == 0) - continue; - const auto *LastArg = F.getArg(F.arg_size() - 1); - if (LastArg->getName() == "__asan_launch") - return true; - } - return false; + return M.getNamedGlobal("__AsanKernelMetadata"); } // This function traverses over reversed call graph by BFS algorithm. diff --git a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp index 67e98bb67013f..f85009b42be6c 100644 --- a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp @@ -843,10 +843,8 @@ struct AddressSanitizer { ArrayRef RetVec); bool instrumentSyclDynamicLocalMemory(Function &F, ArrayRef RetVec); + void instrumentInitAsanLaunchInfo(Function &F, const TargetLibraryInfo *TLI); - GlobalVariable *GetOrCreateGlobalString(Module &M, StringRef Name, - StringRef Value, - unsigned AddressSpace); void AppendDebugInfoToArgs(Instruction *InsertBefore, Value *Addr, SmallVectorImpl &Args); @@ -896,7 +894,6 @@ struct AddressSanitizer { FunctionCallee AsanSetShadowDynamicLocalFunc; FunctionCallee AsanUnpoisonShadowDynamicLocalFunc; Constant *AsanShadowGlobal; - StringMap GlobalStringMap; Constant *AsanLaunchInfo; // These arrays is indexed by AccessIsWrite, Experiment and log2(AccessSize). 
@@ -1300,22 +1297,81 @@ struct FunctionStackPoisoner : public InstVisitor { } // end anonymous namespace -// Append a new argument "launch_data" to user's spir_kernels -static void ExtendSpirKernelArgs(Module &M, FunctionAnalysisManager &FAM) { - SmallVector SpirFixupFuncs; - for (Function &F : M) { - // FIXME: We don't have a way to check if the kernel has been extended - // on Unified Runtime, so we always extend spir_kernels here, even it will - // not be instrumented by any asan function. - if (F.getCallingConv() == CallingConv::SPIR_KERNEL) - SpirFixupFuncs.emplace_back(&F); - } +static StringMap GlobalStringMap; + +static GlobalVariable *GetOrCreateGlobalString(Module &M, StringRef Name, + StringRef Value, + unsigned AddressSpace) { + GlobalVariable *StringGV = nullptr; + if (GlobalStringMap.find(Value.str()) != GlobalStringMap.end()) + return GlobalStringMap.at(Value.str()); + + auto *Ty = ArrayType::get(Type::getInt8Ty(M.getContext()), Value.size() + 1); + StringGV = new GlobalVariable( + M, Ty, true, GlobalValue::InternalLinkage, + ConstantDataArray::getString(M.getContext(), Value), Name, nullptr, + GlobalValue::NotThreadLocal, AddressSpace); + GlobalStringMap[Value.str()] = StringGV; + return StringGV; +} + +// Append a new argument "__asan_launch" to user's spir_kernels +static void ExtendSpirKernelArgs(Module &M, FunctionAnalysisManager &FAM, + bool HasESIMD) { + SmallVector SpirFixupKernels; + SmallVector SpirKernelsMetadata; + + auto DL = M.getDataLayout(); + Type *IntptrTy = DL.getIntPtrType(M.getContext()); + + // SpirKernelsMetadata only saves fixed kernels, and is described by + // following structure: + // uptr unmangled_kernel_name + // uptr unmangled_kernel_name_size + StructType *StructTy = StructType::get(IntptrTy, IntptrTy); + + if (!HasESIMD) + for (Function &F : M) { + if (F.getCallingConv() != CallingConv::SPIR_KERNEL) + continue; + + if (!F.hasFnAttribute(Attribute::SanitizeAddress) || + 
F.hasFnAttribute(Attribute::DisableSanitizerInstrumentation)) + continue; + + SpirFixupKernels.emplace_back(&F); + + auto KernelName = F.getName(); + auto *KernelNameGV = GetOrCreateGlobalString( + M, "__asan_kernel", KernelName, kSpirOffloadGlobalAS); + SpirKernelsMetadata.emplace_back(ConstantStruct::get( + StructTy, ConstantExpr::getPointerCast(KernelNameGV, IntptrTy), + ConstantInt::get(IntptrTy, KernelName.size()))); + } + + // Create global variable to record spirv kernels' information + ArrayType *ArrayTy = ArrayType::get(StructTy, SpirKernelsMetadata.size()); + Constant *MetadataInitializer = + ConstantArray::get(ArrayTy, SpirKernelsMetadata); + GlobalVariable *AsanSpirKernelMetadata = new GlobalVariable( + M, MetadataInitializer->getType(), false, GlobalValue::AppendingLinkage, + MetadataInitializer, "__AsanKernelMetadata", nullptr, + GlobalValue::NotThreadLocal, 1); + AsanSpirKernelMetadata->setUnnamedAddr(GlobalValue::UnnamedAddr::Local); + // Add device global attributes + AsanSpirKernelMetadata->addAttribute( + "sycl-device-global-size", std::to_string(DL.getTypeAllocSize(ArrayTy))); + AsanSpirKernelMetadata->addAttribute("sycl-device-image-scope"); + AsanSpirKernelMetadata->addAttribute("sycl-host-access", "0"); // read only + AsanSpirKernelMetadata->addAttribute("sycl-unique-id", + "_Z20__AsanKernelMetadata"); + AsanSpirKernelMetadata->setDSOLocal(true); + + // Handle SpirFixupKernels SmallVector> SpirFuncs; - auto *IntptrTy = - M.getDataLayout().getIntPtrType(M.getContext(), kSpirOffloadGlobalAS); - for (auto *F : SpirFixupFuncs) { + for (auto *F : SpirFixupKernels) { SmallVector Types; for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end(); I != E; ++I) { @@ -1345,6 +1401,7 @@ static void ExtendSpirKernelArgs(Module &M, FunctionAnalysisManager &FAM) { } // New argument name NewI->setName("__asan_launch"); + NewI->addAttr(Attribute::NoUndef); NewF->splice(NewF->begin(), F); assert(F->isDeclaration() && @@ -1458,11 +1515,22 @@ 
PreservedAnalyses AddressSanitizerPass::run(Module &M, ClUseStackSafety ? &MAM.getResult(M) : nullptr; if (Triple(M.getTargetTriple()).isSPIROrSPIRV()) { - ExtendSpirKernelArgs(M, FAM); // FIXME: W/A skip instrumentation if this module has ESIMD + bool HasESIMD = false; for (auto &F : M) { - if (F.hasMetadata("sycl_explicit_simd")) - return PreservedAnalyses::all(); + if (F.hasMetadata("sycl_explicit_simd")) { + HasESIMD = true; + break; + } + } + + // Make sure "__AsanKernelMetadata" always exists + ExtendSpirKernelArgs(M, FAM, HasESIMD); + Modified = true; + + if (HasESIMD) { + GlobalStringMap.clear(); + return PreservedAnalyses::none(); } } @@ -1473,8 +1541,13 @@ PreservedAnalyses AddressSanitizerPass::run(Module &M, Options.UseAfterScope, Options.UseAfterReturn); const TargetLibraryInfo &TLI = FAM.getResult(F); Modified |= FunctionSanitizer.instrumentFunction(F, &TLI); + if (F.getCallingConv() == CallingConv::SPIR_KERNEL) + FunctionSanitizer.instrumentInitAsanLaunchInfo(F, &TLI); } Modified |= ModuleSanitizer.instrumentModule(); + + GlobalStringMap.clear(); + if (!Modified) return PreservedAnalyses::all(); @@ -1568,6 +1641,10 @@ static bool isUnsupportedDeviceGlobal(GlobalVariable *G) { if (!G->hasAttribute("sycl-device-image-scope")) return true; + // Skip instrumenting on "__AsanKernelMetadata" etc. 
+ if (G->getName().starts_with("__Asan")) + return true; + Attribute Attr = G->getAttribute("sycl-device-image-scope"); return (!Attr.isStringAttribute() || Attr.getValueAsString() == "false"); } @@ -1618,22 +1695,6 @@ static bool isUnsupportedSPIRAccess(Value *Addr, Instruction *Inst) { return true; } -GlobalVariable *AddressSanitizer::GetOrCreateGlobalString( - Module &M, StringRef Name, StringRef Value, unsigned AddressSpace) { - GlobalVariable *StringGV = nullptr; - if (GlobalStringMap.find(Value.str()) != GlobalStringMap.end()) - return GlobalStringMap.at(Value.str()); - - auto *Ty = ArrayType::get(Type::getInt8Ty(M.getContext()), Value.size() + 1); - StringGV = new GlobalVariable( - M, Ty, true, GlobalValue::InternalLinkage, - ConstantDataArray::getString(M.getContext(), Value), Name, nullptr, - GlobalValue::NotThreadLocal, AddressSpace); - GlobalStringMap[Value.str()] = StringGV; - - return StringGV; -} - void AddressSanitizer::AppendDebugInfoToArgs(Instruction *InsertBefore, Value *Addr, SmallVectorImpl &Args) { @@ -1755,12 +1816,6 @@ bool AddressSanitizer::instrumentSyclDynamicLocalMemory( Function &F, ArrayRef RetVec) { InstrumentationIRBuilder IRB(F.getEntryBlock().getFirstNonPHI()); - // Save "__asan_launch" into local memory "__AsanLaunchInfo" - auto *LastArg = F.getArg(F.arg_size() - 1); - assert(LastArg->getName() == "__asan_launch" && - "Instrument on extended SPIR kernel function only"); - IRB.CreateStore(LastArg, AsanLaunchInfo); - SmallVector LocalArgs; for (auto &Arg : F.args()) { Type *PtrTy = dyn_cast(Arg.getType()->getScalarType()); @@ -1779,7 +1834,7 @@ bool AddressSanitizer::instrumentSyclDynamicLocalMemory( IRB.CreateStore(IRB.CreatePointerCast(LocalArgs[i], IntptrTy), StoreDest); } - auto ArgsArrayAddr = IRB.CreatePointerCast(ArgsArray, IntptrTy); + auto *ArgsArrayAddr = IRB.CreatePointerCast(ArgsArray, IntptrTy); IRB.CreateCall(AsanSetShadowDynamicLocalFunc, {ArgsArrayAddr, ConstantInt::get(Int32Ty, LocalArgs.size())}); @@ -1793,6 
+1848,26 @@ bool AddressSanitizer::instrumentSyclDynamicLocalMemory( return true; } +// Initialize the value of local memory "__AsanLaunchInfo", store +// "__asan_launch" if it's an extended kernel, and store 0 if not +void AddressSanitizer::instrumentInitAsanLaunchInfo( + Function &F, const TargetLibraryInfo *TLI) { + InstrumentationIRBuilder IRB(F.getEntryBlock().getFirstNonPHI()); + if (F.arg_size()) { + auto *LastArg = F.getArg(F.arg_size() - 1); + if (LastArg->getName() == "__asan_launch") { + IRB.CreateStore(LastArg, AsanLaunchInfo); + return; + } + } + // FIXME: if the initial value of "__AsanLaunchInfo" is zero, we'll not need + // this step + initializeCallbacks(TLI); + IRB.CreateStore( + ConstantPointerNull::get(IntptrTy->getPointerTo(kSpirOffloadGlobalAS)), + AsanLaunchInfo); +} + // Instrument memset/memmove/memcpy void AddressSanitizer::instrumentMemIntrinsic(MemIntrinsic *MI, RuntimeCallInserter &RTCI) { diff --git a/llvm/test/Instrumentation/AddressSanitizer/SPIRV/extend_launch_info_arg.ll b/llvm/test/Instrumentation/AddressSanitizer/SPIRV/extend_launch_info_arg.ll index 33f4d9e07b794..073eca99cdd27 100644 --- a/llvm/test/Instrumentation/AddressSanitizer/SPIRV/extend_launch_info_arg.ll +++ b/llvm/test/Instrumentation/AddressSanitizer/SPIRV/extend_launch_info_arg.ll @@ -3,17 +3,18 @@ target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-n8:16:32:64" target triple = "spir64-unknown-unknown" +; CHECK: @__AsanKernelMetadata = appending dso_local local_unnamed_addr addrspace(1) global ; CHECK: @__AsanLaunchInfo = external addrspace(3) global ptr addrspace(1) define spir_kernel void @sycl_kernel1() #0 { -; CHECK-LABEL: define spir_kernel void @sycl_kernel1(ptr addrspace(1) %__asan_launch) +; CHECK-LABEL: define spir_kernel void @sycl_kernel1(ptr addrspace(1) noundef %__asan_launch) entry: ; store ptr addrspace(1) %__asan_launch, ptr addrspace(3) @__AsanLaunchInfo, align 8 ret void } define spir_kernel void 
@sycl_kernel2() #0 { -; CHECK-LABEL: define spir_kernel void @sycl_kernel2(ptr addrspace(1) %__asan_launch) +; CHECK-LABEL: define spir_kernel void @sycl_kernel2(ptr addrspace(1) noundef %__asan_launch) entry: ; CHECK: store ptr addrspace(1) %__asan_launch, ptr addrspace(3) @__AsanLaunchInfo, align 8 call void @sycl_kernel1() @@ -22,3 +23,6 @@ entry: } attributes #0 = { sanitize_address } +;; sycl-device-global-size = 16 * 2 +;; sycl-host-access = 0 read-only +; CHECK: attributes #{{.*}} = { "sycl-device-global-size"="32" "sycl-device-image-scope" "sycl-host-access"="0" "sycl-unique-id"="_Z20__AsanKernelMetadata" } diff --git a/llvm/test/Instrumentation/AddressSanitizer/SPIRV/sycl_esimd.ll b/llvm/test/Instrumentation/AddressSanitizer/SPIRV/sycl_esimd.ll index ccb3de1092b4c..7794b3f513cbe 100644 --- a/llvm/test/Instrumentation/AddressSanitizer/SPIRV/sycl_esimd.ll +++ b/llvm/test/Instrumentation/AddressSanitizer/SPIRV/sycl_esimd.ll @@ -7,8 +7,12 @@ target triple = "spir64-unknown-unknown" ;; W/A: We skip asan completely if one module has esimd ;; +;; Although we disabled asan in this module, @__AsanKernelMetadata must exist since this module is compiled with asan +;; and we want the asan runtime to know this +; CHECK: @__AsanKernelMetadata + define spir_kernel void @sycl_kernel(ptr addrspace(1) %p) #0 { -; CHECK-LABEL: define spir_kernel void @sycl_kernel(ptr addrspace(1) %p, ptr addrspace(1) %__asan_launch) #0 +; CHECK-LABEL: define spir_kernel void @sycl_kernel(ptr addrspace(1) %p) entry: %0 = load i32, ptr addrspace(1) %p, align 4 ; CHECK-NOT: store ptr addrspace(1) %__asan_launch, ptr addrspace(3) @__AsanLaunchInfo, align 8 @@ -17,7 +21,7 @@ entry: } define spir_kernel void @esimd_kernel(ptr addrspace(1) %p) #0 !sycl_explicit_simd !1 { -; CHECK-LABEL: define spir_kernel void @esimd_kernel(ptr addrspace(1) %p, ptr addrspace(1) %__asan_launch) #0 +; CHECK-LABEL: define spir_kernel void @esimd_kernel(ptr addrspace(1) %p) entry: %0 = load i32, ptr addrspace(1) %p, 
align 4 ; CHECK-NOT: store ptr addrspace(1) %__asan_launch, ptr addrspace(3) @__AsanLaunchInfo, align 8 diff --git a/llvm/test/tools/sycl-post-link/device-sanitizer/asan.ll b/llvm/test/tools/sycl-post-link/device-sanitizer/asan.ll index c829f2dca9120..7fa89b236fa03 100644 --- a/llvm/test/tools/sycl-post-link/device-sanitizer/asan.ll +++ b/llvm/test/tools/sycl-post-link/device-sanitizer/asan.ll @@ -1,18 +1,24 @@ ; This test checks that the post-link tool properly generates "asanUsed=1" -; in [SYCL/misc properties] +; in prop file, and fixes the attributes and metadata of @__AsanKernelMetadata ; RUN: sycl-post-link -properties -split=kernel -symbols -S < %s -o %t.table -; RUN: FileCheck %s -input-file=%t_0.prop -; CHECK: [SYCL/misc properties] -; CHECK: asanUsed=1 + +; RUN: FileCheck %s -input-file=%t_0.prop --check-prefix CHECK-PROP +; CHECK-PROP: [SYCL/misc properties] +; CHECK-PROP: asanUsed=1 + +; RUN: FileCheck %s -input-file=%t_0.ll --check-prefix CHECK-IR ; ModuleID = 'parallel_for_int.cpp' source_filename = "parallel_for_int.cpp" target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-n8:16:32:64" target triple = "spir64-unknown-unknown" -$_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E11MyKernelR_4 = comdat any +$_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E8MyKernel = comdat any +@__asan_kernel = internal addrspace(1) constant [55 x i8] c"_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E8MyKernel\00" +@__AsanKernelMetadata = appending dso_local local_unnamed_addr addrspace(1) global [1 x { i64, i64 }] [{ i64, i64 } { i64 ptrtoint (ptr addrspace(1) @__asan_kernel to i64), i64 54 }] #2 +; CHECK-IR: @__AsanKernelMetadata {{.*}} !spirv.Decorations @__spirv_BuiltInGlobalInvocationId = external dso_local local_unnamed_addr addrspace(1) constant <3 x i64>, align 32 @__asan_func = internal addrspace(2) constant [106 x i8] c"typeinfo name for main::'lambda'(sycl::_V1::handler&)::operator()(sycl::_V1::handler&) 
const::MyKernelR_4\00" @@ -20,7 +26,7 @@ $_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E11MyKernelR_4 = comdat any declare void @llvm.assume(i1 noundef) #0 ; Function Attrs: mustprogress norecurse nounwind sanitize_address uwtable -define weak_odr dso_local spir_kernel void @_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E11MyKernelR_4(ptr addrspace(1) noundef align 4 %_arg_array, i64 %__asan_launch) local_unnamed_addr #1 comdat !srcloc !7 !kernel_arg_buffer_location !8 !sycl_fixed_targets !9 { +define weak_odr dso_local spir_kernel void @_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E8MyKernel(ptr addrspace(1) noundef align 4 %_arg_array, i64 %__asan_launch) local_unnamed_addr #1 comdat !srcloc !7 !kernel_arg_buffer_location !8 !sycl_fixed_targets !9 { entry: call spir_func void @__itt_offload_wi_start_wrapper() %0 = load i64, ptr addrspace(1) @__spirv_BuiltInGlobalInvocationId, align 32, !noalias !10 @@ -46,6 +52,7 @@ declare spir_func void @__itt_offload_wi_finish_wrapper() attributes #0 = { mustprogress nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write) } attributes #1 = { mustprogress norecurse nounwind sanitize_address uwtable "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "sycl-module-id"="parallel_for_int.cpp" "sycl-optlevel"="2" "uniform-work-group-size"="true" } +attributes #2 = { "sycl-device-global-size"="16" "sycl-device-image-scope" "sycl-host-access"="0" "sycl-unique-id"="_Z20__AsanKernelMetadata" } !llvm.module.flags = !{!0, !1, !2} !opencl.spir.version = !{!3} diff --git a/llvm/tools/sycl-post-link/sycl-post-link.cpp b/llvm/tools/sycl-post-link/sycl-post-link.cpp index 0a15c42dc4333..e84faf464c42e 100644 --- a/llvm/tools/sycl-post-link/sycl-post-link.cpp +++ b/llvm/tools/sycl-post-link/sycl-post-link.cpp @@ -29,6 +29,7 @@ #include "llvm/IRReader/IRReader.h" #include "llvm/Linker/Linker.h" #include "llvm/Passes/PassBuilder.h" +#include "llvm/SYCLLowerIR/AsanKernelMetadata.h" #include 
"llvm/SYCLLowerIR/CompileTimePropertiesPass.h" #include "llvm/SYCLLowerIR/ComputeModuleRuntimeInfo.h" #include "llvm/SYCLLowerIR/DeviceConfigFile.hpp" @@ -790,6 +791,13 @@ processInputModule(std::unique_ptr M) { if (M->getTargetTriple().find("spir") != std::string::npos) Modified |= removeDeviceGlobalFromCompilerUsed(*M.get()); + // AddressSanitizer specific passes + if (isModuleUsingAsan(*M)) { + // Fix attributes and metadata of the global variable + // "__AsanKernelMetadata" + Modified |= runModulePass(*M); + } + // Transform Joint Matrix builtin calls to align them with SPIR-V friendly // LLVM IR specification. Modified |= runModulePass(*M); diff --git a/sycl-jit/jit-compiler/lib/rtc/DeviceCompilation.cpp b/sycl-jit/jit-compiler/lib/rtc/DeviceCompilation.cpp index 910bfef64cd0f..1d75136160e99 100644 --- a/sycl-jit/jit-compiler/lib/rtc/DeviceCompilation.cpp +++ b/sycl-jit/jit-compiler/lib/rtc/DeviceCompilation.cpp @@ -175,6 +175,8 @@ jit_compiler::compileDeviceCode(InMemoryFile SourceFile, DerivedArgList DAL{UserArgList}; const auto &OptTable = getDriverOptTable(); DAL.AddFlagArg(nullptr, OptTable.getOption(OPT_fsycl_device_only)); + DAL.AddFlagArg(nullptr, + OptTable.getOption(OPT_fno_sycl_dead_args_optimization)); DAL.AddJoinedArg( nullptr, OptTable.getOption(OPT_resource_dir_EQ), (DPCPPRoot + "/lib/clang/" + Twine(CLANG_VERSION_MAJOR)).str()); @@ -518,5 +520,11 @@ jit_compiler::parseUserArgs(View UserArgs) { "Runtime compilation of ESIMD kernels is not yet supported"); } + if (AL.hasFlag(OPT_fsycl_dead_args_optimization, + OPT_fno_sycl_dead_args_optimization, false)) { + return createStringError( + "Dead argument optimization must be disabled for runtime compilation"); + } + return std::move(AL); } diff --git a/sycl/cmake/modules/FetchUnifiedRuntime.cmake b/sycl/cmake/modules/FetchUnifiedRuntime.cmake index 60a97b0eba8e3..e9d52c5b23dac 100644 --- a/sycl/cmake/modules/FetchUnifiedRuntime.cmake +++ b/sycl/cmake/modules/FetchUnifiedRuntime.cmake @@ -116,7 +116,7 
@@ if(SYCL_UR_USE_FETCH_CONTENT) CACHE PATH "Path to external '${name}' adapter source dir" FORCE) endfunction() - set(UNIFIED_RUNTIME_REPO "https://github.com/Naghasan/unified-runtime.git") + set(UNIFIED_RUNTIME_REPO "https://github.com/oneapi-src/unified-runtime") include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/modules/UnifiedRuntimeTag.cmake) set(UMF_BUILD_EXAMPLES OFF CACHE INTERNAL "EXAMPLES") diff --git a/sycl/cmake/modules/UnifiedRuntimeTag.cmake b/sycl/cmake/modules/UnifiedRuntimeTag.cmake index bd3be13fd232e..b5c82c3ead941 100644 --- a/sycl/cmake/modules/UnifiedRuntimeTag.cmake +++ b/sycl/cmake/modules/UnifiedRuntimeTag.cmake @@ -1,7 +1,7 @@ -# commit 38ee6ce2a0400573c0c7c5da782bc32ff578fcc4 -# Merge: c7086f7f 91b6db04 +# commit db83117e830406b0d9950e24892dba868acba354 +# Merge: 0a90db9b c79df596 # Author: Callum Fare -# Date: Mon Nov 25 11:04:37 2024 +0000 -# Merge pull request #2303 from nrspruit/zeInitDrivers -# [L0] Implement Support for zeInitDrivers -set(UNIFIED_RUNTIME_TAG 7222f79b160681434250bac74f3aae0e4abc46f0) +# Date: Wed Nov 27 16:04:19 2024 +0000 +# Merge pull request #2261 from againull/againull/2d_block_exp +# Add new device descriptor to query 2D block array capabilities of the Intel GPU +set(UNIFIED_RUNTIME_TAG db83117e830406b0d9950e24892dba868acba354) diff --git a/sycl/doc/EnvironmentVariables.md b/sycl/doc/EnvironmentVariables.md index 5ee2c40542ced..f018aac9c261d 100644 --- a/sycl/doc/EnvironmentVariables.md +++ b/sycl/doc/EnvironmentVariables.md @@ -23,7 +23,7 @@ compiler and runtime. | `SYCL_ENABLE_DEFAULT_CONTEXTS` | '1' or '0' | Enable ('1') or disable ('0') creation of default platform contexts in SYCL runtime. The default context for each platform contains all devices in the platform. Refer to [Platform Default Contexts](extensions/supported/sycl_ext_oneapi_default_context.asciidoc) extension to learn more. Enabled by default on Linux and disabled on Windows. 
| | `SYCL_RT_WARNING_LEVEL` | Positive integer | The higher warning level is used the more warnings and performance hints the runtime library may print. Default value is '0', which means no warning/hint messages from the runtime library are allowed. The value '1' enables performance warnings from device runtime/codegen. The values greater than 1 are reserved for future use. | | `SYCL_USM_HOSTPTR_IMPORT` | Integer | Enable by specifying non-zero value. Buffers created with a host pointer will result in host data promotion to USM, improving data transfer performance. To use this feature, also set SYCL_HOST_UNIFIED_MEMORY=1. | -| `SYCL_EAGER_INIT` | Integer | Enable by specifying non-zero value. Tells the SYCL runtime to do as much as possible initialization at objects construction as opposed to doing lazy initialization on the fly. This may mean doing some redundant work at warmup but ensures fastest possible execution on the following hot and reportable paths. It also instructs PI plugins to do the same. Default is "0". | +| `SYCL_EAGER_INIT` | Integer | Enable by specifying non-zero value. Tells the SYCL runtime to do as much as possible initialization at objects construction as opposed to doing lazy initialization on the fly. This may mean doing some redundant work at warmup but ensures fastest possible execution on the following hot and reportable paths. It also instructs UR adapters to do the same. Default is "0". | | `SYCL_REDUCTION_PREFERRED_WORKGROUP_SIZE` | See [below](#sycl_reduction_preferred_workgroup_size) | Controls the preferred work-group size of reductions. | | `SYCL_ENABLE_FUSION_CACHING` | '1' or '0' | Enable ('1') or disable ('0') caching of JIT compilations for kernel fusion. Caching avoids repeatedly running the JIT compilation pipeline if the same sequence of kernels is fused multiple times. Default value is '1'. | | `SYCL_JIT_AMDGCN_PTX_KERNELS` | '1' or '0' | Enable ('1') or disable ('0') JIT compilation of kernels. 
Only supported for Nvidia and AMD backends. Note, that it is required to have a valid binary for the desired backend (AMD or CUDA), that was compiled with `-fsycl-embed-ir` in order to use JIT-ing. When JIT-ing is enabled SYCL runtime will try to cache and reuse JIT-compiled kernels, furthermore if a kernel uses specialization constants the compiler will attempt to materialize the values in place, turning them to de-facto compile time constants. Default is '0'. | @@ -153,23 +153,23 @@ For a description of parallel for range rounding in DPC++ see | | | `MinRangeX`: The minimum X dimension of the range such that range rounding is activated (Default 1024) | -## Controlling DPC++ Level Zero Plugin +## Controlling DPC++ Level Zero Adapter | Environment variable | Values | Description | | -------------------- | ------ | ----------- | | `SYCL_ENABLE_PCI` (Deprecated) | Integer | When set to 1, enables obtaining the GPU PCI address when using the Level Zero backend. The default is 1. This option is kept for compatibility reasons and is immediately deprecated. | -| `SYCL_PI_LEVEL_ZERO_DISABLE_USM_ALLOCATOR` | Any(\*) | Disable USM allocator in Level Zero plugin (each memory request will go directly to Level Zero runtime) | -| `SYCL_PI_LEVEL_ZERO_TRACK_INDIRECT_ACCESS_MEMORY` | Any(\*) | Enable support of the kernels with indirect access and corresponding deferred release of memory allocations in the Level Zero plugin. | +| `SYCL_PI_LEVEL_ZERO_DISABLE_USM_ALLOCATOR` | Any(\*) | Disable USM allocator in Level Zero adapter (each memory request will go directly to Level Zero runtime) | +| `SYCL_PI_LEVEL_ZERO_TRACK_INDIRECT_ACCESS_MEMORY` | Any(\*) | Enable support of the kernels with indirect access and corresponding deferred release of memory allocations in the Level Zero adapter. 
| `(*) Note: Any means this environment variable is effective when set to any non-null value.` -## Controlling DPC++ CUDA Plugin +## Controlling DPC++ CUDA Adapter | Environment variable | Values | Description | | -------------------- | ------ | ----------- | | `SYCL_PI_CUDA_MAX_LOCAL_MEM_SIZE` | Integer | Specifies the maximum size of a local memory allocation in bytes. If the value exceeds the device's capabilities then a `sycl::runtime_error` is thrown. In order for the full error message to be printed, `SYCL_RT_WARNING_LEVEL=2` must be set. The default value for `SYCL_PI_CUDA_MAX_LOCAL_MEM_SIZE` is determined by the hardware. | -## Controlling DPC++ HIP Plugin +## Controlling DPC++ HIP Adapter | Environment variable | Values | Description | | -------------------- | ------ | ----------- | @@ -232,7 +232,6 @@ variables in production code. | after_addHostAcc | print graph after addHostAccessor method | | always | print graph before and after each of the above methods | - ### `SYCL_UR_TRACE` Options `SYCL_UR_TRACE` accepts a bit-mask, so individual tracing types can be enabled. @@ -258,7 +257,7 @@ Supported tracing levels are in the table below Any valid combination of the above bit-masks can be used to enable/disable tracing of the corresponding caches. If the input value is not 0 and not a valid number, the disk cache tracing will be enabled (deprecated behavior). The default value is 0 and no tracing is enabled. -## Debugging variables for Level Zero Plugin +## Debugging variables for Level Zero Adapter :warning: **Warning:** the environment variables described below are used for development and debugging of DPC++ compiler @@ -267,15 +266,15 @@ variables in production code. | Environment variable | Values | Description | | -------------------- | ------ | ----------- | -| `SYCL_PI_LEVEL_ZERO_SINGLE_THREAD_MODE` | Integer | A single-threaded app has an opportunity to enable this mode to avoid overhead from mutex locking in the Level Zero plugin. 
A value greater than 0 enables single thread mode. A value of 0 disables single thread mode. The default is 0. | +| `SYCL_PI_LEVEL_ZERO_SINGLE_THREAD_MODE` | Integer | A single-threaded app has an opportunity to enable this mode to avoid overhead from mutex locking in the Level Zero adapter. A value greater than 0 enables single thread mode. A value of 0 disables single thread mode. The default is 0. | | `SYCL_PI_LEVEL_ZERO_USM_ALLOCATOR` | [EnableBuffers][;[MaxPoolSize][;[host\|device\|shared:][MaxPoolableSize][,[Capacity][,SlabMinSize]]]...] | EnableBuffers enables pooling for SYCL buffers, default 1, set to 0 to disable. MaxPoolSize is the maximum size of the pool, by default there is no size limit. MemType is host, device, shared or read_only_shared. Other parameters are values specified as positive integers with optional K, M or G suffix. MaxPoolableSize is the maximum allocation size that may be pooled, default 0 for shared, 2MB for host, 4MB for device and read_only_shared. Capacity is the number of allocations in each size range freed by the program but retained in the pool for reallocation, default 4. Size ranges follow this pattern: 64, 96, 128, 192, and so on, i.e., powers of 2, with one range in between. SlabMinSize is the minimum allocation size, 64KB for host and device, 2MB for shared and read_only_shared. Example: SYCL_PI_LEVEL_ZERO_USM_ALLOCATOR=1;32M;host:1M,4,64K;device:1M,4,64K;shared:0,0,2M| | `SYCL_PI_LEVEL_ZERO_BATCH_SIZE` | Integer | Sets a preferred number of compute commands to batch into a command list before executing the command list. A value of 0 causes the batch size to be adjusted dynamically. A value greater than 0 specifies fixed size batching, with the batch size set to the specified value. The default is 0. | | `SYCL_PI_LEVEL_ZERO_COPY_BATCH_SIZE` | Integer | Sets a preferred number of copy commands to batch into a command list before executing the command list. A value of 0 causes the batch size to be adjusted dynamically. 
A value greater than 0 specifies fixed size batching, with the batch size set to the specified value. The default is 0. | | `SYCL_PI_LEVEL_ZERO_FILTER_EVENT_WAIT_LIST` | Integer | When set to 0, disables filtering of signaled events from wait lists when using the Level Zero backend. The default is 0. | -| `SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE` | Any(\*) | This environment variable enables users to control use of copy engines for copy operations. If the value is an integer, it will allow the use of copy engines, if available in the device, in Level Zero plugin to transfer SYCL buffer or image data between the host and/or device(s) and to fill SYCL buffer or image data in device or shared memory. The value of this environment variable can also be a pair of the form "lower_index:upper_index" where the indices point to copy engines in a list of all available copy engines. The default is 0:0 when immediate command lists are being used on the device and 1 otherwise. (Also see description of SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS). | +| `SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE` | Any(\*) | This environment variable enables users to control use of copy engines for copy operations. If the value is an integer, it will allow the use of copy engines, if available in the device, in Level Zero adapter to transfer SYCL buffer or image data between the host and/or device(s) and to fill SYCL buffer or image data in device or shared memory. The value of this environment variable can also be a pair of the form "lower_index:upper_index" where the indices point to copy engines in a list of all available copy engines. The default is 0:0 when immediate command lists are being used on the device and 1 otherwise. (Also see description of SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS). 
| | `SYCL_PI_LEVEL_ZERO_USE_COMPUTE_ENGINE` | Integer | It can be set to an integer (>=0) in which case all compute commands will be submitted to the command-queue with the given index in the compute command group. If it is instead set to a negative value then all available compute engines may be used. The default value is "0" | -| `SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY` (experimental) | Integer | Allows the use of copy engine, if available in the device, in Level Zero plugin for device to device copy operations. The default is 0. This option is experimental and will be removed once heuristics are added to make a decision about use of copy engine for device to device copy operations. | -| `SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS` | Any(\*) | Enable support of device-scope events whose state is not visible to the host. If enabled mode is SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS=1 the Level Zero plugin would create all events having device-scope only and create proxy host-visible events for them when their status is needed (wait/query) on the host. If enabled mode is SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS=2 the Level Zero plugin would create all events having device-scope and add proxy host-visible event at the end of each command-list submission. The default is 0, meaning all events have host visibility. SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS is ignored when using immediate command lists (SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS = 1) and all events use default scope of 0. | +| `SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY` (experimental) | Integer | Allows the use of copy engine, if available in the device, in Level Zero adapter for device to device copy operations. The default is 0. This option is experimental and will be removed once heuristics are added to make a decision about use of copy engine for device to device copy operations. 
| +| `SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS` | Any(\*) | Enable support of device-scope events whose state is not visible to the host. If enabled mode is SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS=1 the Level Zero adapter would create all events having device-scope only and create proxy host-visible events for them when their status is needed (wait/query) on the host. If enabled mode is SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS=2 the Level Zero adapter would create all events having device-scope and add proxy host-visible event at the end of each command-list submission. The default is 0, meaning all events have host visibility. SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS is ignored when using immediate command lists (SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS = 1) and all events use default scope of 0. | | `SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS` | Integer | When set to a positive value enables use of Level Zero immediate commandlists, which means there is no batching and all commands are immediately submitted for execution. When set to 1, unique immediate commandlists are created for each SYCL queue. When set to 2, unique immediate commandlists are created per host thread per SYCL queue. Default is 1 on Intel® Data Center GPU Max Series running Linux and 0 elsewhere. | | `SYCL_PI_LEVEL_ZERO_USE_MULTIPLE_COMMANDLIST_BARRIERS` | Integer | When set to a positive value enables use of multiple Level Zero commandlists when submitting barriers. Default is 1. | | `SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_FILL` | Integer | When set to a positive value enables use of a copy engine for memory fill operations. Default is 0. | @@ -287,7 +286,7 @@ variables in production code. | `SYCL_PI_LEVEL_ZERO_USM_RESIDENT` | Integer | Bit-mask controls if/where to make USM allocations resident at the time of allocation. Input value is of the form 0xHSD, where 4-bits of D control device allocations, 4-bits of S control shared allocations, and 4-bits of H control host allocations. 
Each 4-bit component is holding one of the following values: "0" - then no special residency is forced, "1" - then allocation is made resident at the device of allocation, or "2" - then allocation is made resident on all devices in the context of allocation that have P2P access to the device of allocation. Default is 0x002, i.e. force full residency for device allocations only. | | `SYCL_PI_LEVEL_ZERO_USE_NATIVE_USM_MEMCPY2D` | Integer | When set to a positive value enables the use of Level Zero USM 2D memory copy operations. Default is 0. | -## Debugging variables for CUDA Plugin +## Debugging variables for CUDA Adapter :warning: **Warning:** the environment variables described below are used for development and debugging of DPC++ compiler diff --git a/sycl/doc/FAQ.md b/sycl/doc/FAQ.md index 3c80dab3f6699..499314254d99a 100644 --- a/sycl/doc/FAQ.md +++ b/sycl/doc/FAQ.md @@ -138,8 +138,9 @@ OpenCL 2.1, so any device, capable of OpenCL 2.1, should be supported. Otherwise, your OpenCL device must support `cl_khr_il_program` extension. Furthermore, developers can extend capabilities of the DPC++ Runtime to -non-OpenCL devices by writing correspondent plugins. To learn more, please -check out our [Plugin Interface Guide](design/PluginInterface.md). +non-OpenCL devices by writing correspondent adapters. To learn more, please +check out the +[Unified Runtime project](https://github.com/oneapi-src/unified-runtime). ### Q: DPC++ applications hang on Intel GPUs while working well on other devices **A:** One of the common reasons is Intel GPUs feature called "hang check". 
diff --git a/sycl/doc/GetStartedGuide.md b/sycl/doc/GetStartedGuide.md index 366a9ae49ea35..10359d6fefce3 100644 --- a/sycl/doc/GetStartedGuide.md +++ b/sycl/doc/GetStartedGuide.md @@ -256,24 +256,22 @@ instruction on how to install this refer to The DPC++ build assumes that ROCm is installed in `/opt/rocm`, if it is installed somewhere else, the directory must be provided through the CMake -variable `SYCL_BUILD_PI_HIP_ROCM_DIR` which can be passed using the +variable `UR_HIP_ROCM_DIR` which can be passed using the `--cmake-opt` option of `configure.py` as follows: ```sh python $DPCPP_HOME/llvm/buildbot/configure.py --hip \ - --cmake-opt=-DSYCL_BUILD_PI_HIP_ROCM_DIR=/usr/local/rocm + --cmake-opt=-DUR_HIP_ROCM_DIR=/usr/local/rocm ``` If further customization is required — for instance when the layout of -individual directories can not be inferred from `SYCL_BUILD_PI_HIP_ROCM_DIR` — +individual directories can not be inferred from `UR_HIP_ROCM_DIR` — it is possible to specify the location of HIP include, HSA include and HIP library directories, using the following CMake variables: -* `SYCL_BUILD_PI_HIP_INCLUDE_DIR`, -* `SYCL_BUILD_PI_HIP_HSA_INCLUDE_DIR`, -* `SYCL_BUILD_PI_HIP_LIB_DIR`. -Please note that a similar customization would also be required for Unified -Runtime, see [the list of options provided by its -CMake](https://github.com/oneapi-src/unified-runtime#cmake-standard-options) -for details. +* `UR_HIP_INCLUDE_DIR`, +* `UR_HIP_HSA_INCLUDE_DIR`, +* `UR_HIP_LIB_DIR`. +These options are all passed through to Unified Runtime, more detail about them +can be found [here](https://github.com/oneapi-src/unified-runtime#cmake-standard-options). [LLD](https://llvm.org/docs/AMDGPUUsage.html) is necessary for the AMDGPU compilation chain. The AMDGPU backend generates a standard ELF relocatable code @@ -355,7 +353,7 @@ under the hood to provide implementations of various SYCL backends. 
By default the source code for the Unified Runtime will be acquired using CMake's [FetchCotent](https://cmake.org/cmake/help/latest/module/FetchContent.html). The specific repository URL and revision tag used can be found in the file -`sycl/plugins/unified_runtime/CMakeLists.txt` searching for the variables +`sycl/cmake/modules/FetchUnifiedRuntime.cmake` searching for the variables `UNIFIED_RUNTIME_REPO` and `UNIFIED_RUNTIME_TAG`. In order to enable developers, a number of CMake variables are available to @@ -373,7 +371,7 @@ control which revision of Unified Runtime should be used when building DPC++: * The path specified by variable `SYCL_UR_SOURCE_DIR` will be used with `add_directory()`. This can be used to point at an adjacent directory containing a clone of the Unified Runtime repository. - * The path `sycl/plugins/unified_runtime/unified-runtime` will be used, if it + * The path `sycl/unified-runtime` will be used, if it exists. This can be used as-if an in-tree build. * `SYCL_UR_SOURCE_DIR` is a variable used to specify the path to the Unified Runtime repository when `SYCL_UR_USE_FETCH_CONTENT` is set of `OFF`. diff --git a/sycl/doc/design/Assert.md b/sycl/doc/design/Assert.md index 67b3da4203496..2105b8644f8ce 100644 --- a/sycl/doc/design/Assert.md +++ b/sycl/doc/design/Assert.md @@ -74,8 +74,8 @@ practical cases. and running on the device. - Native Device Compiler - compiler which generates device-native binary image based on input SPIR-V image. - - Low-level Runtime - the backend/runtime behind DPCPP Runtime attached via the - Plugin Interface. + - Low-level Runtime - the backend/runtime behind DPCPP Runtime accessed via + Unified Runtime. ## How it works? @@ -92,8 +92,8 @@ ID and the local ID of the failing work item. Implementation of this function is supplied by Native Device Compiler for safe approach or by DPCPP Compiler for fallback one. 
-In order to distinguish which implementation to use, DPCPP Runtime checks for -`PI_DEVICE_INFO_EXTENSION_DEVICELIB_ASSERT` extension. If the extension isn't +In order to distinguish which implementation to use, DPCPP Runtime checks the +backend for the "cl_intel_devicelib_assert" extension. If the extension isn't available, then fallback implementation is used. @@ -102,8 +102,7 @@ available, then fallback implementation is used. This is the preferred approach and implementations should use it when possible. It guarantees assertion failure notification delivery to the host regardless of kernel behavior which hit the assertion. If backend suports the safe approach, -it must report this capability to DPCPP Runtime via the -`PI_DEVICE_INFO_EXTENSION_DEVICELIB_ASSERT` extension query. +it must report support for the "cl_intel_devicelib_assert" device extension. The Native Device Compiler is responsible for providing implementation of `__devicelib_assert_fail` which completely hides details of communication @@ -128,10 +127,9 @@ The following sequence of events describes how user code gets notified: ## Fallback approach If Device-side Runtime doesn't support `__devicelib_assert_fail` (as reported -via `PI_DEVICE_INFO_EXTENSION_DEVICELIB_ASSERT` extension query) then a fallback -approach comes in place. The approach doesn't require any support from -Device-side Runtime and Native Device Compiler. Neither it does from Low-level -Runtime. +via "cl_intel_devicelib_assert" extension query) then a fallback approach comes +in place. The approach doesn't require any support from Device-side Runtime and +Native Device Compiler. Neither it does from Low-level Runtime. Within this approach, a mutable program scope variable is introduced. This variable stores a flag which says if an assert failure was encountered. Fallback @@ -234,8 +232,8 @@ with `#ifdef`'s. When in DPCPP Runtime Library this knowledge is obtained from device binary image descriptor's property sets. 
Each device image is supplied with an array of property sets. For description -of property sets see `struct pi_device_binary_struct` in -[`pi.h`](https://github.com/intel/llvm/blob/sycl/sycl/include/sycl/detail/pi.h#L692) +of property sets see `struct sycl_device_binary_struct` in +[`compiler.hpp`](https://github.com/intel/llvm/blob/7fc17d4b58e1b43882570a330afd84fab219aeba/sycl/source/detail/compiler.hpp#L132) A distinct property set `SYCL/assert used` is added. In this set a property with the name of the kernel is added whenever the kernel uses assert. The use of diff --git a/sycl/doc/design/CompilerAndRuntimeDesign.md b/sycl/doc/design/CompilerAndRuntimeDesign.md index 1935356d0a3a8..d021b2d1915db 100644 --- a/sycl/doc/design/CompilerAndRuntimeDesign.md +++ b/sycl/doc/design/CompilerAndRuntimeDesign.md @@ -387,8 +387,8 @@ when the parent fat binary is unloaded. The registration function basically takes the pointer to the offload descriptor and invokes SYCL runtime library's registration function passing it as a parameter. -The offload descriptor type hierarchy is described in the `pi.h` header. The -top-level structure is `pi_device_binaries_struct`. +The offload descriptor type hierarchy is described in the `compiler.hpp` +header. The top-level structure is `sycl_device_binaries_struct`. #### Device Link @@ -677,7 +677,7 @@ define void @SYCL_generated_kernel(i32 %local_ptr_offset, i32 %arg, i32 %local_p } ``` -On the runtime side, when setting local memory arguments, the CUDA PI +On the runtime side, when setting local memory arguments, the CUDA UR implementation will internally set the argument as the offset with respect to the accumulated size of used local memory. This approach preserves the existing PI interface. @@ -797,7 +797,7 @@ For the HIP backend, the resulting fused kernel is compiled to an AMDGCN binary by the kernel fusion JIT compiler at runtime, however this output requires finalization by `lld`. 
Rather than adding another dependancy to the fusion jit, a `Requires finalization` property is added the binary. The HIP -PI plugin/UR adapter will then use the AMD Compiler Object Manager library +UR adapter will then use the AMD Compiler Object Manager library (`comgr`, part of the ROCm package) in order to finalize it into a loadable format. diff --git a/sycl/doc/design/DeviceGlobal.md b/sycl/doc/design/DeviceGlobal.md index f864410e72ba8..3bbcc55b9a289 100644 --- a/sycl/doc/design/DeviceGlobal.md +++ b/sycl/doc/design/DeviceGlobal.md @@ -414,7 +414,7 @@ Each device code module that references one or more device global variables has an associated "SYCL/device globals" property set. The name of each property in this set is the `sycl-unique-id` string of a `device_global` variable that is contained by the module. The value of each property has -property type `PI_PROPERTY_TYPE_BYTE_ARRAY` and contains a structure with the +property type `SYCL_PROPERTY_TYPE_BYTE_ARRAY` and contains a structure with the following fields: ``` @@ -467,19 +467,19 @@ Several changes are needed to the DPC++ runtime #### Initializing the device global variables in device code When a DPC++ application submits a kernel, the runtime constructs a -`pi_program` containing this kernel that is compiled for the target device, if -such a `pi_program` does not yet exist. If the kernel resides in a device code -module that calls into a shared library, the runtime identifies a set of device -code modules that need to be online-linked together in order to construct the -`pi_program`. +`ur_program_handle_t` containing this kernel that is compiled for the target +device, if such a handle does not yet exist. If the kernel resides in a device +code module that calls into a shared library, the runtime identifies a set of +device code modules that need to be online-linked together in order to +construct the `ur_program_handle_t`. 
-After creating a `pi_program` and before invoking any kernel it contains, the -runtime does the following: +After creating a `ur_program_handle_t` and before invoking any kernel it +contains, the runtime does the following: * Scan the entries of the "SYCL/device globals" property sets of each device - code module that contributes to the the `pi_program` to get information about - each device global variable that is used by the `pi_program`. This - information is added to device global database. + code module that contributes to the `ur_program_handle_t` to get + information about each device global variable that is used by the + `ur_program_handle_t`. This information is added to device global database. * For each device global variable that is not decorated with the `device_image_scope` property: @@ -492,12 +492,12 @@ runtime does the following: - Regardless of whether the USM buffer has already been created for the variable, the runtime initializes the `usmptr` member in the *device - instance* of the variable by using a new [PI interface][7] which copies - data from the host to a global variable in a `pi_program`. It is a - simple matter to use this interface to overwrite the `usmptr` member with - the address of the USM buffer. + instance* of the variable by using a new [UR interface][7] which copies + data from the host to a global variable in a `ur_program_handle_t`. It is + a simple matter to use this interface to overwrite the `usmptr` member + with the address of the USM buffer. -[7]: <#new-pi-interface-to-copy-to-or-from-a-module-scope-variable> +[7]: <#new-ur-interface-to-copy-to-or-from-a-module-scope-variable> Note that the runtime does not need to initialize the `val` member variable of device global variables that are decorated with `device_image_scope` because @@ -535,16 +535,16 @@ member. If the variable is decorated with the `device_image_scope` property, the variable's value exists directly in the device code module, not in a USM buffer.
The runtime first uses the variable's unique string identifier to see -if there is a `pi_program` that contains the variable. If there is more than -one such `pi_program`, the runtime diagnoses an error by throwing -`errc::invalid`. If there is no such `pi_program`, the runtime scans all +if there is a `ur_program_handle_t` that contains the variable. If there is +more than one such program handle, the runtime diagnoses an error by throwing +`errc::invalid`. If there is no such program handle, the runtime scans all "SYCL/device globals" property sets to find the device code module that contains this variable and uses its normal mechanism for creating a -`pi_program` from this device code module. (The algorithm for creating device -code modules in the `sycl-post-link` tool ensures that there will be no more -than one module that contains the variable.) Finally, the runtime uses the -new [PI interface][7] to copy to or from the contents of the variable in this -`pi_program`. +`ur_program_handle_t` from this device code module. (The algorithm for +creating device code modules in the `sycl-post-link` tool ensures that there +will be no more than one module that contains the variable.) Finally, the +runtime uses the new [UR interface][7] to copy to or from the contents of the +variable in this program. It is possible that a device global variable with `device_image_scope` is not referenced by _any_ kernel, in which case the variable's unique string will not @@ -556,42 +556,44 @@ In all cases, the runtime uses `sizeof(T)` to determine if the copy operation will read or write beyond the end of the device global variable's storage. If so, the runtime diagnoses an error by throwing `errc::invalid`. -#### New PI interface to copy to or from a module scope variable +#### New UR interface to copy to or from a module scope variable -As noted above, we need new PI interfaces that can copy data to or from an -instance of a device global variable in a `pi_program`.
This functionality is -exposed as two new PI interfaces: +As noted above, we need new UR interfaces that can copy data to or from an +instance of a device global variable in a `ur_program_handle_t`. This +functionality is exposed as two new UR interfaces: ``` -pi_result piextEnqueueDeviceGlobalVariableRead( - pi_queue Queue, pi_program Program, const char *Name, pi_bool BlockingRead, - size_t Count, size_t Offset, void *Dst, pi_uint32 NumEventsInWaitList, - const pi_event *EventsWaitList, pi_event *Event); - -pi_result piextEnqueueDeviceGlobalVariableWrite( - pi_queue Queue, pi_program Program, const char *Name, pi_bool BlockingWrite, - size_t Count, size_t Offset, const void *Src, pi_uint32 NumEventsInWaitList, - const pi_event *EventsWaitList, pi_event *Event); +ur_result_t urEnqueueDeviceGlobalVariableRead( + ur_queue_handle_t hQueue, ur_program_handle_t hProgram, const char *name, + bool blockingRead, size_t count, size_t offset, void *pDst, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent); + +ur_result_t urEnqueueDeviceGlobalVariableWrite( + ur_queue_handle_t hQueue, ur_program_handle_t hProgram, const char *name, + bool blockingWrite, size_t count, size_t offset, const void *pSrc, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent); ``` -The `piextEnqueueDeviceGlobalVariableRead` function reads `Count` bytes at -byte-offset `Offset` from a device global variable in `Program` identified by -the name `Name`. The read data is stored in `Dst`. Likewise, the -`piextEnqueueDeviceGlobalVariableWrite` function reads `Count` bytes from `Dst` -and stores them at byte-offset `Offset` in the device global variable in -`Program` identified by the name `Name`. +The `urEnqueueDeviceGlobalVariableRead` function reads `count` bytes at +byte-offset `offset` from a device global variable in `hProgram` identified by +the name `name`. The read data is stored in `pDst`.
Likewise, the +`urEnqueueDeviceGlobalVariableWrite` function reads `count` bytes from `pSrc` +and stores them at byte-offset `offset` in the device global variable in +`hProgram` identified by the name `name`. -Both functions will enqueue the associated memory command on `Queue` where it -will first wait for `NumEventsInWaitList` events in `EventsWaitList` to finish. -`Event` will be populated with the event associated with resulting enqueued -command. If either `BlockingRead` or `BlockingWrite` is `true` the call will -block on the host until the enqueued command finishes execution. +Both functions will enqueue the associated memory command on `hQueue` where it +will first wait for `numEventsInWaitList` events in `phEventWaitList` to +finish. `phEvent` will be populated with the event associated with the resulting +enqueued command. If either `blockingRead` or `blockingWrite` is `true` the +call will block on the host until the enqueued command finishes execution. For `device_global` variables the `Name` parameter in calls to these functions is the same as the associated `sycl-unique-id` string. -The Level Zero backend has existing APIs that can implement these PI -interfaces. The plugin first calls [`zeModuleGetGlobalPointer()`][8] to get a +The Level Zero backend has existing APIs that can implement these UR +interfaces. The adapter first calls [`zeModuleGetGlobalPointer()`][8] to get a device pointer for the variable and then calls [`zeCommandListAppendMemoryCopy()`][9] to copy to or from that pointer. However, the documentation (and implementation) of `zeModuleGetGlobalPointer()` @@ -628,13 +630,13 @@ This must be changed to say something along these lines: The OpenCL backend has a proposed extension [`cl_intel_global_variable_access`][10] that defines functions `clEnqueueReadGlobalVariableINTEL()` and `clEnqueueWriteGlobalVariableINTEL()` -which can be easily used to implement these PI interfaces.
This DPC++ design +which can be easily used to implement these UR interfaces. This DPC++ design depends upon implementation of that OpenCL extension. [10]: The CUDA backend has existing APIs `cuModuleGetGlobal()` and `cuMemcpyAsync()` -which can be used to implement these PI interfaces. +which can be used to implement these UR interfaces. ## Design choices diff --git a/sycl/doc/design/GlobalObjectsInRuntime.md b/sycl/doc/design/GlobalObjectsInRuntime.md index 96a9b8fdb05e0..b56dd7767d108 100644 --- a/sycl/doc/design/GlobalObjectsInRuntime.md +++ b/sycl/doc/design/GlobalObjectsInRuntime.md @@ -89,16 +89,30 @@ are not wrapped with `GlobalHandler`. `GlobalHandler` or consider using `sycl::detail::SpinLock`, which has trivial constructor and destructor. -## Plugins +## Adapters -Plugin lifetime is managed by utilizing piPluginInit() and piTearDown(). -GlobalHandler::shutdown() will tear down all registered globals before SYCL RT -library is unloaded. It will invoke piTearDown() and unload() for each -plugin. piTearDown() is going to perform any necessary tear-down process at the -plugin PI level. These two APIs allow on-demand plugin lifetime management. SYCL -RT can control the beginning and the end of the plugin. +Adapter lifetime is managed in two ways: on a per-adapter basis with +`urAdapterGet`/`urAdapterRelease`, and on a global basis with +`urLoaderInit`/`urLoaderTearDown`. A call to `urAdapterRelease` will make any +subsequent use of the adapter in question invalid, but it **does not** call the +`dlclose` equivalent on the adapter library. A call to `urLoaderTearDown` once +all initialized adapters have been released will unload all the adapter +libraries at once. 
-![](images/plugin-lifetime.jpg) +`GlobalHandler::unloadPlugins` calls both of these APIs in sequence in a pattern +something like this (pseudo code): + +``` +for (adapter in initializedAdapters) { + urAdapterRelease(adapter); +} +urLoaderTearDown(); +``` + +Which in turn is called by either `shutdown_late()` or `shutdown_win()` +depending on platform. + +![](images/adapter-lifetime.jpg) ## Low-level runtimes diff --git a/sycl/doc/design/KernelFusionJIT.md b/sycl/doc/design/KernelFusionJIT.md index 303f28106cac1..c194c86c48729 100644 --- a/sycl/doc/design/KernelFusionJIT.md +++ b/sycl/doc/design/KernelFusionJIT.md @@ -115,7 +115,7 @@ In this case, the individual kernels and the `KernelFusionCommand` could have be To handle this case, the `KernelFusionCommand` for each queue remains in the fusion map until the next `ext_codeplay_start_fusion` on the same queue and does not undergo the cleanup process until its status has been set to `DELETE` by `ext_codeplay_start_fusion`. This way, the lifetime of the event associated with the `KernelFusionCommand` is extended such that it is still valid for synchronization when returned from a later call to `ext_codeplay_complete_fusion`. -Note that even though the `KernelFusionCommand` is associated with a particular queue and context, the associated event does not have an underlying PI event, i.e., it is essentially a host event. +Note that even though the `KernelFusionCommand` is associated with a particular queue and context, the associated event does not have an underlying UR event, i.e., it is essentially a host event. ## Fusion Process diff --git a/sycl/doc/design/KernelProgramCache.md b/sycl/doc/design/KernelProgramCache.md index 3c408cf809ede..fd242e7d9749d 100644 --- a/sycl/doc/design/KernelProgramCache.md +++ b/sycl/doc/design/KernelProgramCache.md @@ -439,8 +439,8 @@ The caching isn't done when: - Employ the same built object for multiple devices of the same ISA, capabilities and so on. 
*NOTE:* It's not really known if it's possible to check if two distinct devices are *exactly* the same. Probably this should be - an improvement request for plugins. By now it is assumed that two devices with - the same device id 2 are the same. + an improvement request for the UR adapters. By now it is assumed that two + devices with the same device id 2 are the same. - Improve testing: cover real use-cases. See currently covered cases [here](https://github.com/intel/llvm/blob/sycl/sycl/unittests/kernel-and-program/Cache.cpp). - Implement tool for exploring cache items (initially it is possible using OS diff --git a/sycl/doc/design/OptionalDeviceFeatures.md b/sycl/doc/design/OptionalDeviceFeatures.md index 679ef4dd69eb4..53e88c3d123f2 100644 --- a/sycl/doc/design/OptionalDeviceFeatures.md +++ b/sycl/doc/design/OptionalDeviceFeatures.md @@ -867,10 +867,10 @@ types: Property Name | Property Type ------------- | ------------- -"aspect" | `PI_PROPERTY_TYPE_BYTE_ARRAY` -"reqd\_sub\_group\_size" | `PI_PROPERTY_TYPE_BYTE_ARRAY` -"reqd\_work\_group\_size" | `PI_PROPERTY_TYPE_BYTE_ARRAY` -"fixed\_target" | `PI_PROPERTY_TYPE_BYTE_ARRAY` +"aspect" | `SYCL_PROPERTY_TYPE_BYTE_ARRAY` +"reqd\_sub\_group\_size" | `SYCL_PROPERTY_TYPE_BYTE_ARRAY` +"reqd\_work\_group\_size" | `SYCL_PROPERTY_TYPE_BYTE_ARRAY` +"fixed\_target" | `SYCL_PROPERTY_TYPE_BYTE_ARRAY` The "aspect" property tells the set of aspects that a device must have in order to use the image. The image is only compatible with a device that supports diff --git a/sycl/doc/design/PluginInterface.md b/sycl/doc/design/PluginInterface.md deleted file mode 100644 index 7d0d4a262d3a6..0000000000000 --- a/sycl/doc/design/PluginInterface.md +++ /dev/null @@ -1,211 +0,0 @@ -# The DPC++ Runtime Plugin Interface. - -## Overview -The DPC++ Runtime Plugin Interface (PI) is an interface layer between the -device-agnostic part of DPC++ runtime and the device-specific runtime layers -which control execution on devices. 
It employs the “plugin” mechanism to bind -to the device specific runtime layers similar to what is used by libomptarget -or OpenCL. - -The picture below illustrates the placement of PI within the overall DPC++ -runtime stack. Dotted lines show components or paths which are not yet available -in the runtime, but are likely to be developed. -![PI in DPC++ runtime architecture](images/RuntimeArchitecture.svg) - -The plugin interface and the discovery process behind it allows to dynamically -plug in implementations based on OpenCL and “native” runtime for a particular -device – such as OpenCL for -FPGA devices or native runtimes for GPUs. Implementations of the PI are -“plugins” - dynamic libraries or shared objects which expose a number of entry -points implementing the PI interface. The DPC++ runtime collects those function -pointers into a PI interface dispatch table - one per plugin - and uses this -table to dispatch to the device(s) covered by the corresponding plugin. - -PI is based on a subset of OpenCL 1.2 runtime specification, it follows OpenCL's -platform, execution and memory models in all aspects except for those explicitly -mentioned in this document. Some of PI API types and functions have exact -matches in OpenCL. Whenever there is such a match, the semantics also fully -match unless the differences are explicitly specified in this document. While -PI has roots in OpenCL, it does have many differences, and the gap is likely -to grow, for example in areas of memory model and management, program -management. - -## Discovery and linkage of PI implementations - -![PI implementation discovery](images/PluginDiscovery.svg) - -Device discovery phase enumerates all available devices and their features by -querying underlying plugins found in the system. This process is performed when -all attached platforms or devices are queried in an application; for example, -during device selection. 
- -### Plugin discovery - -Plugins are physically dynamic libraries or shared objects. -The process to discover plugins follows the following guidelines. - -The DPC++ Runtime reads the names of the plugins from a configuration file -at a predetermined location (TBD - Add this location). These plugins are -searched at locations in env LD_LIBRARY_PATH on Linux and env PATH on Windows. -(TBD - Extend to search the plugins at a path relative to the SYCL Runtime -installation directory by using DT_RPATH on Linux. Similar functionality can be -achieved on Windows using SetDllDirectory. This will help avoiding extra setting -of LD_LIBRARY_PATH.) -To avoid any issues with read-only access, an environment variable -SYCL_PI_CONFIG can be set to point to the configuration file which lists the -Plugin names. The enviroment variable if set overrides the predetermined -location's config file. These Plugins are then be searched in LD_LIBRARY_PATH -locations. It is the developer's responsibility to include the plugin names from -the predetermined location's config file to enable discovery of all plugins. -(TBD - Extend to support search in DT_RPATH as above.) -In the current implementation the plugin names are hardcoded in the library. -Configuration file or env SYCL_PI_CONFIG is currently not being considered. - -A trace mechanism is provided using env SYCL_PI_TRACE to log the discovery/ -binding/ device enumeration process. Different levels of tracing can be achieved -with different values of SYCL_PI_TRACE. -SYCL_PI_TRACE=0x01 provides basic trace of plugins discovered and bound. It also -lists the device selector's selected device information. -SYCL_PI_TRACE=0x02 provides trace of all PI calls made from the DPC++ runtime -with arguments and returned values. -SYCL_PI_TRACE=-1 lists all PI Traces above and more debug messages. - -#### Plugin binary interface -Plugins should implement all the Interface APIs required for the PI Version it -supports. 
There is pi.def/ pi.h file listing all PI API names that can be -called by the specific version of Plugin Interface. It exports a function - -"piPluginInit" that returns the plugin details and function pointer table -containing the list of pointers to implemented Interface Functions defined in -pi.h. In the future, this document will list the minimum set of Interface APIs -to be supported by Plugins. This will also require adding functionality to SYCL -Runtime to work with such limited functionality plugins. - -(TBD - list and describe the symbols that a plugin must implement in order to -be picked up by the DPC++ runtime for offload.) - -#### Binding a Plugin -The DPC++ Runtime loads all discovered Plugins and tries to bind them by calling -piPluginInit API for each loaded Plugin. The Plugins return the information of -supported PI version and the list of implemented PI API Function pointers. -(TBD - Use the PI API Version information and check for compatibility. -Extend to support version compatibility checks without loading the library. -Eg:Changing the plugin name to reflect the supported Plugin Interface version.) -The information of compatible plugins (with the Function Pointer Table) is -stored in the associated platforms during platform object construction. -The PI API calls are later forwarded using this information. -A plugin is said to "bind" after this process completes with no errors. -During device selection, the user can prefer selection of a device from a -specific Plugin or Backend using the env ONEAPI_DEVICE_SELECTOR. The correspondence between -a plugin and a ONEAPI_DEVICE_SELECTOR value is currently hardcoded in the runtime. -( TBD: Make this a part of configuration file). -Eg: ONEAPI_DEVICE_SELECTOR=opencl:* corresponds to OpenCL Plugin. - -#### OpenCL plugin - -OpenCL plugin is a usual plugin from DPC++ runtime standpoint, but its loading -and initialization involves a nested discovery process which finds out available -OpenCL implementations. 
They can be installed either in the standard Khronos -ICD-compatible way (e.g. listed in files under /etc/OpenCL/vendors on -Linux) or not, and the OpenCL plugin can hook up with both. - -TBD - implement and describe the nested OpenCL implementation discovery process -performed by the OpenCL plugin - -### Device enumeration by plugins -Devices from all bound plugins are queried and listed as and when required, eg: -during device selection in device_selector. -The trace shows the PI API calls made when using SYCL_PI_TRACE=-1. -(TBD - Add the trace to list all available devices when plugins are successfully -bound.) - -### Plugin Unloading -The plugins not chosen to be connected to should be unloaded. piInitializePlugins() -can be called to load and bound the necessary plugins. In addition, piTearDown() -can be called when plugins are not needed any more. It notifies each -plugin to start performing its own tear-down process such as global memory -deallocation. In the future, piTearDown() can include any other jobs that need to -be done before the plugin is unloaded from memory. Possibly, a -notification of the plugin unloading to lower-level plugins can be added so that -they can clean up their own memory [TBD]. -After piTearDown() is called, the plugin can be safely unloaded by calling unload(), -which is going to invoke OS-specific system calls to remove the dynamic library -from memory. - -Each plugin should not create global variables that require non-trivial -destructor. Pointer variables with heap memory allocation is a good example -to be created at the global scope. A std::vector object is not. piTearDown -will take care of deallocation of these global variables safely. - -## PI API Specification - -PI interface is logically divided into few subsets: -- **Core API** which must be implemented by all plugins for DPC++ runtime to be -able to operate on the corresponding device. 
The core API further breaks down -into - - **OpenCL-based** APIs which have OpenCL origin and semantics - - **Extension** APIs which don't have counterparts in the OpenCL -- **Interoperability API** which allows interoperability with underlying -runtimes such as OpenCL. - -See pi.h header for the full list and descriptions of PI APIs. - -### The Core OpenCL-based PI APIs - -This subset defines functions representing core functionality, -such as device memory management, kernel creation and parameter setting, -enqueuing kernel for execution, etc. Functions in this subset fully match -semantics of the corresponding OpenCL functions, for example: - - piKernelCreate - piKernelRelease - piKernelSetArg - -### The Extension PI APIs - -Those APIs don't have OpenCL counter parts and require full specification. For -example, the function below selects the most appropriate device binary based -on runtime information and the binary's characteristics -``` -pi_result piextDeviceSelectBinary( - pi_device device, - pi_device_binary * binaries, - pi_uint32 num_binaries, - pi_device_binary * selected_binary); -``` - -PI also defines few types and string tags to describe a device binary image. -Those are used to communicate to plugins information about the images where it -is needed, currently only in the above function. The main -type is ```pi_device_binary```, whose detailed description can also be found -in the header. The layout of this type strictly matches the layout of the -corresponding device binary descriptor type defined in the -```clang-offload-wrapper``` tool which wraps device binaries into a host -object for further linkage. The wrapped binaries reside inside this descriptor -in a data section. - -### The Interoperability PI APIs - -These are APIs needed to implement DPC++ runtime interoperability with -underlying "native" device runtimes such as OpenCL. 
- -Interoperability extension APIs have been added to get native runtime handles -from the backend-agnostic PI Objects or to create PI Objects using the native -handles. Eg: - -``` -pi_result piextDeviceGetNativeHandle( - pi_device device, - pi_native_handle *nativeHandle); - -pi_result piextDeviceCreateWithNativeHandle( - pi_native_handle nativeHandle, - pi_device *device); - -``` - -### PI Extension mechanism - -TBD This section describes a mechanism for DPC++ or other runtimes to detect -availability of and obtain interfaces beyond those defined by the PI dispatch. - -TBD Add API to query PI version supported by plugin at runtime. diff --git a/sycl/doc/design/PropagateCompilerFlagsToRuntime.md b/sycl/doc/design/PropagateCompilerFlagsToRuntime.md index c73bd55b357d8..05a6011c89074 100644 --- a/sycl/doc/design/PropagateCompilerFlagsToRuntime.md +++ b/sycl/doc/design/PropagateCompilerFlagsToRuntime.md @@ -109,26 +109,26 @@ In the SYCL runtime, the device image properties can be accessed to extract the associated optimization level. Once the optimization level is available, it is converted to its equivalent frontend option string (`-O0`, `-O1`, `-O2`, or `-O3`). This frontend option string is passed into a -query that is made to the plugin to identify the correct backend option. This +query that is made to the adapter to identify the correct backend option. This backend option is added to the existing list of compiler options and is sent to the backend. -### Changes to the plugin +### Changes to the adapter -A new plugin API has been added. It takes the frontend option string as input in -string format and returns `pi_result`. A string format is used for sending the -frontend option so that this API can be used for querying other frontend -options as well. The signature of this API is as follows: +A new unified runtime API has been added. It takes the frontend option string +as input in string format and returns `ur_result_t`. 
A string format is used +for sending the frontend option so that this API can be used for querying other +frontend options as well. The signature of this API is as follows: ```C++ -pi_result piPluginGetBackendOption(pi_platform platform, - const char *frontend_option, - const char **backend_option); +ur_result_t urPlatformGetBackendOption(ur_platform_handle_t hPlatform, + const char *pFrontendOption, + const char **ppPlatformOption); ``` -In the level-zero and OpenCL plugins, the table provided in the 'Requirements' +In the level-zero and OpenCL adapters, the table provided in the 'Requirements' section is used as a guide to identify the appropriate backend option. -The option is returned in `backend_option`. For other plugins (HIP, cuda), -empty string is returned. This API returns `PI_SUCCESS` for +The option is returned in `ppPlatformOption`. For other adapters (HIP, cuda), +empty string is returned. This API returns `UR_RESULT_SUCCESS` for valid inputs (frontend_option != ""). For invalid inputs, it returns -`PI_ERROR_INVALID_VALUE`. +`UR_RESULT_ERROR_INVALID_VALUE`. diff --git a/sycl/doc/design/SYCL2020-SpecializationConstants.md b/sycl/doc/design/SYCL2020-SpecializationConstants.md index e05151794d62d..2e9b91969a313 100644 --- a/sycl/doc/design/SYCL2020-SpecializationConstants.md +++ b/sycl/doc/design/SYCL2020-SpecializationConstants.md @@ -225,7 +225,7 @@ ID was assigned to which symbolic ID). With help of `clang-offload-wrapper` tool, those device image properties are embedded into the application together with device code and used by DPC++ RT while handling specialization constants during application execution: it either -calls corresponding PI API to set a value of a specialization constant or it +calls corresponding UR API to set a value of a specialization constant or it fills a special buffer with values of specialization constants and passes it as kernel argument to emulate support of specialization constants. 
@@ -577,13 +577,13 @@ property_set { property { Name: "id_int_symbolic_ID", ValAddr: points to byte array [{0, 0, 4}], - Type: PI_PROPERTY_TYPE_BYTE_ARRAY, + Type: SYCL_PROPERTY_TYPE_BYTE_ARRAY, Size: sizeof(byte array above) }, property { Name: "id_A_symbolic_ID", ValAddr: points to byte array [{1, 0, 4}, {2, 4, 4}, {3, 8, 4}], - Type: PI_PROPERTY_TYPE_BYTE_ARRAY, + Type: SYCL_PROPERTY_TYPE_BYTE_ARRAY, Size: sizeof(byte array above) }, ] @@ -644,7 +644,7 @@ property_set { 1, 3.0, 4.0, // id_A 5.0, 6.0 // id_Nested ], - Type: PI_PROPERTY_TYPE_BYTE_ARRAY, + Type: SYCL_PROPERTY_TYPE_BYTE_ARRAY, Size: sizeof(byte array above) } ] @@ -950,14 +950,24 @@ If native specialization constants are supported by the target device, the runtime iterates through the value map and invokes ``` -pi_result piextProgramSetSpecializationConstant(pi_program prog, - pi_uint32 spec_id, - size_t spec_size, - const void *spec_value); +ur_result_t urProgramSetSpecializationConstants(ur_program_handle_t hProgram, + uint32_t count, + const ur_specialization_constant_info_t * + pSpecConstants) ``` -Plugin Interface function for descriptor of each property: `spec_id` and -`spec_size` are taken from the descriptor, `spec_value` is calculated based on +Unified Runtime function for descriptor of each property. The definition for +`ur_specialization_constant_info_t` is as follows: + +``` +struct ur_specialization_constant_info_t { + uint32_t id; + size_t size; + const void *pValue; +}; +``` + +`id` and `size` are taken from the descriptor, `pValue` is calculated based on address of the specialization constant provided by user and `offset` field of the descriptor as `(char*)(SpecConstantValuesMap[SymbolicID]) + offset`. 
diff --git a/sycl/doc/design/SYCLInstrumentationUsingXPTI.md b/sycl/doc/design/SYCLInstrumentationUsingXPTI.md index bac15cd555b91..a019ba515fc4d 100644 --- a/sycl/doc/design/SYCLInstrumentationUsingXPTI.md +++ b/sycl/doc/design/SYCLInstrumentationUsingXPTI.md @@ -73,12 +73,11 @@ To model this, we create a global graph for every application instantiation and all kernel executions in the applications are added as nodes in this global graph. In the SYCL runtime, there is no obvious location where the creation of the global graph can be inserted as many objects are -instantiated statically. Currently, we embed the graph creation in the -plugin interface (PI) layer `initialize()` call. In this call, we will -perform two operations: +instantiated statically. Currently, graph creation happens alongside UR +initialization in `initializePlugins` ([here](https://github.com/intel/llvm/blob/2137ff0e2ae0b478d341c12466bed0ac4402f516/sycl/source/detail/ur.cpp#L96)). +In this call, we will perform two operations: 1. Initialize all listeners and create a trace event to represent the graph. -This is done in `sycl/include/sycl/detail/pi.cpp`. 2. Send a `graph_create` event to all subscribers. This notification will only be sent once. diff --git a/sycl/doc/design/SYCLNativeCPU.md b/sycl/doc/design/SYCLNativeCPU.md index 28d19de097e76..86a78ae74b9a4 100644 --- a/sycl/doc/design/SYCLNativeCPU.md +++ b/sycl/doc/design/SYCLNativeCPU.md @@ -238,7 +238,7 @@ peeling loops. ## Kernel registration -In order to register the SYCL Native CPU kernels to the SYCL runtime, we applied a small change to the `clang-offload-wrapper` tool: normally, the `clang-offload-wrapper` bundles the offload binary in an LLVM-IR module. 
Instead of bundling the device code, for the SYCL Native CPU target we insert an array of function pointers to the `subhandler`s, and the `pi_device_binary_struct::BinaryStart` and `pi_device_binary_struct::BinaryEnd` fields, which normally point to the begin and end addresses of the offload binary, now point to the begin and end of the array. +In order to register the SYCL Native CPU kernels to the SYCL runtime, we applied a small change to the `clang-offload-wrapper` tool: normally, the `clang-offload-wrapper` bundles the offload binary in an LLVM-IR module. Instead of bundling the device code, for the SYCL Native CPU target we insert an array of function pointers to the `subhandler`s, and the `sycl_device_binary_struct::BinaryStart` and `sycl_device_binary_struct::BinaryEnd` fields, which normally point to the begin and end addresses of the offload binary, now point to the begin and end of the array. ``` ------------------------------------------------------- diff --git a/sycl/doc/design/SharedLibraries.md b/sycl/doc/design/SharedLibraries.md index 9bced26b71f67..f85160372cf00 100644 --- a/sycl/doc/design/SharedLibraries.md +++ b/sycl/doc/design/SharedLibraries.md @@ -256,11 +256,11 @@ mechanism. Each device image is supplied with an array of property sets: ```C++ -struct pi_device_binary_struct { +struct sycl_device_binary_struct { ... // Array of property sets - pi_device_binary_property_set PropertySetsBegin; - pi_device_binary_property_set PropertySetsEnd; + sycl_device_binary_property_set PropertySetsBegin; + sycl_device_binary_property_set PropertySetsEnd; }; ``` @@ -268,10 +268,10 @@ Each property set is represented by the following struct: ```C++ // Named array of properties. 
-struct _pi_device_binary_property_set_struct { +struct _sycl_device_binary_property_set_struct { char *Name; // the name - pi_device_binary_property PropertiesBegin; // array start - pi_device_binary_property PropertiesEnd; // array end + sycl_device_binary_property PropertiesBegin; // array start + sycl_device_binary_property PropertiesEnd; // array end }; ``` @@ -279,10 +279,10 @@ It contains name of property set and array of properties. Each property is represented by the following struct: ```C++ -struct _pi_device_binary_property_struct { +struct _sycl_device_binary_property_struct { char *Name; // null-terminated property name void *ValAddr; // address of property value - uint32_t Type; // _pi_property_type + uint32_t Type; // _sycl_property_type uint64_t ValSize; // size of property value in bytes }; ``` @@ -298,39 +298,42 @@ corresponding set has the name `SYCL/exported symbols`. DPC++ RT performs *device images collection* task by grouping all device images required to execute a kernel based on the list of exports/imports, creates -programs using collected images and links them together using PI API. +programs using collected images and links them together using UR API. Resulting program is then added to the cache to avoid repetition of symbol resolution, compilation, and linking processes for any future attempts to invoke kernels defined by this program. -#### DPC++ runtime plugin interface (PI) changes +#### DPC++ Unified Runtime changes During *device images collection* process RT considers modules as available for linking using information about ability of chosen device backend to compile and link programs created from particular device image format. The information about ability to compile and link particular format of device code is provided -by PI plugin implementation for concrete backend. For this purpose -`piDeviceGetInfo` API is used. For each device image format supported by DPC++ -RT PI device extension is defined. 
Each extension is a string that can be -returned by `piDeviceGetInfo` call with query `PI_DEVICE_INFO_EXTENSIONS`. -Mapping of extension strings and formats that can be linked: -| Device image format | Extension string | Meaning | +by UR adapter implementation for concrete backend. For this purpose the +`urDeviceSelectBinary` API is used. Each device image format supported by DPC++ +RT has a UR equivalent. To check if a backend is capable of linking a given +device image, a `ur_device_binary_t` struct is prepared with the appropriate UR +format string. Passing the struct to `urDeviceSelectBinary` will result in a +success code if the adapter supports the binary, or +`UR_RESULT_ERROR_INVALID_BINARY` otherwise. Mapping of device image formats to +their UR equivalents: +| Device image format | UR equivalent | Meaning | |---------------------|------------------|---------| -| `__SYCL_PI_DEVICE_BINARY_TARGET_SPIRV64` | "pi_ext_spirv64_linking" | Linking of SPIR-V 64-bit programs is supported| -| `__SYCL_PI_DEVICE_BINARY_TARGET_SPIRV64_X86_64` | "pi_ext_spirv64_x86_64_linking" | Linking of 64-bit programs that were AOT compiled for CPU device is supported| -| `__SYCL_PI_DEVICE_BINARY_TARGET_SPIRV64_GEN` | "pi_ext_spirv64_gen_linking" | Linking of 64-bit programs that were AOT compiled for GPU device is supported| -| `__SYCL_PI_DEVICE_BINARY_TARGET_SPIRV64_FPGA` | "pi_ext_spirv64_fpga_linking" | Linking of 64-bit programs that were AOT compiled for FPGA device is supported| - -To link several device images together `piProgramLink` API will be used.
-Depending on concrete plugin implementation and set of device image formats that -can be linked at run-time, `piProgramLink` API may receive programs made from +| `__SYCL_DEVICE_BINARY_TARGET_SPIRV64` | "UR_DEVICE_BINARY_TARGET_SPIRV64" | Linking of SPIR-V 64-bit programs is supported| +| `__SYCL_DEVICE_BINARY_TARGET_SPIRV64_X86_64` | "UR_DEVICE_BINARY_TARGET_SPIRV64_X86_64" | Linking of 64-bit programs that were AOT compiled for CPU device is supported| +| `__SYCL_DEVICE_BINARY_TARGET_SPIRV64_GEN` | "UR_DEVICE_BINARY_TARGET_SPIRV64_GEN" | Linking of 64-bit programs that were AOT compiled for GPU device is supported| +| `__SYCL_DEVICE_BINARY_TARGET_SPIRV64_FPGA` | "UR_DEVICE_BINARY_TARGET_SPIRV64_FPGA" | Linking of 64-bit programs that were AOT compiled for FPGA device is supported| + +To link several device images together `urProgramLink` API will be used. +Depending on concrete adapter implementation and set of device image formats that +can be linked at run-time, `urProgramLink` API may receive programs made from device images in different formats as inputs (including SPIR-V and native code). ##### Support of runtime linking in backends - The initial implementation will support dynamic linking of device code in SPIR-V format on OpenCL backend: - - OpenCL plugin will use the existing OpenCL `clLinkProgram()` API to online + - OpenCL adapter will use the existing OpenCL `clLinkProgram()` API to online link the SPIR-V modules together. - A new Level Zero API to online link programs on SPIR-V level is required for better performance. diff --git a/sycl/doc/design/UnifiedRuntime.md b/sycl/doc/design/UnifiedRuntime.md new file mode 100644 index 0000000000000..ab796a9d1cf0b --- /dev/null +++ b/sycl/doc/design/UnifiedRuntime.md @@ -0,0 +1,21 @@ +# Unified Runtime + +## Overview +The Unified Runtime project serves as an interface layer between the DPC++ +runtime and the device-specific runtime layers which control execution on +devices. 
The parts of it primarily utilized by DPC++ are its C API, loader +library, and the adapter libraries that implement the API for various backends. + +The DPC++ runtime accesses the UR api via the [Plugin](https://github.com/intel/llvm/blob/sycl/sycl/source/detail/plugin.hpp) +object. Each Plugin object owns a `ur_adapter_handle_t`, which represents a UR +backend (e.g. OpenCL, Level Zero, etc). + +The picture below illustrates the placement of UR within the overall DPC++ +runtime stack. Dotted lines show components or paths which are not yet available +in the runtime, but are likely to be developed. +![UR in DPC++ runtime architecture](images/RuntimeArchitecture.svg) + +For detailed information about the UR project including the API specification +see the +[Unified Runtime Documentation](https://oneapi-src.github.io/unified-runtime/core/INTRO.html). +You can find the Unified Runtime repo [here](https://github.com/oneapi-src/unified-runtime). diff --git a/sycl/doc/design/images/KernelFusionRuntimeWorkflow.svg b/sycl/doc/design/images/KernelFusionRuntimeWorkflow.svg index 125402fb324a2..d18cee31e58f1 100644 --- a/sycl/doc/design/images/KernelFusionRuntimeWorkflow.svg +++ b/sycl/doc/design/images/KernelFusionRuntimeWorkflow.svg @@ -2,14 +2,6 @@ image/svg+xmlcomplete + transform="matrix(1,0,0,-1,186.63,154.47)" + style="font-variant:normal;font-weight:normal;font-size:12.025px;font-family:ArialMT;-inkscape-font-specification:ArialMT;writing-mode:lr-tb;fill:#0c0c0c;fill-opacity:1;fill-rule:nonzero;stroke:none" + id="text96" + y="0">complete - + transform="matrix(1,0,0,-1,234.65,154.47)" + style="font-variant:normal;font-weight:normal;font-size:12.025px;font-family:ArialMT;-inkscape-font-specification:ArialMT;writing-mode:lr-tb;fill:#0c0c0c;fill-opacity:1;fill-rule:nonzero;stroke:none" + id="text108">- fusion + transform="matrix(1,0,0,-1,238.4,154.47)" + 
style="font-variant:normal;font-weight:normal;font-size:12.025px;font-family:ArialMT;-inkscape-font-specification:ArialMT;writing-mode:lr-tb;fill:#0c0c0c;fill-opacity:1;fill-rule:nonzero;stroke:none" + id="text120" + y="0">fusion Fuser + transform="matrix(1,0,0,-1,473.48,167.2)" + style="font-variant:normal;font-weight:normal;font-size:18.025px;font-family:Calibri;-inkscape-font-specification:Calibri;writing-mode:lr-tb;fill:#ffffff;fill-opacity:1;fill-rule:nonzero;stroke:none" + id="text136" + y="0">Fuser gather + transform="matrix(-4.4e-8,1,1,4.4e-8,499.52,225.75)" + style="font-variant:normal;font-weight:normal;font-size:12.025px;font-family:ArialMT;-inkscape-font-specification:ArialMT;writing-mode:lr-tb;fill:#0c0c0c;fill-opacity:1;fill-rule:nonzero;stroke:none" + id="text150" + y="0">gather DPC++ SYCL RT + transform="matrix(1,0,0,-1,411.17,431.75)" + style="font-variant:normal;font-weight:300;font-size:18.025px;font-family:Calibri;-inkscape-font-specification:Calibri-Light;writing-mode:lr-tb;fill:#0c0c0c;fill-opacity:1;fill-rule:nonzero;stroke:#0c0c0c;stroke-width:0.515;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:10;stroke-dasharray:none;stroke-opacity:1" + id="text162" + y="0">DPC++ SYCL RT queue + transform="matrix(1,0,0,-1,79.825,410.02)" + style="font-variant:normal;font-weight:normal;font-size:18.025px;font-family:Calibri;-inkscape-font-specification:Calibri;writing-mode:lr-tb;fill:#ffffff;fill-opacity:1;fill-rule:nonzero;stroke:none" + id="text190" + y="0">queue enable_fusion + transform="matrix(1,0,0,-1,55.175,389.27)" + style="font-variant:normal;font-weight:normal;font-size:17.275px;font-family:Calibri;-inkscape-font-specification:Calibri;writing-mode:lr-tb;fill:#ffffff;fill-opacity:1;fill-rule:nonzero;stroke:none" + id="text206" + y="0">enable_fusion Kernel 1 + transform="matrix(1,0,0,-1,183.3,333.02)" + 
style="font-variant:normal;font-weight:normal;font-size:18.025px;font-family:Calibri;-inkscape-font-specification:Calibri;writing-mode:lr-tb;fill:#ffffff;fill-opacity:1;fill-rule:nonzero;stroke:none" + id="text242" + y="0">Kernel 1 Kernel 2 + transform="matrix(1,0,0,-1,183.3,275.13)" + style="font-variant:normal;font-weight:normal;font-size:18.025px;font-family:Calibri;-inkscape-font-specification:Calibri;writing-mode:lr-tb;fill:#ffffff;fill-opacity:1;fill-rule:nonzero;stroke:none" + id="text274" + y="0">Kernel 2 Kernel 3 + transform="matrix(1,0,0,-1,182.9,218.63)" + style="font-variant:normal;font-weight:normal;font-size:18.025px;font-family:Calibri;-inkscape-font-specification:Calibri;writing-mode:lr-tb;fill:#ffffff;fill-opacity:1;fill-rule:nonzero;stroke:none" + id="text306" + y="0">Kernel 3 submit + transform="matrix(-4.4e-8,1,1,4.4e-8,97.3,295.95)" + style="font-variant:normal;font-weight:normal;font-size:12px;font-family:ArialMT;-inkscape-font-specification:ArialMT;writing-mode:lr-tb;fill:#0c0c0c;fill-opacity:1;fill-rule:nonzero;stroke:none" + id="text320" + y="0">submit JIT Compiler + transform="matrix(1,0,0,-1,739.55,433.8)" + style="font-variant:normal;font-weight:300;font-size:18.025px;font-family:Calibri;-inkscape-font-specification:Calibri-Light;writing-mode:lr-tb;fill:#0c0c0c;fill-opacity:1;fill-rule:nonzero;stroke:#0c0c0c;stroke-width:0.515;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:10;stroke-dasharray:none;stroke-opacity:1" + id="text336" + y="0">JIT Compiler SPIR + transform="matrix(1,0,0,-1,797.17,379.05)" + style="font-variant:normal;font-weight:normal;font-size:18.025px;font-family:Calibri;-inkscape-font-specification:Calibri;writing-mode:lr-tb;fill:#ffffff;fill-opacity:1;fill-rule:nonzero;stroke:none" + id="text368" + y="0">SPIR - + transform="matrix(1,0,0,-1,828.7,379.05)" + 
style="font-variant:normal;font-weight:normal;font-size:18.025px;font-family:Calibri;-inkscape-font-specification:Calibri;writing-mode:lr-tb;fill:#ffffff;fill-opacity:1;fill-rule:nonzero;stroke:none" + id="text380">- V to LLVM IR + transform="matrix(1,0,0,-1,833.97,379.05)" + style="font-variant:normal;font-weight:normal;font-size:18.025px;font-family:Calibri;-inkscape-font-specification:Calibri;writing-mode:lr-tb;fill:#ffffff;fill-opacity:1;fill-rule:nonzero;stroke:none" + id="text392" + y="0">V to LLVM IR Kernel Fusion + transform="matrix(1,0,0,-1,812.2,321.13)" + style="font-variant:normal;font-weight:normal;font-size:18.025px;font-family:Calibri;-inkscape-font-specification:Calibri;writing-mode:lr-tb;fill:#ffffff;fill-opacity:1;fill-rule:nonzero;stroke:none" + id="text424" + y="0">Kernel Fusion Constant + transform="matrix(1,0,0,-1,827.93,217.45)" + style="font-variant:normal;font-weight:normal;font-size:18.025px;font-family:Calibri;-inkscape-font-specification:Calibri;writing-mode:lr-tb;fill:#ffffff;fill-opacity:1;fill-rule:nonzero;stroke:none" + id="text456" + y="0">Constant Propagation + transform="matrix(1,0,0,-1,815.92,195.67)" + style="font-variant:normal;font-weight:normal;font-size:18.025px;font-family:Calibri;-inkscape-font-specification:Calibri;writing-mode:lr-tb;fill:#ffffff;fill-opacity:1;fill-rule:nonzero;stroke:none" + id="text468" + y="0">Propagation Internalization + transform="matrix(1,0,0,-1,807.3,264.63)" + style="font-variant:normal;font-weight:normal;font-size:18.025px;font-family:Calibri;-inkscape-font-specification:Calibri;writing-mode:lr-tb;fill:#ffffff;fill-opacity:1;fill-rule:nonzero;stroke:none" + id="text500" + y="0">Internalization LLVM Opt. + transform="matrix(1,0,0,-1,824.2,149.43)" + style="font-variant:normal;font-weight:normal;font-size:18.025px;font-family:Calibri;-inkscape-font-specification:Calibri;writing-mode:lr-tb;fill:#ffffff;fill-opacity:1;fill-rule:nonzero;stroke:none" + id="text532" + y="0">LLVM Opt. 
Input + transform="matrix(1,0,0,-1,682.97,408.33)" + style="font-variant:normal;font-weight:normal;font-size:18.025px;font-family:Calibri;-inkscape-font-specification:Calibri;writing-mode:lr-tb;fill:#ffffff;fill-opacity:1;fill-rule:nonzero;stroke:none" + id="text558" + y="0">Input Fusion Info: + transform="matrix(1,0,0,-1,669.2,387.4)" + style="font-variant:normal;font-weight:normal;font-size:14.275px;font-family:Calibri;-inkscape-font-specification:Calibri;writing-mode:lr-tb;fill:#ffffff;fill-opacity:1;fill-rule:nonzero;stroke:none" + id="text576" + y="0">Fusion Info: Constants + transform="matrix(1,0,0,-1,674.45,370.13)" + style="font-variant:normal;font-weight:normal;font-size:14.275px;font-family:Calibri;-inkscape-font-specification:Calibri;writing-mode:lr-tb;fill:#ffffff;fill-opacity:1;fill-rule:nonzero;stroke:none" + id="text588" + y="0">Constants Properties + transform="matrix(1,0,0,-1,673.7,353.6)" + style="font-variant:normal;font-weight:normal;font-size:14.25px;font-family:Calibri;-inkscape-font-specification:Calibri;writing-mode:lr-tb;fill:#ffffff;fill-opacity:1;fill-rule:nonzero;stroke:none" + id="text600" + y="0">Properties Ident. Param. + transform="matrix(1,0,0,-1,664.7,336.33)" + style="font-variant:normal;font-weight:normal;font-size:14.25px;font-family:Calibri;-inkscape-font-specification:Calibri;writing-mode:lr-tb;fill:#ffffff;fill-opacity:1;fill-rule:nonzero;stroke:none" + id="text612" + y="0">Ident. Param. 
SPIR + transform="matrix(1,0,0,-1,659.7,289.33)" + style="font-variant:normal;font-weight:normal;font-size:18.025px;font-family:Calibri;-inkscape-font-specification:Calibri;writing-mode:lr-tb;fill:#ffffff;fill-opacity:1;fill-rule:nonzero;stroke:none" + id="text630" + y="0">SPIR - + transform="matrix(1,0,0,-1,691.23,289.33)" + style="font-variant:normal;font-weight:normal;font-size:18.025px;font-family:Calibri;-inkscape-font-specification:Calibri;writing-mode:lr-tb;fill:#ffffff;fill-opacity:1;fill-rule:nonzero;stroke:none" + id="text642">- V + transform="matrix(1,0,0,-1,696.47,289.33)" + style="font-variant:normal;font-weight:normal;font-size:18.025px;font-family:Calibri;-inkscape-font-specification:Calibri;writing-mode:lr-tb;fill:#ffffff;fill-opacity:1;fill-rule:nonzero;stroke:none" + id="text654">V SPIR + transform="matrix(1,0,0,-1,680.65,271.42)" + style="font-variant:normal;font-weight:normal;font-size:18.025px;font-family:Calibri;-inkscape-font-specification:Calibri;writing-mode:lr-tb;fill:#ffffff;fill-opacity:1;fill-rule:nonzero;stroke:none" + id="text672" + y="0">SPIR - + transform="matrix(1,0,0,-1,712.17,271.42)" + style="font-variant:normal;font-weight:normal;font-size:18.025px;font-family:Calibri;-inkscape-font-specification:Calibri;writing-mode:lr-tb;fill:#ffffff;fill-opacity:1;fill-rule:nonzero;stroke:none" + id="text684">- V + transform="matrix(1,0,0,-1,717.42,271.42)" + style="font-variant:normal;font-weight:normal;font-size:18.025px;font-family:Calibri;-inkscape-font-specification:Calibri;writing-mode:lr-tb;fill:#ffffff;fill-opacity:1;fill-rule:nonzero;stroke:none" + id="text696">V SPIR + transform="matrix(1,0,0,-1,700.8,253.55)" + style="font-variant:normal;font-weight:normal;font-size:18.025px;font-family:Calibri;-inkscape-font-specification:Calibri;writing-mode:lr-tb;fill:#ffffff;fill-opacity:1;fill-rule:nonzero;stroke:none" + id="text714" + y="0">SPIR - + transform="matrix(1,0,0,-1,732.33,253.55)" + 
style="font-variant:normal;font-weight:normal;font-size:18.025px;font-family:Calibri;-inkscape-font-specification:Calibri;writing-mode:lr-tb;fill:#ffffff;fill-opacity:1;fill-rule:nonzero;stroke:none" + id="text726">- V + transform="matrix(1,0,0,-1,737.57,253.55)" + style="font-variant:normal;font-weight:normal;font-size:18.025px;font-family:Calibri;-inkscape-font-specification:Calibri;writing-mode:lr-tb;fill:#ffffff;fill-opacity:1;fill-rule:nonzero;stroke:none" + id="text738">V Output + transform="matrix(1,0,0,-1,676.2,186.7)" + style="font-variant:normal;font-weight:normal;font-size:18.025px;font-family:Calibri;-inkscape-font-specification:Calibri;writing-mode:lr-tb;fill:#ffffff;fill-opacity:1;fill-rule:nonzero;stroke:none" + id="text754" + y="0">Output SPIR + transform="matrix(1,0,0,-1,679.95,105.75)" + style="font-variant:normal;font-weight:normal;font-size:18.025px;font-family:Calibri;-inkscape-font-specification:Calibri;writing-mode:lr-tb;fill:#ffffff;fill-opacity:1;fill-rule:nonzero;stroke:none" + id="text772" + y="0">SPIR - + transform="matrix(1,0,0,-1,711.48,105.75)" + style="font-variant:normal;font-weight:normal;font-size:18.025px;font-family:Calibri;-inkscape-font-specification:Calibri;writing-mode:lr-tb;fill:#ffffff;fill-opacity:1;fill-rule:nonzero;stroke:none" + id="text784">- V + transform="matrix(1,0,0,-1,716.72,105.75)" + style="font-variant:normal;font-weight:normal;font-size:18.025px;font-family:Calibri;-inkscape-font-specification:Calibri;writing-mode:lr-tb;fill:#ffffff;fill-opacity:1;fill-rule:nonzero;stroke:none" + id="text796">V Kernel + transform="matrix(1,0,0,-1,679.95,156.03)" + style="font-variant:normal;font-weight:normal;font-size:18.025px;font-family:Calibri;-inkscape-font-specification:Calibri;writing-mode:lr-tb;fill:#ffffff;fill-opacity:1;fill-rule:nonzero;stroke:none" + id="text814" + y="0">Kernel Info + transform="matrix(1,0,0,-1,688.98,134.25)" + 
style="font-variant:normal;font-weight:normal;font-size:18.025px;font-family:Calibri;-inkscape-font-specification:Calibri;writing-mode:lr-tb;fill:#ffffff;fill-opacity:1;fill-rule:nonzero;stroke:none" + id="text826" + y="0">Info Plugin Interface (PI) + transform="scale(0.94614094,-1.056925)" + style="font-variant:normal;font-weight:normal;font-size:14.6712px;font-family:Calibri;-inkscape-font-specification:Calibri;writing-mode:lr-tb;fill:#ffffff;fill-opacity:1;fill-rule:nonzero;stroke:none;stroke-width:0.946141" + id="text860" + y="-66.935669" + x="458.50406">Unified Runtime (UR) scheduler + transform="matrix(1,0,0,-1,415.9,399.92)" + style="font-variant:normal;font-weight:normal;font-size:18.025px;font-family:Calibri;-inkscape-font-specification:Calibri;writing-mode:lr-tb;fill:#ffffff;fill-opacity:1;fill-rule:nonzero;stroke:none" + id="text878" + y="0">scheduler Fusion list: + transform="matrix(1,0,0,-1,416.33,313.73)" + style="font-variant:normal;font-weight:normal;font-size:18.025px;font-family:Calibri;-inkscape-font-specification:Calibri;writing-mode:lr-tb;fill:#ffffff;fill-opacity:1;fill-rule:nonzero;stroke:none" + id="text896" + y="0">Fusion list: List of Command + transform="matrix(1,0,0,-1,393.05,291.95)" + style="font-variant:normal;font-weight:normal;font-size:18.025px;font-family:Calibri;-inkscape-font-specification:Calibri;writing-mode:lr-tb;fill:#ffffff;fill-opacity:1;fill-rule:nonzero;stroke:none" + id="text908" + y="0">List of Command submit + transform="matrix(1,0,0,-1,229.3,412.17)" + style="font-variant:normal;font-weight:normal;font-size:12.025px;font-family:ArialMT;-inkscape-font-specification:ArialMT;writing-mode:lr-tb;fill:#0c0c0c;fill-opacity:1;fill-rule:nonzero;stroke:none" + id="text922" + y="0">submit store - \ No newline at end of file + transform="matrix(-4.4e-8,1,1,4.4e-8,448.17,353.15)" + 
style="font-variant:normal;font-weight:normal;font-size:12px;font-family:ArialMT;-inkscape-font-specification:ArialMT;writing-mode:lr-tb;fill:#0c0c0c;fill-opacity:1;fill-rule:nonzero;stroke:none" + id="text936" + y="0">store + diff --git a/sycl/doc/design/images/PluginDiscovery.svg b/sycl/doc/design/images/PluginDiscovery.svg deleted file mode 100644 index 2a462cd1fdb4e..0000000000000 --- a/sycl/doc/design/images/PluginDiscovery.svg +++ /dev/null @@ -1,702 +0,0 @@ - - - - - - image/svg+xml - - - - - - - - - - - - - - - - - - libsycl.so - - OCL ICD like discovery logic - - PI_device - X - _plugin - .so - - PI_OpenCL_plugin.so - - libOpenCL.so - - libdevice - X - _rt.so - - libOCL_ - Y - _rt.so - - libOCL_ - Z - _rt.so - - libOCL_ - CPU - _rt.so - - - - - - - - dlopen - dlopen - dlopen - - ICD - - - compatible OpenCL - installation - - “Custom” OpenCL - installation - - Non - - - OpenCL runtime - - diff --git a/sycl/doc/design/images/RuntimeArchitecture-with-fusion.svg b/sycl/doc/design/images/RuntimeArchitecture-with-fusion.svg index 2c6b9466e1362..c6e0f2568ba14 100644 --- a/sycl/doc/design/images/RuntimeArchitecture-with-fusion.svg +++ b/sycl/doc/design/images/RuntimeArchitecture-with-fusion.svg @@ -1,22 +1,21 @@ + xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape" + xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd" + xmlns="http://www.w3.org/2000/svg" + xmlns:svg="http://www.w3.org/2000/svg" + xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" + xmlns:cc="http://creativecommons.org/ns#" + xmlns:dc="http://purl.org/dc/elements/1.1/"> @@ -25,7 +24,6 @@ image/svg+xml - @@ -38,18 +36,23 @@ guidetolerance="10" inkscape:pageopacity="0" inkscape:pageshadow="2" - inkscape:window-width="1920" - inkscape:window-height="1043" + inkscape:window-width="1403" + inkscape:window-height="969" id="namedview5075" showgrid="false" - inkscape:zoom="0.90509668" - inkscape:cx="322.95986" - inkscape:cy="401.46487" - inkscape:window-x="0" - 
inkscape:window-y="37" + inkscape:zoom="0.6889987" + inkscape:cx="555.87913" + inkscape:cy="476.77884" + inkscape:window-x="233" + inkscape:window-y="119" inkscape:window-maximized="0" inkscape:current-layer="svg5073" - viewbox-width="1000" /> + inkscape:pagecheckerboard="0" + inkscape:document-units="mm" + fit-margin-top="0" + fit-margin-left="0" + fit-margin-right="0" + fit-margin-bottom="0" /> + Fusion plugin + + Fusion JIT + DPC++ application DPC++ runtime PI plugin + style="fill:#858585;fill-opacity:1;stroke-width:1.31122">UR Adapter Native runtime & driver Device - Fusion plugin + + Fuser + DPC++ runtime library Scheduler Device mgr Program & kernel mgr SYCL API Memory mgr CPU TBB RT SYCL host / host device SPIR - V Device X exe + Host device RT interface - - PI/OpenCL plugin + x="956.3551" + y="236.42232" + style="stroke-width:1.21799">interface OpenMP RT DPC++ Runtime Plugin Interface (PI) + style="fill:#ffffff;fill-opacity:1;stroke-width:1.23042">Unified Runtime Interface Device Z OpenCL Runtime PI types & services - - Device binary - mgmt + style="stroke-width:1.2124">UR types & API DeviceX native RT PI/X RT plugin + style="font-style:normal;font-variant:normal;font-weight:400;font-size:24.311px;font-family:'Intel Clear';fill:#ffffff;stroke-width:1.30093" + id="tspan4993">UR/X RT adapter Device Y Other layers Device X - PI discovery + style="font-style:normal;font-weight:normal;font-size:40px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none" + x="240.61531" + y="604.38959" + id="text432" /> & plugin infra + x="19.902292" + y="401.99237" + id="tspan34554" + style="fill:#8a8b8b;fill-opacity:1;stroke-width:1.31122">UR Loader +API Device Binary + x="291.80307" + y="235.02141" + id="tspan62343" + style="font-size:15.4422px;text-align:center;text-anchor:middle;fill:#ffffff;fill-opacity:1;stroke-width:0.890895">mgr + 
style="fill:#0082b3;fill-opacity:1;fill-rule:evenodd;stroke:none;stroke-width:1.09488" + d="m 489.41359,426.04946 v 29.06623 h 273.7227 v -29.06623 h -155.7192 -27.00183 z" + id="path96761" + inkscape:connector-curvature="0" + sodipodi:nodetypes="ccccccc" /> Fusion JIT - + x="536.64258" + y="441.89468" + id="tspan96765" + style="fill:#ffffff;fill-opacity:1;stroke-width:1.2124">UR loader + layers Fuser + x="533.41455" + y="490.1402" + id="tspan4877" + style="stroke-width:1.30093">UR/OpenCL adapter diff --git a/sycl/doc/design/images/RuntimeArchitecture.svg b/sycl/doc/design/images/RuntimeArchitecture.svg index 9c9e6ce472a8e..3cbf96f741e7c 100644 --- a/sycl/doc/design/images/RuntimeArchitecture.svg +++ b/sycl/doc/design/images/RuntimeArchitecture.svg @@ -1,21 +1,21 @@ + inkscape:export-ydpi="96" + xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape" + xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd" + xmlns="http://www.w3.org/2000/svg" + xmlns:svg="http://www.w3.org/2000/svg" + xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" + xmlns:cc="http://creativecommons.org/ns#" + xmlns:dc="http://purl.org/dc/elements/1.1/"> @@ -24,7 +24,6 @@ image/svg+xml - @@ -38,16 +37,18 @@ inkscape:pageopacity="0" inkscape:pageshadow="2" inkscape:window-width="1920" - inkscape:window-height="1137" + inkscape:window-height="1125" id="namedview5075" showgrid="false" - inkscape:zoom="0.97439131" - inkscape:cx="349.3594" - inkscape:cy="393.64385" - inkscape:window-x="1912" - inkscape:window-y="-8" + inkscape:zoom="1.3779974" + inkscape:cx="371.55368" + inkscape:cy="435.77731" + inkscape:window-x="0" + inkscape:window-y="0" inkscape:window-maximized="1" - inkscape:current-layer="svg5073" /> + inkscape:current-layer="svg5073" + inkscape:pagecheckerboard="0" + inkscape:document-units="mm" /> + DPC++ application DPC++ runtime PI plugin + style="stroke-width:1.31122;fill:#858585;fill-opacity:1">UR Adapter Native runtime & driver Device + DPC++ runtime library 
Scheduler Device mgr Program & kernel mgr SYCL API Memory mgr CPU TBB RT SYCL host / host device SPIR - V Device X exe Host device RT interface - - PI/OpenCL plugin + x="906.95837" + y="247.12788" + style="stroke-width:1.27314">interface OpenMP RT DPC++ Runtime Plugin Interface (PI) + style="fill:#ffffff;fill-opacity:1;stroke-width:1.23042">Unified Runtime Interface Device Z OpenCL Runtime PI types & services - - Device binary - mgmt + style="stroke-width:1.2124">UR types & API DeviceX native RT PI/X RT plugin + style="font-style:normal;font-variant:normal;font-weight:400;font-size:24.311px;font-family:'Intel Clear';fill:#ffffff;stroke-width:1.30093" + id="tspan4993">UR/X RT adapter Device Y Other layers Device X - + PI discovery + x="19.902292" + y="325.52847" + id="tspan34554" + style="stroke-width:1.31122;fill:#8a8b8b;fill-opacity:1">UR Loader +API & plugin infra + id="tspan62339" + x="282.74991" + y="226.91394" + style="font-size:16.2436px;text-align:center;text-anchor:middle;fill:#ffffff;fill-opacity:1;stroke-width:0.93713">Device Binarymgr + + x="536.64258" + y="361.60269" + id="tspan96765" + style="fill:#ffffff;fill-opacity:1;stroke-width:1.2124">UR loader + layers + + UR/OpenCL adapter diff --git a/sycl/doc/design/images/adapter-lifetime.jpg b/sycl/doc/design/images/adapter-lifetime.jpg new file mode 100644 index 0000000000000..1cd4f68e23df1 Binary files /dev/null and b/sycl/doc/design/images/adapter-lifetime.jpg differ diff --git a/sycl/doc/design/images/plugin-lifetime.jpg b/sycl/doc/design/images/plugin-lifetime.jpg deleted file mode 100644 index fcd3310f38a64..0000000000000 Binary files a/sycl/doc/design/images/plugin-lifetime.jpg and /dev/null differ diff --git a/sycl/doc/developer/ContributeToDPCPP.md b/sycl/doc/developer/ContributeToDPCPP.md index a096d99b33397..a141e69b901ce 100644 --- a/sycl/doc/developer/ContributeToDPCPP.md +++ b/sycl/doc/developer/ContributeToDPCPP.md @@ -35,7 +35,7 @@ See [Get Started Guide instructions](../GetStartedGuide.md) 
For any DPC++-related commit, the `[SYCL]` tag should be present in the commit message title. To a reasonable extent, additional tags can be used -to signify the component changed, e.g.: `[PI]`, `[CUDA]`, `[Doc]`. +to signify the component changed, e.g.: `[UR]`, `[CUDA]`, `[Doc]`. ## Using \ @@ -112,8 +112,11 @@ end-to-end or SYCL-CTS tests. unit tests. LIT tests make compile-time checks of DPC++ headers, e.g. device code IR verification, `static_assert` tests. Unit tests check DPC++ runtime behavior and do not perform any device code compilation, instead relying on - redefining plugin API with [PiMock](https://github.com/intel/llvm/blob/sycl/sycl/unittests/helpers/PiMock.hpp) when - necessary. + redefining the UR API with + [UrMock](https://github.com/intel/llvm/blob/sycl/sycl/unittests/helpers/UrMock.hpp) + and the + [Unified Runtime mocking interface](https://oneapi-src.github.io/unified-runtime/core/INTRO.html#mocking) + when necessary. When adding new test to `check-sycl`, please consider the following: diff --git a/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc b/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc index 8de3454b4b207..4313f8aafa84a 100644 --- a/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc +++ b/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc @@ -58,6 +58,7 @@ Jack Kirk, Codeplay + Ronan Keryell, AMD + Andrey Alekseenko, KTH Royal Institute of Technology + Fábio Mestre, Codeplay + +Konrad Kusiak, Codeplay + == Dependencies @@ -1979,6 +1980,12 @@ can be used adding nodes to a graph when creating a graph from queue recording. New methods are also defined that enable submitting an executable graph, e.g. directly to a queue without returning an event. +==== sycl_ext_oneapi_free_function_kernels + +`sycl_ext_oneapi_free_function_kernels`, defined in +link:../proposed/sycl_ext_oneapi_free_function_kernels.asciidoc[sycl_ext_oneapi_free_function_kernels] +can be used with SYCL Graphs. 
+ ==== sycl_ext_oneapi_work_group_scratch_memory The new property defined by @@ -1992,7 +1999,7 @@ Removing this restriction is something we may look at for future revisions of == Examples and Usage Guide Detailed code examples and usage guidelines are provided in the -link:../../SYCLGraphUsageGuide.md[SYCL Graph Usage Guide]. +link:../../syclgraph/SYCLGraphUsageGuide.md[SYCL Graph Usage Guide]. == Future Direction [[future-direction]] diff --git a/sycl/doc/extensions/supported/sycl_ext_intel_esimd/sycl_ext_intel_esimd.md b/sycl/doc/extensions/supported/sycl_ext_intel_esimd/sycl_ext_intel_esimd.md index c01223a2e4fe0..6319e9e4751e2 100644 --- a/sycl/doc/extensions/supported/sycl_ext_intel_esimd/sycl_ext_intel_esimd.md +++ b/sycl/doc/extensions/supported/sycl_ext_intel_esimd/sycl_ext_intel_esimd.md @@ -31,6 +31,7 @@ - [__regcall Calling convention](#__regcall-calling-convention) - [Inline assembly](#inline-assembly) - [Device aspect](#device-aspect) + - [Device Information Descriptors](#device-information-descriptors) - [Device queries and conditional dispatching of the code](#device-queries-and-conditional-dispatching-of-the-code) - [Implementation restrictions](#implementation-restrictions) - [Features not supported with the ESIMD extension](#features-not-supported-with-the-esimd-extension) @@ -1018,6 +1019,11 @@ The new aspect has the following behavior when queried via `device::has()`: |--------|-------------| |`aspect::ext_intel_esimd` | Indicates that the device supports the `sycl_ext_intel_esimd` extension as defined in this document. | +## Device Information Descriptors +| Device Descriptors | Return Type | Description | +| ------------------ | ----------- | ----------- | +| `ext::intel::esimd::info::device::has_2d_block_io_support` | bool | Returns a boolean indicating whether 2D load/store/prefetch instructions are supported by the device. 
| + ## Examples ### Vector addition (USM) ```cpp diff --git a/sycl/doc/extensions/supported/sycl_ext_intel_esimd/sycl_ext_intel_esimd_functions.md b/sycl/doc/extensions/supported/sycl_ext_intel_esimd/sycl_ext_intel_esimd_functions.md index 5fb775dddc3bb..ae897ebc59d5a 100644 --- a/sycl/doc/extensions/supported/sycl_ext_intel_esimd/sycl_ext_intel_esimd_functions.md +++ b/sycl/doc/extensions/supported/sycl_ext_intel_esimd/sycl_ext_intel_esimd_functions.md @@ -552,7 +552,7 @@ Loads and returns a vector `simd` where `N` is `BlockWidth * BlockHeight * `props` - The optional compile-time properties. Only cache hint properties are used. ### Restrictions -* This function is available only for Intel® Data Center GPU Max Series (aka PVC). +* This function is available only for devices with the `ext::intel::esimd::info::device::has_2d_block_io_support` information descriptor value equal to `true`. * `Cache-hint` properties, if passed, must follow the [rules](#valid-combinations-of-l1-and-l2-cache-hints-for-load-functions) for `load` functions. * `Transformed` and `Transposed` cannot be set to true at the same time. * `BlockWidth` * `BlockHeight` * `NBlocks` * sizeof(`T`) must not exceed 2048. @@ -598,7 +598,7 @@ Prefetches elements from a memory block of the size `BlockWidth * BlockHeight * `props` - The compile-time properties, which must specify cache-hints. ### Restrictions -* This function is available only for Intel® Data Center GPU Max Series (aka PVC). +* This function is available only for devices with the `ext::intel::esimd::info::device::has_2d_block_io_support` information descriptor value equal to `true`. * `Cache-hint` properties must follow the [rules](#valid-combinations-of-l1-and-l2-cache-hints-for-prefetch-functions) for `prefetch` functions. * `BlockWidth` * `BlockHeight` * `NBlocks` * sizeof(`T`) must not exceed 2048. * `NBlocks` must be {1,2,4} for `bytes` and `words`, {1,2} for `dwords`, 1 for `qwords`. 
@@ -630,7 +630,7 @@ Stores the vector `Vals` of the type `simd` to 2D memory block where `N` i `props` - The optional compile-time properties. Only cache hint properties are used. ### Restrictions -* This function is available only for Intel® Data Center GPU Max Series (aka PVC). +* This function is available only for devices with the `ext::intel::esimd::info::device::has_2d_block_io_support` information descriptor value equal to `true`. * `Cache-hint` properties, if passed, must follow the [rules](#valid-combinations-of-l1-and-l2-cache-hints-for-store-functions) for `store` functions. * `BlockWidth` * `BlockHeight` * sizeof(`T`) must not exceed 512. * `BlockHeight` must not exceed 8. diff --git a/sycl/doc/extensions/supported/sycl_ext_oneapi_discard_queue_events.asciidoc b/sycl/doc/extensions/supported/sycl_ext_oneapi_discard_queue_events.asciidoc index d56fbf052b7e7..f8e62d21a9a31 100644 --- a/sycl/doc/extensions/supported/sycl_ext_oneapi_discard_queue_events.asciidoc +++ b/sycl/doc/extensions/supported/sycl_ext_oneapi_discard_queue_events.asciidoc @@ -193,7 +193,7 @@ the `discard_events` property. But using local accessors does not affect optimiz === Using Level Zero backend -Since Level Zero Plugin support is required to be able to not create a low-level event, +Since Level Zero adapter support is required to be able to not create a low-level event, any queue operations using the Level Zero backend temporarily work without optimization. 
diff --git a/sycl/doc/index.rst b/sycl/doc/index.rst index bfe2292e1ddcc..cc4961dd7f438 100644 --- a/sycl/doc/index.rst +++ b/sycl/doc/index.rst @@ -28,7 +28,7 @@ Design Documents for the oneAPI DPC++ Compiler Clang Documentation design/CompilerAndRuntimeDesign design/KernelParameterPassing - design/PluginInterface + design/UnifiedRuntime design/SYCL2020-SpecializationConstants design/KernelProgramCache design/GlobalObjectsInRuntime diff --git a/sycl/include/sycl/aliases.hpp b/sycl/include/sycl/aliases.hpp index fc359b7659802..cc42d99565d9a 100644 --- a/sycl/include/sycl/aliases.hpp +++ b/sycl/include/sycl/aliases.hpp @@ -140,19 +140,11 @@ using cl_double = double; } // namespace opencl // Vector aliases are different between SYCL 1.2.1 and SYCL 2020 -#if SYCL_LANGUAGE_VERSION >= 202001 __SYCL_2020_MAKE_VECTOR_ALIASES_FOR_VECTOR_LENGTH(2) __SYCL_2020_MAKE_VECTOR_ALIASES_FOR_VECTOR_LENGTH(3) __SYCL_2020_MAKE_VECTOR_ALIASES_FOR_VECTOR_LENGTH(4) __SYCL_2020_MAKE_VECTOR_ALIASES_FOR_VECTOR_LENGTH(8) __SYCL_2020_MAKE_VECTOR_ALIASES_FOR_VECTOR_LENGTH(16) -#else -__SYCL_MAKE_VECTOR_ALIASES_FOR_VECTOR_LENGTH(2) -__SYCL_MAKE_VECTOR_ALIASES_FOR_VECTOR_LENGTH(3) -__SYCL_MAKE_VECTOR_ALIASES_FOR_VECTOR_LENGTH(4) -__SYCL_MAKE_VECTOR_ALIASES_FOR_VECTOR_LENGTH(8) -__SYCL_MAKE_VECTOR_ALIASES_FOR_VECTOR_LENGTH(16) -#endif } // namespace _V1 } // namespace sycl diff --git a/sycl/include/sycl/backend/opencl.hpp b/sycl/include/sycl/backend/opencl.hpp index d7b2861363d87..7774bfd33d6fc 100644 --- a/sycl/include/sycl/backend/opencl.hpp +++ b/sycl/include/sycl/backend/opencl.hpp @@ -19,10 +19,21 @@ namespace sycl { inline namespace _V1 { namespace opencl { +namespace detail { +using namespace sycl::detail; __SYCL_EXPORT bool has_extension(const sycl::platform &SyclPlatform, - const std::string &Extension); + detail::string_view Extension); __SYCL_EXPORT bool has_extension(const sycl::device &SyclDevice, - const std::string &Extension); + detail::string_view Extension); +} // namespace 
detail +inline bool has_extension(const sycl::platform &SyclPlatform, + const std::string &Extension) { + return detail::has_extension(SyclPlatform, detail::string_view{Extension}); +} +inline bool has_extension(const sycl::device &SyclDevice, + const std::string &Extension) { + return detail::has_extension(SyclDevice, detail::string_view{Extension}); +} } // namespace opencl } // namespace _V1 } // namespace sycl diff --git a/sycl/include/sycl/detail/os_util.hpp b/sycl/include/sycl/detail/os_util.hpp index 8c4fc17913164..c77cdb4913c36 100644 --- a/sycl/include/sycl/detail/os_util.hpp +++ b/sycl/include/sycl/detail/os_util.hpp @@ -42,12 +42,22 @@ namespace detail { /// Groups the OS-dependent services. class __SYCL_EXPORT OSUtil { +#if !defined(__INTEL_PREVIEW_BREAKING_CHANGES) +#ifdef _WIN32 + // Access control is part of the mangling on Windows, have to preserve this + // for backward ABI compatibility. public: - /// Returns an absolute path to a directory where the object was found. - static std::string getCurrentDSODir(); - +#endif /// Returns a directory component of a path. static std::string getDirName(const char *Path); +#endif + +public: + /// Returns an absolute path to a directory where the object was found. 
+#if defined(__INTEL_PREVIEW_BREAKING_CHANGES) + __SYCL_DLL_LOCAL +#endif + static std::string getCurrentDSODir(); #ifdef __SYCL_RT_OS_WINDOWS static constexpr const char *DirSep = "\\"; diff --git a/sycl/include/sycl/detail/string_view.hpp b/sycl/include/sycl/detail/string_view.hpp index 7815ecc4ce676..d36301efbfdac 100644 --- a/sycl/include/sycl/detail/string_view.hpp +++ b/sycl/include/sycl/detail/string_view.hpp @@ -37,14 +37,19 @@ class string_view { const char *data() const noexcept { return str; } - friend bool operator==(const string_view &lhs, - std::string_view rhs) noexcept { + friend bool operator==(string_view lhs, std::string_view rhs) noexcept { return rhs == lhs.data(); } - friend bool operator==(std::string_view lhs, - const string_view &rhs) noexcept { + friend bool operator==(std::string_view lhs, string_view rhs) noexcept { return lhs == rhs.data(); } + + friend bool operator!=(string_view lhs, std::string_view rhs) noexcept { + return rhs != lhs.data(); + } + friend bool operator!=(std::string_view lhs, string_view rhs) noexcept { + return lhs != rhs.data(); + } }; } // namespace detail diff --git a/sycl/include/sycl/ext/intel/experimental/online_compiler.hpp b/sycl/include/sycl/ext/intel/experimental/online_compiler.hpp index 769dbe45050fc..feeaa30d9e2ba 100644 --- a/sycl/include/sycl/ext/intel/experimental/online_compiler.hpp +++ b/sycl/include/sycl/ext/intel/experimental/online_compiler.hpp @@ -81,6 +81,21 @@ class __SYCL2020_DEPRECATED( "experimental online_compiler is being deprecated. 
See " "'sycl_ext_oneapi_kernel_compiler.asciidoc' instead for new kernel " "compiler extension to kernel_bundle implementation.") online_compiler { + __SYCL_EXPORT std::vector + compile_impl(sycl::detail::string_view Src, + sycl::detail::string_view DeviceStepping, + const std::vector &Options); + + std::vector compile_impl(const std::string &Source, + const std::vector &UserArgs) { + std::vector Args; + for (auto &&Arg : UserArgs) + Args.emplace_back(Arg); + + return compile_impl(std::string_view{Source}, + std::string_view{DeviceStepping}, Args); + } + public: /// Constructs online compiler which can target any device and produces /// given compiled code format. Produces 64-bit device code. @@ -196,9 +211,17 @@ class __SYCL2020_DEPRECATED( /// OpenCL JIT compiler options must be supported. template <> template <> -__SYCL_EXPORT std::vector -online_compiler::compile( - const std::string &src, const std::vector &options); +#if !defined(__SYCL_ONLINE_COMPILER_CPP) || \ + defined(__INTEL_PREVIEW_BREAKING_CHANGES) +inline +#else +__SYCL_EXPORT +#endif + std::vector + online_compiler::compile( + const std::string &src, const std::vector &options) { + return compile_impl(src, options); +} /// Compiles the given OpenCL source. May throw \c online_compile_error. /// @param src - contents of the source. @@ -214,8 +237,17 @@ online_compiler::compile(const std::string &src) { /// @param options - compilation options (implementation defined). template <> template <> -__SYCL_EXPORT std::vector online_compiler::compile( - const std::string &src, const std::vector &options); +#if !defined(__SYCL_ONLINE_COMPILER_CPP) || \ + defined(__INTEL_PREVIEW_BREAKING_CHANGES) +inline +#else +__SYCL_EXPORT +#endif + std::vector + online_compiler::compile( + const std::string &src, const std::vector &options) { + return compile_impl(src, options); +} /// Compiles the given CM source \p src. 
template <> diff --git a/sycl/include/sycl/ext/intel/experimental/pipes.hpp b/sycl/include/sycl/ext/intel/experimental/pipes.hpp index 76b0acf7c8d42..4313673f49082 100644 --- a/sycl/include/sycl/ext/intel/experimental/pipes.hpp +++ b/sycl/include/sycl/ext/intel/experimental/pipes.hpp @@ -46,10 +46,41 @@ class pipe_base { pipe_base() = default; ~pipe_base() = default; + __SYCL_EXPORT static sycl::detail::string + get_pipe_name_impl(const void *HostPipePtr); + +#ifdef __INTEL_PREVIEW_BREAKING_CHANGES + static std::string get_pipe_name(const void *HostPipePtr) { + return {get_pipe_name_impl(HostPipePtr).c_str()}; + } +#else __SYCL_EXPORT static std::string get_pipe_name(const void *HostPipePtr); +#endif + __SYCL_EXPORT static bool wait_non_blocking(const event &E); }; +#ifndef __INTEL_PREVIEW_BREAKING_CHANGES +// We want all "new" uses/recompilation to use the "inline" version, yet we +// still need to provide an exported symbol for the code that was compiled +// before that. Make sure we use "inline" everywhere except when compiling +// `pipes.cpp` so that we'd still provide this backward-compatibility ABI symbol +// via `pipes.cpp` TU. +#ifdef __SYCL_PIPES_CPP +// Magic combination found by trial and error: +__SYCL_EXPORT +#ifdef WIN32 +inline +#endif +#else +inline +#endif + std::string + pipe_base::get_pipe_name(const void *HostPipePtr) { + return {get_pipe_name_impl(HostPipePtr).c_str()}; +} +#endif + template diff --git a/sycl/include/sycl/ext/oneapi/experimental/graph.hpp b/sycl/include/sycl/ext/oneapi/experimental/graph.hpp index 1756615f87265..99b4f5991da59 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/graph.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/graph.hpp @@ -447,7 +447,8 @@ class command_graph : public detail::modifiable_command_graph { /// Constructor. /// @param SyclQueue Queue to use for the graph device and context. /// @param PropList Optional list of properties to pass. 
- command_graph(const queue &SyclQueue, const property_list &PropList = {}) + explicit command_graph(const queue &SyclQueue, + const property_list &PropList = {}) : modifiable_command_graph(SyclQueue, PropList) {} private: diff --git a/sycl/include/sycl/handler.hpp b/sycl/include/sycl/handler.hpp index 23b66eb093987..41a9978d7ccd4 100644 --- a/sycl/include/sycl/handler.hpp +++ b/sycl/include/sycl/handler.hpp @@ -68,18 +68,7 @@ // 41(!!!) includes of SYCL headers + 10 includes of standard headers. // 3300+ lines of code -// SYCL_LANGUAGE_VERSION is 4 digit year followed by 2 digit revision -#if !SYCL_LANGUAGE_VERSION || SYCL_LANGUAGE_VERSION < 202001 -#define __SYCL_NONCONST_FUNCTOR__ -#endif - -// replace _KERNELFUNCPARAM(KernelFunc) with KernelType KernelFunc -// or const KernelType &KernelFunc -#ifdef __SYCL_NONCONST_FUNCTOR__ -#define _KERNELFUNCPARAMTYPE KernelType -#else #define _KERNELFUNCPARAMTYPE const KernelType & -#endif #define _KERNELFUNCPARAM(a) _KERNELFUNCPARAMTYPE a #if defined(__SYCL_UNNAMED_LAMBDA__) diff --git a/sycl/include/sycl/info/ext_intel_device_traits.def b/sycl/include/sycl/info/ext_intel_device_traits.def index f828b43e2a1d7..50b4e9eec952a 100644 --- a/sycl/include/sycl/info/ext_intel_device_traits.def +++ b/sycl/include/sycl/info/ext_intel_device_traits.def @@ -16,6 +16,7 @@ __SYCL_PARAM_TRAITS_SPEC(ext::intel, device, free_memory, uint64_t, UR_DEVICE_IN __SYCL_PARAM_TRAITS_SPEC(ext::intel, device, memory_clock_rate, uint32_t, UR_DEVICE_INFO_MEMORY_CLOCK_RATE) __SYCL_PARAM_TRAITS_SPEC(ext::intel, device, memory_bus_width, uint32_t, UR_DEVICE_INFO_MEMORY_BUS_WIDTH) __SYCL_PARAM_TRAITS_SPEC(ext::intel, device, max_compute_queue_indices, int32_t, UR_DEVICE_INFO_MAX_COMPUTE_QUEUE_INDICES) +__SYCL_PARAM_TRAITS_SPEC(ext::intel::esimd, device, has_2d_block_io_support, bool, UR_DEVICE_INFO_2D_BLOCK_ARRAY_CAPABILITIES_EXP) #ifdef __SYCL_PARAM_TRAITS_TEMPLATE_SPEC_NEEDS_UNDEF #undef __SYCL_PARAM_TRAITS_TEMPLATE_SPEC #undef 
__SYCL_PARAM_TRAITS_TEMPLATE_SPEC_NEEDS_UNDEF diff --git a/sycl/include/sycl/property_list.hpp b/sycl/include/sycl/property_list.hpp index 59620f56a707c..8f019eb7cf22b 100644 --- a/sycl/include/sycl/property_list.hpp +++ b/sycl/include/sycl/property_list.hpp @@ -84,7 +84,8 @@ class PropertyValidator { static void checkPropsAndThrow(const property_list &PropList, std::function FunctionForDataless, std::function FunctionForData) { - PropList.checkPropsAndThrow(FunctionForDataless, FunctionForData); + PropList.checkPropsAndThrow(std::move(FunctionForDataless), + std::move(FunctionForData)); } }; } // namespace detail diff --git a/sycl/include/sycl/queue.hpp b/sycl/include/sycl/queue.hpp index 9e530604ce84e..e4fce00ff0288 100644 --- a/sycl/include/sycl/queue.hpp +++ b/sycl/include/sycl/queue.hpp @@ -56,11 +56,8 @@ // these macros are #undef immediately. // replace _KERNELFUNCPARAM(KernelFunc) with KernelType KernelFunc // or const KernelType &KernelFunc -#ifdef __SYCL_NONCONST_FUNCTOR__ -#define _KERNELFUNCPARAM(a) KernelType a -#else + #define _KERNELFUNCPARAM(a) const KernelType &a -#endif namespace sycl { inline namespace _V1 { diff --git a/sycl/source/backend/opencl.cpp b/sycl/source/backend/opencl.cpp index e0c669cb8c267..17b08627039ab 100644 --- a/sycl/source/backend/opencl.cpp +++ b/sycl/source/backend/opencl.cpp @@ -17,12 +17,15 @@ namespace sycl { inline namespace _V1 { namespace opencl { -using namespace detail; //---------------------------------------------------------------------------- // Free functions to query OpenCL backend extensions + +namespace detail { +using namespace sycl::detail; + __SYCL_EXPORT bool has_extension(const sycl::platform &SyclPlatform, - const std::string &Extension) { + detail::string_view Extension) { if (SyclPlatform.get_backend() != sycl::backend::opencl) { throw sycl::exception( errc::backend_mismatch, @@ -50,11 +53,12 @@ __SYCL_EXPORT bool has_extension(const sycl::platform &SyclPlatform, nullptr); std::string_view 
ExtensionsString(Result.get()); - return ExtensionsString.find(Extension) != std::string::npos; + return ExtensionsString.find(std::string_view{Extension.data()}) != + std::string::npos; } __SYCL_EXPORT bool has_extension(const sycl::device &SyclDevice, - const std::string &Extension) { + detail::string_view Extension) { if (SyclDevice.get_backend() != sycl::backend::opencl) { throw sycl::exception( errc::backend_mismatch, @@ -82,8 +86,21 @@ __SYCL_EXPORT bool has_extension(const sycl::device &SyclDevice, ResultSize, Result.get(), nullptr); std::string_view ExtensionsString(Result.get()); - return ExtensionsString.find(Extension) != std::string::npos; + return ExtensionsString.find(std::string_view{Extension.data()}) != + std::string::npos; +} +} // namespace detail + +#ifndef __INTEL_PREVIEW_BREAKING_CHANGES +__SYCL_EXPORT bool has_extension(const sycl::device &SyclDevice, + const std::string &Extension) { + return detail::has_extension(SyclDevice, detail::string_view{Extension}); +} +__SYCL_EXPORT bool has_extension(const sycl::platform &SyclPlatform, + const std::string &Extension) { + return detail::has_extension(SyclPlatform, detail::string_view{Extension}); } +#endif } // namespace opencl } // namespace _V1 } // namespace sycl diff --git a/sycl/source/detail/context_impl.hpp b/sycl/source/detail/context_impl.hpp index 4fde73c8ba90c..65e1fda0a5a7a 100644 --- a/sycl/source/detail/context_impl.hpp +++ b/sycl/source/detail/context_impl.hpp @@ -109,7 +109,7 @@ class context_impl { /// be safely passed to the underlying native runtime API. Warning. Returned /// reference will be invalid if context_impl was destroyed. /// - /// \return an instance of raw plug-in context handle. + /// \return an instance of raw UR context handle. ur_context_handle_t &getHandleRef(); /// Gets the underlying context object (if any) without reference count @@ -119,7 +119,7 @@ class context_impl { /// be safely passed to the underlying native runtime API. Warning. 
Returned /// reference will be invalid if context_impl was destroyed. /// - /// \return an instance of raw plug-in context handle. + /// \return an instance of raw UR context handle. const ur_context_handle_t &getHandleRef() const; /// Unlike `get_info', this function returns a diff --git a/sycl/source/detail/device_impl.cpp b/sycl/source/detail/device_impl.cpp index 178634322f47e..205f5d14eada2 100644 --- a/sycl/source/detail/device_impl.cpp +++ b/sycl/source/detail/device_impl.cpp @@ -672,15 +672,16 @@ bool device_impl::has(aspect Aspect) const { } case aspect::ext_intel_matrix: { using arch = sycl::ext::oneapi::experimental::architecture; - const std::vector supported_archs = { + const arch supported_archs[] = { arch::intel_cpu_spr, arch::intel_cpu_gnr, arch::intel_gpu_pvc, arch::intel_gpu_dg2_g10, arch::intel_gpu_dg2_g11, arch::intel_gpu_dg2_g12, arch::intel_gpu_bmg_g21, arch::intel_gpu_lnl_m, - arch::intel_gpu_arl_h}; + arch::intel_gpu_arl_h, + }; try { return std::any_of( - supported_archs.begin(), supported_archs.end(), + std::begin(supported_archs), std::end(supported_archs), [=](const arch a) { return this->extOneapiArchitectureIs(a); }); } catch (const sycl::exception &) { // If we're here it means the device does not support architecture @@ -723,7 +724,8 @@ bool device_impl::has(aspect Aspect) const { UR_DEVICE_COMMAND_BUFFER_UPDATE_CAPABILITY_FLAG_KERNEL_ARGUMENTS | UR_DEVICE_COMMAND_BUFFER_UPDATE_CAPABILITY_FLAG_LOCAL_WORK_SIZE | UR_DEVICE_COMMAND_BUFFER_UPDATE_CAPABILITY_FLAG_GLOBAL_WORK_SIZE | - UR_DEVICE_COMMAND_BUFFER_UPDATE_CAPABILITY_FLAG_GLOBAL_WORK_OFFSET; + UR_DEVICE_COMMAND_BUFFER_UPDATE_CAPABILITY_FLAG_GLOBAL_WORK_OFFSET | + UR_DEVICE_COMMAND_BUFFER_UPDATE_CAPABILITY_FLAG_KERNEL_HANDLE; return has(aspect::ext_oneapi_limited_graph) && (UpdateCapabilities & RequiredCapabilities) == RequiredCapabilities; diff --git a/sycl/source/detail/device_info.hpp b/sycl/source/detail/device_info.hpp index 38e5988a37456..149d3f96eb0a5 100644 --- 
a/sycl/source/detail/device_info.hpp +++ b/sycl/source/detail/device_info.hpp @@ -1411,6 +1411,32 @@ typename Param::return_type get_device_info(const DeviceImplPtr &Dev) { return get_device_info_impl::get(Dev); } +template <> +inline typename info::device::preferred_interop_user_sync::return_type +get_device_info( + const DeviceImplPtr &Dev) { + if (Dev->getBackend() != backend::opencl) { + throw sycl::exception( + errc::invalid, + "the info::device::preferred_interop_user_sync info descriptor can " + "only be queried with an OpenCL backend"); + } + using Param = info::device::preferred_interop_user_sync; + return get_device_info_impl::get(Dev); +} + +template <> +inline typename info::device::profile::return_type +get_device_info(const DeviceImplPtr &Dev) { + if (Dev->getBackend() != backend::opencl) { + throw sycl::exception(errc::invalid, + "the info::device::profile info descriptor can " + "only be queried with an OpenCL backend"); + } + using Param = info::device::profile; + return get_device_info_impl::get(Dev); +} + template <> inline ext::intel::info::device::device_id::return_type get_device_info(const DeviceImplPtr &Dev) { @@ -1563,6 +1589,25 @@ get_device_info( return get_device_info_impl::get(Dev); } +template <> +inline ext::intel::esimd::info::device::has_2d_block_io_support::return_type +get_device_info( + const DeviceImplPtr &Dev) { + if (!Dev->has(aspect::ext_intel_esimd)) + return false; + + ur_exp_device_2d_block_array_capability_flags_t BlockArrayCapabilities; + Dev->getAdapter()->call( + Dev->getHandleRef(), + UrInfoCode< + ext::intel::esimd::info::device::has_2d_block_io_support>::value, + sizeof(BlockArrayCapabilities), &BlockArrayCapabilities, nullptr); + return (BlockArrayCapabilities & + UR_EXP_DEVICE_2D_BLOCK_ARRAY_CAPABILITY_FLAG_LOAD) && + (BlockArrayCapabilities & + UR_EXP_DEVICE_2D_BLOCK_ARRAY_CAPABILITY_FLAG_STORE); +} + // Returns the list of all progress guarantees that can be requested for // work_groups from the coordination 
level of root_group when using the device // given by Dev. First it calls getProgressGuarantee to get the strongest diff --git a/sycl/source/detail/event_impl.hpp b/sycl/source/detail/event_impl.hpp index f92d198d259d6..86e9b26b98084 100644 --- a/sycl/source/detail/event_impl.hpp +++ b/sycl/source/detail/event_impl.hpp @@ -58,12 +58,12 @@ class event_impl { SYCLConfig::get(); } - /// Constructs an event instance from a plug-in event handle. + /// Constructs an event instance from a UR event handle. /// - /// The SyclContext must match the plug-in context associated with the - /// ClEvent. + /// The SyclContext must match the UR context associated with the + /// ur_event_handle_t. /// - /// \param Event is a valid instance of plug-in event. + /// \param Event is a valid instance of UR event. /// \param SyclContext is an instance of SYCL context. event_impl(ur_event_handle_t Event, const context &SyclContext); event_impl(const QueueImplPtr &Queue); diff --git a/sycl/source/detail/jit_compiler.cpp b/sycl/source/detail/jit_compiler.cpp index 7118f5ddc8c54..0d2d2927f9799 100644 --- a/sycl/source/detail/jit_compiler.cpp +++ b/sycl/source/detail/jit_compiler.cpp @@ -1211,17 +1211,21 @@ sycl_device_binaries jit_compiler::compileSYCL( const std::vector &UserArgs, std::string *LogPtr, const std::vector &RegisteredKernelNames) { - // TODO: Handle template instantiation. - if (!RegisteredKernelNames.empty()) { - throw sycl::exception( - sycl::errc::build, - "Property `sycl::ext::oneapi::experimental::registered_kernel_names` " - "is not yet supported for the `sycl_jit` source language"); + // RegisteredKernelNames may contain template specializations, so we just put + // them in main() which ensures they are instantiated. 
+ std::ostringstream ss; + ss << SYCLSource << '\n'; + ss << "int main() {\n"; + for (const std::string &KernelName : RegisteredKernelNames) { + ss << " (void)" << KernelName << ";\n"; } + ss << " return 0;\n}\n" << std::endl; + + std::string FinalSource = ss.str(); std::string SYCLFileName = Id + ".cpp"; ::jit_compiler::InMemoryFile SourceFile{SYCLFileName.c_str(), - SYCLSource.c_str()}; + FinalSource.c_str()}; std::vector<::jit_compiler::InMemoryFile> IncludeFilesView; IncludeFilesView.reserve(IncludePairs.size()); diff --git a/sycl/source/detail/kernel_impl.hpp b/sycl/source/detail/kernel_impl.hpp index 2dee6d9da151e..67fe2b155ea1e 100644 --- a/sycl/source/detail/kernel_impl.hpp +++ b/sycl/source/detail/kernel_impl.hpp @@ -34,7 +34,7 @@ class kernel_impl { public: /// Constructs a SYCL kernel instance from a UrKernel /// - /// This constructor is used for plug-in interoperability. It always marks + /// This constructor is used for UR adapter interoperability. It always marks /// kernel as being created from source. 
/// /// \param Kernel is a valid UrKernel instance diff --git a/sycl/source/detail/online_compiler/online_compiler.cpp b/sycl/source/detail/online_compiler/online_compiler.cpp index 5d3c3a381607b..344d32cd61ff5 100644 --- a/sycl/source/detail/online_compiler/online_compiler.cpp +++ b/sycl/source/detail/online_compiler/online_compiler.cpp @@ -6,6 +6,8 @@ // //===----------------------------------------------------------------------===// +#define __SYCL_ONLINE_COMPILER_CPP + #include #include #include @@ -19,9 +21,11 @@ inline namespace _V1 { namespace ext::intel::experimental { namespace detail { +using namespace sycl::detail; + static std::vector prepareOclocArgs(sycl::info::device_type DeviceType, device_arch DeviceArch, - bool Is64Bit, const std::string &DeviceStepping, + bool Is64Bit, string_view DeviceStepping, const std::string &UserArgs) { std::vector Args = {"ocloc", "-q", "-spv_only", "-device"}; @@ -54,7 +58,7 @@ prepareOclocArgs(sycl::info::device_type DeviceType, device_arch DeviceArch, if (DeviceStepping != "") { Args.push_back("-revision_id"); - Args.push_back(DeviceStepping.c_str()); + Args.push_back(DeviceStepping.data()); } Args.push_back(Is64Bit ? "-64" : "-32"); @@ -82,11 +86,11 @@ prepareOclocArgs(sycl::info::device_type DeviceType, device_arch DeviceArch, /// allocated during the compilation. /// @param UserArgs - User's options to ocloc compiler. 
static std::vector -compileToSPIRV(const std::string &Source, sycl::info::device_type DeviceType, - device_arch DeviceArch, bool Is64Bit, - const std::string &DeviceStepping, void *&CompileToSPIRVHandle, - void *&FreeSPIRVOutputsHandle, +compileToSPIRV(string_view Src, sycl::info::device_type DeviceType, + device_arch DeviceArch, bool Is64Bit, string_view DeviceStepping, + void *&CompileToSPIRVHandle, void *&FreeSPIRVOutputsHandle, const std::vector &UserArgs) { + std::string Source{Src.data()}; if (!CompileToSPIRVHandle) { #ifdef __SYCL_RT_OS_WINDOWS @@ -198,11 +202,10 @@ compileToSPIRV(const std::string &Source, sycl::info::device_type DeviceType, } } // namespace detail -template <> -template <> -__SYCL_EXPORT std::vector -online_compiler::compile( - const std::string &Source, const std::vector &UserArgs) { +template +__SYCL_EXPORT std::vector online_compiler::compile_impl( + detail::string_view Src, detail::string_view DeviceStepping, + const std::vector &Options) { if (OutputFormatVersion != std::pair{0, 0}) { std::string Version = std::to_string(OutputFormatVersion.first) + ", " + @@ -211,29 +214,27 @@ online_compiler::compile( Version + ") is not supported yet"); } - return detail::compileToSPIRV(Source, DeviceType, DeviceArch, Is64Bit, + std::vector UserArgs; + for (auto &&Opt : Options) + UserArgs.emplace_back(Opt.data()); + + if constexpr (Lang == source_language::cm) + UserArgs.push_back("-cmc"); + + return detail::compileToSPIRV(Src, DeviceType, DeviceArch, Is64Bit, DeviceStepping, CompileToSPIRVHandle, FreeSPIRVOutputsHandle, UserArgs); } -template <> -template <> -__SYCL_EXPORT std::vector online_compiler::compile( - const std::string &Source, const std::vector &UserArgs) { +template __SYCL_EXPORT std::vector +online_compiler::compile_impl( + detail::string_view Src, detail::string_view DeviceStepping, + const std::vector &Options); - if (OutputFormatVersion != std::pair{0, 0}) { - std::string Version = std::to_string(OutputFormatVersion.first) + ", " 
+ - std::to_string(OutputFormatVersion.second); - throw online_compile_error(std::string("The output format version (") + - Version + ") is not supported yet"); - } - - std::vector CMUserArgs = UserArgs; - CMUserArgs.push_back("-cmc"); - return detail::compileToSPIRV(Source, DeviceType, DeviceArch, Is64Bit, - DeviceStepping, CompileToSPIRVHandle, - FreeSPIRVOutputsHandle, CMUserArgs); -} +template __SYCL_EXPORT std::vector +online_compiler::compile_impl( + detail::string_view Src, detail::string_view DeviceStepping, + const std::vector &Options); } // namespace ext::intel::experimental namespace ext { diff --git a/sycl/source/detail/os_util.cpp b/sycl/source/detail/os_util.cpp index c8ee3b8f33c11..018ba97cff05c 100644 --- a/sycl/source/detail/os_util.cpp +++ b/sycl/source/detail/os_util.cpp @@ -69,6 +69,20 @@ bool procMapsAddressInRange(std::istream &Stream, uintptr_t Addr) { return Addr >= Start && Addr < End; } +#if defined(__INTEL_PREVIEW_BREAKING_CHANGES) +static std::string getDirName(const char *Path) +#else +std::string OSUtil::getDirName(const char *Path) +#endif +{ + std::string Tmp(Path); + // dirname(3) needs a writable C string: a null-terminator is written where a + // path should split. + size_t TruncatedSize = strlen(dirname(const_cast(Tmp.c_str()))); + Tmp.resize(TruncatedSize); + return Tmp; +} + /// Returns an absolute path to a directory where the object was found. 
std::string OSUtil::getCurrentDSODir() { // Examine /proc/self/maps and find where this function (getCurrendDSODir) @@ -130,21 +144,12 @@ std::string OSUtil::getCurrentDSODir() { char Path[PATH_MAX]; Stream.getline(Path, PATH_MAX - 1); Path[PATH_MAX - 1] = '\0'; - return OSUtil::getDirName(Path); + return getDirName(Path); } assert(false && "Unable to find the current function in /proc/self/maps"); return ""; } -std::string OSUtil::getDirName(const char *Path) { - std::string Tmp(Path); - // dirname(3) needs a writable C string: a null-terminator is written where a - // path should split. - size_t TruncatedSize = strlen(dirname(const_cast(Tmp.c_str()))); - Tmp.resize(TruncatedSize); - return Tmp; -} - #elif defined(__SYCL_RT_OS_WINDOWS) /// Returns an absolute path where the object was found. @@ -169,6 +174,7 @@ std::string OSUtil::getCurrentDSODir() { return Path; } +#if !defined(__INTEL_PREVIEW_BREAKING_CHANGES) std::string OSUtil::getDirName(const char *Path) { std::string Tmp(Path); // Remove trailing directory separators @@ -181,6 +187,7 @@ std::string OSUtil::getDirName(const char *Path) { // If no directory separator is present return initial path like dirname does return Tmp; } +#endif #elif defined(__SYCL_RT_OS_DARWIN) std::string OSUtil::getCurrentDSODir() { diff --git a/sycl/source/detail/persistent_device_code_cache.hpp b/sycl/source/detail/persistent_device_code_cache.hpp index d2038aaa65969..78441a251aa75 100644 --- a/sycl/source/detail/persistent_device_code_cache.hpp +++ b/sycl/source/detail/persistent_device_code_cache.hpp @@ -24,10 +24,6 @@ namespace sycl { inline namespace _V1 { namespace detail { -/* This is temporary solution until std::filesystem is available when SYCL RT - * is moved to c++17 standard*/ -std::string getDirName(const char *Path); - /* The class manages inter-process synchronization: * - Path passed to the constructor is appended with .lock and used as lock * file. 
diff --git a/sycl/source/detail/pipes.cpp b/sycl/source/detail/pipes.cpp index 99b0d292676ff..42884e7aaa77f 100644 --- a/sycl/source/detail/pipes.cpp +++ b/sycl/source/detail/pipes.cpp @@ -6,6 +6,10 @@ // //===----------------------------------------------------------------------===// +#ifndef __INTEL_PREVIEW_BREAKING_CHANGES +#define __SYCL_PIPES_CPP +#endif + #include #include #include @@ -16,11 +20,11 @@ inline namespace _V1 { namespace ext { namespace intel { namespace experimental { - -__SYCL_EXPORT std::string pipe_base::get_pipe_name(const void *HostPipePtr) { - return sycl::_V1::detail::ProgramManager::getInstance() - .getHostPipeEntry(HostPipePtr) - ->MUniqueId; +__SYCL_EXPORT sycl::detail::string +pipe_base::get_pipe_name_impl(const void *HostPipePtr) { + return {std::string_view{sycl::_V1::detail::ProgramManager::getInstance() + .getHostPipeEntry(HostPipePtr) + ->MUniqueId}}; } __SYCL_EXPORT bool pipe_base::wait_non_blocking(const event &E) { diff --git a/sycl/source/detail/platform_impl.hpp b/sycl/source/detail/platform_impl.hpp index e7e76334efaa6..45d38c9c661f6 100644 --- a/sycl/source/detail/platform_impl.hpp +++ b/sycl/source/detail/platform_impl.hpp @@ -33,8 +33,7 @@ class device_impl; // TODO: implement parameters treatment for host device class platform_impl { public: - /// Constructs platform_impl from a plug-in interoperability platform - /// handle. + /// Constructs platform_impl from a UR platform handle. /// /// \param APlatform is a raw plug-in platform handle. /// \param AAdapter is a plug-in handle. @@ -107,7 +106,7 @@ class platform_impl { return ur::cast(nativeHandle); } - /// Returns raw underlying plug-in platform handle. + /// Returns raw underlying UR platform handle. /// /// Unlike get() method, this method does not retain handler. 
It is caller /// responsibility to make sure that platform stays alive while raw handle diff --git a/sycl/source/detail/program_manager/program_manager.cpp b/sycl/source/detail/program_manager/program_manager.cpp index c3bb1e10eaea2..a7e2d91c360b6 100644 --- a/sycl/source/detail/program_manager/program_manager.cpp +++ b/sycl/source/detail/program_manager/program_manager.cpp @@ -732,9 +732,6 @@ static void setSpecializationConstants(const std::shared_ptr &InputImpl, ur_program_handle_t Prog, const AdapterPtr &Adapter) { - // Set ITT annotation specialization constant if needed. - enableITTAnnotationsIfNeeded(Prog, Adapter); - std::lock_guard Lock{InputImpl->get_spec_const_data_lock()}; const std::map> &SpecConstData = InputImpl->get_spec_const_data_ref(); @@ -769,15 +766,6 @@ ur_program_handle_t ProgramManager::getBuiltURProgram( const ContextImplPtr &ContextImpl, const DeviceImplPtr &DeviceImpl, const std::string &KernelName, const NDRDescT &NDRDesc, bool JITCompilationIsRequired) { - KernelProgramCache &Cache = ContextImpl->getKernelProgramCache(); - - std::string CompileOpts; - std::string LinkOpts; - - applyOptionsFromEnvironment(CompileOpts, LinkOpts); - - SerializedObj SpecConsts; - // Check if we can optimize program builds for sub-devices by using a program // built for the root device DeviceImplPtr RootDevImpl = DeviceImpl; @@ -824,19 +812,36 @@ ur_program_handle_t ProgramManager::getBuiltURProgram( AllImages.push_back(&Img); std::copy(ImageDeps.begin(), ImageDeps.end(), std::back_inserter(AllImages)); - auto BuildF = [this, &Img, &Context, &ContextImpl, &Device, &CompileOpts, + return getBuiltURProgram(Img, Context, {Device}, DeviceImagesToLink, + AllImages); +} + +ur_program_handle_t ProgramManager::getBuiltURProgram( + const RTDeviceBinaryImage &Img, const context &Context, + const std::vector &Devs, + const std::set &DeviceImagesToLink, + const std::vector &AllImages, + const std::shared_ptr &DeviceImageImpl, + const SerializedObj &SpecConsts) { + 
std::string CompileOpts; + std::string LinkOpts; + applyOptionsFromEnvironment(CompileOpts, LinkOpts); + auto BuildF = [this, &Img, &DeviceImageImpl, &Context, &Devs, &CompileOpts, &LinkOpts, SpecConsts, &DeviceImagesToLink, &AllImages] { + const ContextImplPtr &ContextImpl = getSyclObjImpl(Context); const AdapterPtr &Adapter = ContextImpl->getAdapter(); - applyOptionsFromImage(CompileOpts, LinkOpts, Img, {Device}, Adapter); + applyOptionsFromImage(CompileOpts, LinkOpts, Img, Devs, Adapter); // Should always come last! appendCompileEnvironmentVariablesThatAppend(CompileOpts); appendLinkEnvironmentVariablesThatAppend(LinkOpts); + auto [NativePrg, DeviceCodeWasInCache] = getOrCreateURProgram( - Img, AllImages, Context, {Device}, CompileOpts + LinkOpts, SpecConsts); + Img, {AllImages}, Context, Devs, CompileOpts + LinkOpts, SpecConsts); - if (!DeviceCodeWasInCache) { - if (Img.supportsSpecConstants()) - enableITTAnnotationsIfNeeded(NativePrg, Adapter); + if (!DeviceCodeWasInCache && Img.supportsSpecConstants()) { + enableITTAnnotationsIfNeeded(NativePrg, Adapter); + if (DeviceImageImpl) + setSpecializationConstants(DeviceImageImpl, NativePrg, Adapter); } UrFuncInfo programReleaseInfo; @@ -864,34 +869,28 @@ ur_program_handle_t ProgramManager::getBuiltURProgram( for (RTDeviceBinaryImage *BinImg : DeviceImagesToLink) { if (UseDeviceLibs) DeviceLibReqMask |= getDeviceLibReqMask(*BinImg); - device_image_plain DevImagePlain = - getDeviceImageFromBinaryImage(BinImg, Context, Device); - const std::shared_ptr &DeviceImageImpl = - detail::getSyclObjImpl(DevImagePlain); - - SerializedObj ImgSpecConsts = - DeviceImageImpl->get_spec_const_blob_ref(); - ur_program_handle_t NativePrg = - createURProgram(*BinImg, Context, {Device}); + ur_program_handle_t NativePrg = createURProgram(*BinImg, Context, Devs); if (BinImg->supportsSpecConstants()) - setSpecializationConstants(DeviceImageImpl, NativePrg, Adapter); + enableITTAnnotationsIfNeeded(NativePrg, Adapter); 
ProgramsToLink.push_back(NativePrg); } } - std::vector Devs = { - getSyclObjImpl(Device).get()->getHandleRef()}; - ; + + std::vector URDevices; + for (auto Dev : Devs) + URDevices.push_back(getSyclObjImpl(Dev).get()->getHandleRef()); + ProgramPtr BuiltProgram = build( - std::move(ProgramManaged), ContextImpl, CompileOpts, LinkOpts, Devs, - DeviceLibReqMask, ProgramsToLink, + std::move(ProgramManaged), ContextImpl, CompileOpts, LinkOpts, + URDevices, DeviceLibReqMask, ProgramsToLink, /*CreatedFromBinary*/ Img.getFormat() != SYCL_DEVICE_BINARY_TYPE_SPIRV); + // Those extra programs won't be used anymore, just the final linked result for (ur_program_handle_t Prg : ProgramsToLink) Adapter->call(Prg); - emitBuiltProgramInfo(BuiltProgram.get(), ContextImpl); { @@ -902,29 +901,37 @@ ur_program_handle_t ProgramManager::getBuiltURProgram( } } - ContextImpl->addDeviceGlobalInitializer(BuiltProgram.get(), {Device}, &Img); + ContextImpl->addDeviceGlobalInitializer(BuiltProgram.get(), Devs, &Img); // Save program to persistent cache if it is not there if (!DeviceCodeWasInCache) { - PersistentDeviceCodeCache::putItemToDisc({Device}, AllImages, SpecConsts, + PersistentDeviceCodeCache::putItemToDisc(Devs, AllImages, SpecConsts, CompileOpts + LinkOpts, BuiltProgram.get()); } + return BuiltProgram.release(); }; + if (!SYCLConfig::get()) + return BuildF(); + uint32_t ImgId = Img.getImageID(); - const ur_device_handle_t UrDevice = Dev->getHandleRef(); - auto CacheKey = std::make_pair(std::make_pair(std::move(SpecConsts), ImgId), - std::set{UrDevice}); + std::set URDevicesSet; + std::transform(Devs.begin(), Devs.end(), + std::inserter(URDevicesSet, URDevicesSet.begin()), + [](const device &Dev) { + return getSyclObjImpl(Dev).get()->getHandleRef(); + }); + auto CacheKey = + std::make_pair(std::make_pair(SpecConsts, ImgId), URDevicesSet); + const ContextImplPtr &ContextImpl = getSyclObjImpl(Context); + KernelProgramCache &Cache = ContextImpl->getKernelProgramCache(); auto GetCachedBuildF 
= [&Cache, &CacheKey]() { return Cache.getOrInsertProgram(CacheKey); }; - if (!SYCLConfig::get()) - return BuildF(); - auto EvictFunc = [&Cache, &CacheKey](ur_program_handle_t Program, bool isBuilt) { return Cache.registerProgramFetch(CacheKey, Program, isBuilt); @@ -932,35 +939,69 @@ ur_program_handle_t ProgramManager::getBuiltURProgram( auto BuildResult = Cache.getOrBuild(GetCachedBuildF, BuildF, EvictFunc); + // getOrBuild is not supposed to return nullptr assert(BuildResult != nullptr && "Invalid build result"); ur_program_handle_t ResProgram = BuildResult->Val; - auto Adapter = ContextImpl->getAdapter(); + // Here we have multiple devices a program is built for, so add the program to + // the cache for all subsets of provided list of devices. + const AdapterPtr &Adapter = ContextImpl->getAdapter(); // If we linked any extra device images, then we need to // cache them as well. - for (const RTDeviceBinaryImage *BImg : DeviceImagesToLink) { - // CacheKey is captured by reference by GetCachedBuildF, so we can simply - // update it here and re-use that lambda. - CacheKey.first.second = BImg->getImageID(); - bool DidInsert = Cache.insertBuiltProgram(CacheKey, ResProgram); + auto CacheLinkedImages = [&Adapter, &Cache, &CacheKey, &ResProgram, + &DeviceImagesToLink] { + for (const RTDeviceBinaryImage *BImg : DeviceImagesToLink) { + // CacheKey is captured by reference by GetCachedBuildF, so we can simply + // update it here and re-use that lambda. + CacheKey.first.second = BImg->getImageID(); + bool DidInsert = Cache.insertBuiltProgram(CacheKey, ResProgram); + // Add to the eviction list. + Cache.registerProgramFetch(CacheKey, ResProgram, DidInsert); + if (DidInsert) { + // For every cached copy of the program, we need to increment its + // refcount + Adapter->call(ResProgram); + } + } + }; + CacheLinkedImages(); - // Add to the eviction list. 
- Cache.registerProgramFetch(CacheKey, ResProgram, DidInsert); - if (DidInsert) - // For every cached copy of the program, we need to increment its refcount - Adapter->call(ResProgram); + if (URDevicesSet.size() > 1) { + // emplace all subsets of the current set of devices into the cache. + // Set of all devices is not included in the loop as it was already added + // into the cache. + for (int Mask = 1; Mask < (1 << URDevicesSet.size()) - 1; ++Mask) { + std::set Subset; + int Index = 0; + for (auto It = URDevicesSet.begin(); It != URDevicesSet.end(); + ++It, ++Index) { + if (Mask & (1 << Index)) { + Subset.insert(*It); + } + } + // Change device in the cache key to reduce copying of spec const data. + CacheKey.second = Subset; + bool DidInsert = Cache.insertBuiltProgram(CacheKey, ResProgram); + if (DidInsert) { + // For every cached copy of the program, we need to increment its + // refcount + Adapter->call(ResProgram); + } + CacheLinkedImages(); + // getOrBuild is not supposed to return nullptr + assert(BuildResult != nullptr && "Invalid build result"); + } } // If caching is enabled, one copy of the program handle will be // stored in the cache, and one handle is returned to the // caller. In that case, we need to increase the ref count of the // program. - ContextImpl->getAdapter()->call(ResProgram); + Adapter->call(ResProgram); return ResProgram; } - // When caching is enabled, the returned UrProgram and UrKernel will // already have their ref count incremented. std::tupleget_context())->getAdapter(); - // Device is not used when creating program from SPIRV, so passing only one - // device is OK. 
ur_program_handle_t Prog = createURProgram(*InputImpl->get_bin_image_ref(), InputImpl->get_context(), Devs); @@ -2625,160 +2664,23 @@ device_image_plain ProgramManager::build(const device_image_plain &DeviceImage, getSyclObjImpl(DeviceImage); const context Context = InputImpl->get_context(); - const ContextImplPtr ContextImpl = getSyclObjImpl(Context); - KernelProgramCache &Cache = ContextImpl->getKernelProgramCache(); - - std::string CompileOpts; - std::string LinkOpts; - applyOptionsFromEnvironment(CompileOpts, LinkOpts); - const RTDeviceBinaryImage *ImgPtr = InputImpl->get_bin_image_ref(); const RTDeviceBinaryImage &Img = *ImgPtr; SerializedObj SpecConsts = InputImpl->get_spec_const_blob_ref(); - // TODO: Unify this code with getBuiltPIProgram - auto BuildF = [this, &Context, &Img, &Devs, &CompileOpts, &LinkOpts, - &InputImpl, SpecConsts] { - ContextImplPtr ContextImpl = getSyclObjImpl(Context); - const AdapterPtr &Adapter = ContextImpl->getAdapter(); - applyOptionsFromImage(CompileOpts, LinkOpts, Img, Devs, Adapter); - // Should always come last! - appendCompileEnvironmentVariablesThatAppend(CompileOpts); - appendLinkEnvironmentVariablesThatAppend(LinkOpts); - - // Device is not used when creating program from SPIRV, so passing only one - // device is OK. - auto [NativePrg, DeviceCodeWasInCache] = getOrCreateURProgram( - Img, {&Img}, Context, Devs, CompileOpts + LinkOpts, SpecConsts); - - if (!DeviceCodeWasInCache && - InputImpl->get_bin_image_ref()->supportsSpecConstants()) - setSpecializationConstants(InputImpl, NativePrg, Adapter); - - UrFuncInfo programReleaseInfo; - auto programRelease = - programReleaseInfo.getFuncPtrFromModule(ur::getURLoaderLibrary()); - ProgramPtr ProgramManaged(NativePrg, programRelease); - - // Link a fallback implementation of device libraries if they are not - // supported by a device compiler. - // Pre-compiled programs are supposed to be already linked. 
- // If device image is not SPIR-V, DeviceLibReqMask will be 0 which means - // no fallback device library will be linked. - uint32_t DeviceLibReqMask = 0; - if (Img.getFormat() == SYCL_DEVICE_BINARY_TYPE_SPIRV && - !SYCLConfig::get()) - DeviceLibReqMask = getDeviceLibReqMask(Img); - - // TODO: Add support for dynamic linking with kernel bundles - std::vector ExtraProgramsToLink; - std::vector URDevices; - for (auto Dev : Devs) { - URDevices.push_back(getSyclObjImpl(Dev).get()->getHandleRef()); - } - ProgramPtr BuiltProgram = - build(std::move(ProgramManaged), ContextImpl, CompileOpts, LinkOpts, - URDevices, DeviceLibReqMask, ExtraProgramsToLink); - - emitBuiltProgramInfo(BuiltProgram.get(), ContextImpl); - - { - std::lock_guard Lock(MNativeProgramsMutex); - NativePrograms.insert({BuiltProgram.get(), &Img}); - } - - ContextImpl->addDeviceGlobalInitializer(BuiltProgram.get(), Devs, &Img); - - // Save program to persistent cache if it is not there - if (!DeviceCodeWasInCache) - PersistentDeviceCodeCache::putItemToDisc( - Devs, {&Img}, SpecConsts, CompileOpts + LinkOpts, BuiltProgram.get()); - - return BuiltProgram.release(); - }; - - if (!SYCLConfig::get()) { - auto ResProgram = BuildF(); - DeviceImageImplPtr ExecImpl = std::make_shared( - InputImpl->get_bin_image_ref(), Context, Devs, bundle_state::executable, - InputImpl->get_kernel_ids_ptr(), ResProgram, - InputImpl->get_spec_const_data_ref(), - InputImpl->get_spec_const_blob_ref()); - - return createSyclObjFromImpl(ExecImpl); - } - - uint32_t ImgId = Img.getImageID(); - std::set URDevicesSet; - std::transform(Devs.begin(), Devs.end(), - std::inserter(URDevicesSet, URDevicesSet.begin()), - [](const device &Dev) { - return getSyclObjImpl(Dev).get()->getHandleRef(); - }); - auto CacheKey = std::make_pair(std::make_pair(std::move(SpecConsts), ImgId), - URDevicesSet); - - // CacheKey is captured by reference so when we overwrite it later we can - // reuse this function. 
- auto GetCachedBuildF = [&Cache, &CacheKey]() { - return Cache.getOrInsertProgram(CacheKey); - }; - - auto EvictFunc = [&Cache, &CacheKey](ur_program_handle_t Program, - bool isBuilt) { - return Cache.registerProgramFetch(CacheKey, Program, isBuilt); - }; - - auto BuildResult = - Cache.getOrBuild(GetCachedBuildF, BuildF, EvictFunc); - // getOrBuild is not supposed to return nullptr - assert(BuildResult != nullptr && "Invalid build result"); - - ur_program_handle_t ResProgram = BuildResult->Val; - - // Here we have multiple devices a program is built for, so add the program to - // the cache for all subsets of provided list of devices. - const AdapterPtr &Adapter = ContextImpl->getAdapter(); - auto CacheSubsets = [ResProgram, &Adapter]() { - Adapter->call(ResProgram); - return ResProgram; - }; - - if (URDevicesSet.size() > 1) { - // emplace all subsets of the current set of devices into the cache. - // Set of all devices is not included in the loop as it was already added - // into the cache. - for (int Mask = 1; Mask < (1 << URDevicesSet.size()) - 1; ++Mask) { - std::set Subset; - int Index = 0; - for (auto It = URDevicesSet.begin(); It != URDevicesSet.end(); - ++It, ++Index) { - if (Mask & (1 << Index)) { - Subset.insert(*It); - } - } - // Change device in the cache key to reduce copying of spec const data. - CacheKey.second = Subset; - Cache.getOrBuild(GetCachedBuildF, CacheSubsets, EvictFunc); - // getOrBuild is not supposed to return nullptr - assert(BuildResult != nullptr && "Invalid build result"); - } - } - - // devive_image_impl shares ownership of PIProgram with, at least, program - // cache. 
The ref counter will be descremented in the destructor of - // device_image_impl - Adapter->call(ResProgram); + // TODO: Add support for dynamic linking with kernel bundles + ur_program_handle_t ResProgram = + getBuiltURProgram(Img, Context, Devs, /*DeviceImagesToLink*/ {}, {&Img}, + InputImpl, SpecConsts); DeviceImageImplPtr ExecImpl = std::make_shared( InputImpl->get_bin_image_ref(), Context, Devs, bundle_state::executable, InputImpl->get_kernel_ids_ptr(), ResProgram, InputImpl->get_spec_const_data_ref(), InputImpl->get_spec_const_blob_ref()); - return createSyclObjFromImpl(ExecImpl); } diff --git a/sycl/source/detail/program_manager/program_manager.hpp b/sycl/source/detail/program_manager/program_manager.hpp index 60d43a87e3fa6..8f228dd772fb0 100644 --- a/sycl/source/detail/program_manager/program_manager.hpp +++ b/sycl/source/detail/program_manager/program_manager.hpp @@ -147,11 +147,25 @@ class ProgramManager { const NDRDescT &NDRDesc = {}, bool JITCompilationIsRequired = false); - ur_program_handle_t getBuiltURProgram(const context &Context, - const device &Device, - const std::string &KernelName, - const property_list &PropList, - bool JITCompilationIsRequired = false); + /// Builds a program from a given set of images or retrieves that program from + /// cache. + /// \param Img is the main image the program is built with. + /// \param Context is the context the program is built for. + /// \param Devs is a vector of devices the program is built for. + /// \param DeviceImagesToLink is a set of image dependencies required by the + /// main image. + /// \param AllImages is a vector of all images the program is built with. + /// \param DeviceImageImpl is an optional device_image_impl pointer that + /// represents the main image. + /// \param SpecConsts is an optional parameter containing spec constant values + /// the program should be built with. 
+ ur_program_handle_t getBuiltURProgram( + const RTDeviceBinaryImage &Img, const context &Context, + const std::vector &Devs, + const std::set &DeviceImagesToLink, + const std::vector &AllImages, + const std::shared_ptr &DeviceImageImpl = nullptr, + const SerializedObj &SpecConsts = {}); std::tuple diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index 574e0ae064afb..84c2af5bdfabe 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -3613,10 +3613,21 @@ ur_result_t ExecCGCommand::enqueueImpQueue() { return UR_RESULT_SUCCESS; } - case CGType::None: - throw sycl::exception(sycl::make_error_code(sycl::errc::runtime), - "CG type not implemented. " + - codeToString(UR_RESULT_ERROR_INVALID_OPERATION)); + case CGType::None: { + if (RawEvents.empty()) { + // urEnqueueEventsWait with zero events acts like a barrier which is NOT + // what we want here. On the other hand, there is nothing to wait for, so + // we don't need to enqueue anything. + return UR_RESULT_SUCCESS; + } + const detail::AdapterPtr &Adapter = MQueue->getAdapter(); + ur_event_handle_t Event; + ur_result_t Result = Adapter->call_nocheck( + MQueue->getHandleRef(), RawEvents.size(), + RawEvents.size() ? &RawEvents[0] : nullptr, &Event); + MEvent->setHandle(Event); + return Result; + } } return UR_RESULT_ERROR_INVALID_OPERATION; } diff --git a/sycl/source/handler.cpp b/sycl/source/handler.cpp index 476a6e373f2db..7cdfa2ab48fdb 100644 --- a/sycl/source/handler.cpp +++ b/sycl/source/handler.cpp @@ -498,21 +498,8 @@ event handler::finalize() { MCodeLoc)); break; case detail::CGType::None: - if (detail::ur::trace(detail::ur::TraceLevel::TRACE_ALL)) { - std::cout << "WARNING: An empty command group is submitted." 
<< std::endl; - } - - // Empty nodes are handled by Graph like standard nodes - // For Standard mode (non-graph), - // empty nodes are not sent to the scheduler to save time - if (impl->MGraph || (MQueue && MQueue->getCommandGraph())) { - CommandGroup.reset(new detail::CG(detail::CGType::None, - std::move(impl->CGData), MCodeLoc)); - } else { - detail::EventImplPtr Event = std::make_shared(); - MLastEvent = detail::createSyclObjFromImpl(Event); - return MLastEvent; - } + CommandGroup.reset(new detail::CG(detail::CGType::None, + std::move(impl->CGData), MCodeLoc)); break; } diff --git a/sycl/test-e2e/Plugin/adapter-release.cpp b/sycl/test-e2e/Adapters/adapter-release.cpp similarity index 100% rename from sycl/test-e2e/Plugin/adapter-release.cpp rename to sycl/test-e2e/Adapters/adapter-release.cpp diff --git a/sycl/test-e2e/Plugin/cuda-max-local-mem-size.cpp b/sycl/test-e2e/Adapters/cuda-max-local-mem-size.cpp similarity index 100% rename from sycl/test-e2e/Plugin/cuda-max-local-mem-size.cpp rename to sycl/test-e2e/Adapters/cuda-max-local-mem-size.cpp diff --git a/sycl/test-e2e/Plugin/cuda_queue_priority.cpp b/sycl/test-e2e/Adapters/cuda_queue_priority.cpp similarity index 100% rename from sycl/test-e2e/Plugin/cuda_queue_priority.cpp rename to sycl/test-e2e/Adapters/cuda_queue_priority.cpp diff --git a/sycl/test-e2e/Plugin/dll-detach-order.cpp b/sycl/test-e2e/Adapters/dll-detach-order.cpp similarity index 77% rename from sycl/test-e2e/Plugin/dll-detach-order.cpp rename to sycl/test-e2e/Adapters/dll-detach-order.cpp index e6e45f6dc4db5..72d014eb066bb 100644 --- a/sycl/test-e2e/Plugin/dll-detach-order.cpp +++ b/sycl/test-e2e/Adapters/dll-detach-order.cpp @@ -2,7 +2,8 @@ // REQUIRES: build-and-run-mode // RUN: env SYCL_UR_TRACE=-1 sycl-ls | FileCheck %s -// ensure that the plugins are detached AFTER urLoaderTearDown is done executing +// ensure that the adapters are detached AFTER urLoaderTearDown is done +// executing // CHECK: ---> DLL_PROCESS_DETACH syclx.dll diff 
--git a/sycl/test-e2e/Plugin/enqueue-arg-order-buffer.cpp b/sycl/test-e2e/Adapters/enqueue-arg-order-buffer.cpp similarity index 99% rename from sycl/test-e2e/Plugin/enqueue-arg-order-buffer.cpp rename to sycl/test-e2e/Adapters/enqueue-arg-order-buffer.cpp index ee744efe1bf47..468069e275a29 100644 --- a/sycl/test-e2e/Plugin/enqueue-arg-order-buffer.cpp +++ b/sycl/test-e2e/Adapters/enqueue-arg-order-buffer.cpp @@ -415,7 +415,7 @@ int main() { // CHECK-LABEL: start copyH2D-buffer // CHECK: <--- urEnqueueMemBufferWrite({{.*}} .size = 64, -// CHECK: <--- urEnqueueMemBufferWriteRect({{.*}} .region = (struct ur_rect_region_t){.width = 64, .height = 5, .depth = 1}, .bufferRowPitch = 64, .bufferSlicePitch = 0, .hostRowPitch = 64, +// CHECK: <--- urEnqueueMemBufferWriteRect({{.*}} .region = (struct ur_rect_region_t){.width = 64, .height = 5, .depth = 1}, .bufferRowPitch = 64, .bufferSlicePitch = 0, .hostRowPitch = 64, // CHECK: <--- urEnqueueMemBufferWriteRect({{.*}} .region = (struct ur_rect_region_t){.width = 64, .height = 5, .depth = 3}, .bufferRowPitch = 64, .bufferSlicePitch = 320, .hostRowPitch = 64, .hostSlicePitch = 320, // CHECK: end copyH2D-buffer diff --git a/sycl/test-e2e/Plugin/enqueue-arg-order-image.cpp b/sycl/test-e2e/Adapters/enqueue-arg-order-image.cpp similarity index 100% rename from sycl/test-e2e/Plugin/enqueue-arg-order-image.cpp rename to sycl/test-e2e/Adapters/enqueue-arg-order-image.cpp diff --git a/sycl/test-e2e/Plugin/interop-cuda-experimental.cpp b/sycl/test-e2e/Adapters/interop-cuda-experimental.cpp similarity index 100% rename from sycl/test-e2e/Plugin/interop-cuda-experimental.cpp rename to sycl/test-e2e/Adapters/interop-cuda-experimental.cpp diff --git a/sycl/test-e2e/Plugin/interop-device.cpp b/sycl/test-e2e/Adapters/interop-device.cpp similarity index 100% rename from sycl/test-e2e/Plugin/interop-device.cpp rename to sycl/test-e2e/Adapters/interop-device.cpp diff --git 
a/sycl/test-e2e/Plugin/interop-experimental-single-TU-SYCL-CUDA-compilation.cpp b/sycl/test-e2e/Adapters/interop-experimental-single-TU-SYCL-CUDA-compilation.cpp similarity index 100% rename from sycl/test-e2e/Plugin/interop-experimental-single-TU-SYCL-CUDA-compilation.cpp rename to sycl/test-e2e/Adapters/interop-experimental-single-TU-SYCL-CUDA-compilation.cpp diff --git a/sycl/test-e2e/Plugin/interop-l0-direct.cpp b/sycl/test-e2e/Adapters/interop-l0-direct.cpp similarity index 100% rename from sycl/test-e2e/Plugin/interop-l0-direct.cpp rename to sycl/test-e2e/Adapters/interop-l0-direct.cpp diff --git a/sycl/test-e2e/Plugin/interop-level-zero-buffer-helpers.hpp b/sycl/test-e2e/Adapters/interop-level-zero-buffer-helpers.hpp similarity index 100% rename from sycl/test-e2e/Plugin/interop-level-zero-buffer-helpers.hpp rename to sycl/test-e2e/Adapters/interop-level-zero-buffer-helpers.hpp diff --git a/sycl/test-e2e/Plugin/interop-level-zero-buffer-multi-dim.cpp b/sycl/test-e2e/Adapters/interop-level-zero-buffer-multi-dim.cpp similarity index 100% rename from sycl/test-e2e/Plugin/interop-level-zero-buffer-multi-dim.cpp rename to sycl/test-e2e/Adapters/interop-level-zero-buffer-multi-dim.cpp diff --git a/sycl/test-e2e/Plugin/interop-level-zero-buffer-ownership.cpp b/sycl/test-e2e/Adapters/interop-level-zero-buffer-ownership.cpp similarity index 98% rename from sycl/test-e2e/Plugin/interop-level-zero-buffer-ownership.cpp rename to sycl/test-e2e/Adapters/interop-level-zero-buffer-ownership.cpp index 7fa9c9895a38e..c1e72929ad489 100644 --- a/sycl/test-e2e/Plugin/interop-level-zero-buffer-ownership.cpp +++ b/sycl/test-e2e/Adapters/interop-level-zero-buffer-ownership.cpp @@ -1,5 +1,5 @@ // REQUIRES: gpu, level_zero, level_zero_dev_kit -// L0 plugin incorrectly reports memory leaks because it doesn't take into +// L0 adapter incorrectly reports memory leaks because it doesn't take into // account direct calls to L0 API. 
// UNSUPPORTED: ze_debug // RUN: %{build} %level_zero_options -o %t.out diff --git a/sycl/test-e2e/Plugin/interop-level-zero-buffer.cpp b/sycl/test-e2e/Adapters/interop-level-zero-buffer.cpp similarity index 99% rename from sycl/test-e2e/Plugin/interop-level-zero-buffer.cpp rename to sycl/test-e2e/Adapters/interop-level-zero-buffer.cpp index 9ec7f00a10819..897e854a25df7 100644 --- a/sycl/test-e2e/Plugin/interop-level-zero-buffer.cpp +++ b/sycl/test-e2e/Adapters/interop-level-zero-buffer.cpp @@ -1,5 +1,5 @@ // REQUIRES: level_zero, level_zero_dev_kit -// L0 plugin incorrectly reports memory leaks because it doesn't take into +// L0 adapter incorrectly reports memory leaks because it doesn't take into // account direct calls to L0 API. // UNSUPPORTED: ze_debug // RUN: %{build} -Wno-error=deprecated-declarations %level_zero_options -o %t.out diff --git a/sycl/test-e2e/Plugin/interop-level-zero-device.cpp b/sycl/test-e2e/Adapters/interop-level-zero-device.cpp similarity index 100% rename from sycl/test-e2e/Plugin/interop-level-zero-device.cpp rename to sycl/test-e2e/Adapters/interop-level-zero-device.cpp diff --git a/sycl/test-e2e/Plugin/interop-level-zero-get-native-mem.cpp b/sycl/test-e2e/Adapters/interop-level-zero-get-native-mem.cpp similarity index 98% rename from sycl/test-e2e/Plugin/interop-level-zero-get-native-mem.cpp rename to sycl/test-e2e/Adapters/interop-level-zero-get-native-mem.cpp index 1f9bfd5b5f239..5a59e7d100306 100644 --- a/sycl/test-e2e/Plugin/interop-level-zero-get-native-mem.cpp +++ b/sycl/test-e2e/Adapters/interop-level-zero-get-native-mem.cpp @@ -1,5 +1,5 @@ // REQUIRES: level_zero, level_zero_dev_kit -// L0 plugin incorrectly reports memory leaks because it doesn't take into +// L0 adapter incorrectly reports memory leaks because it doesn't take into // account direct calls to L0 API. 
// UNSUPPORTED: ze_debug // RUN: %{build} %level_zero_options -o %t.out diff --git a/sycl/test-e2e/Plugin/interop-level-zero-image-get-native-mem.cpp b/sycl/test-e2e/Adapters/interop-level-zero-image-get-native-mem.cpp similarity index 100% rename from sycl/test-e2e/Plugin/interop-level-zero-image-get-native-mem.cpp rename to sycl/test-e2e/Adapters/interop-level-zero-image-get-native-mem.cpp diff --git a/sycl/test-e2e/Plugin/interop-level-zero-image-ownership.cpp b/sycl/test-e2e/Adapters/interop-level-zero-image-ownership.cpp similarity index 100% rename from sycl/test-e2e/Plugin/interop-level-zero-image-ownership.cpp rename to sycl/test-e2e/Adapters/interop-level-zero-image-ownership.cpp diff --git a/sycl/test-e2e/Plugin/interop-level-zero-image.cpp b/sycl/test-e2e/Adapters/interop-level-zero-image.cpp similarity index 99% rename from sycl/test-e2e/Plugin/interop-level-zero-image.cpp rename to sycl/test-e2e/Adapters/interop-level-zero-image.cpp index 8b1febce40c28..019a63b470538 100644 --- a/sycl/test-e2e/Plugin/interop-level-zero-image.cpp +++ b/sycl/test-e2e/Adapters/interop-level-zero-image.cpp @@ -116,7 +116,7 @@ int main() { } } // ~image - } // closure + } // closure { // ------ 2D ------ @@ -165,7 +165,7 @@ int main() { } } // ~image - } // closure + } // closure { // ------ 3D ------ @@ -216,7 +216,7 @@ int main() { } } // ~image - } // closure + } // closure #else std::cout << "Missing Level-Zero backend. Test skipped." 
<< std::endl; diff --git a/sycl/test-e2e/Plugin/interop-level-zero-keep-ownership.cpp b/sycl/test-e2e/Adapters/interop-level-zero-keep-ownership.cpp similarity index 100% rename from sycl/test-e2e/Plugin/interop-level-zero-keep-ownership.cpp rename to sycl/test-e2e/Adapters/interop-level-zero-keep-ownership.cpp diff --git a/sycl/test-e2e/Plugin/interop-level-zero-thread.cpp b/sycl/test-e2e/Adapters/interop-level-zero-thread.cpp similarity index 100% rename from sycl/test-e2e/Plugin/interop-level-zero-thread.cpp rename to sycl/test-e2e/Adapters/interop-level-zero-thread.cpp diff --git a/sycl/test-e2e/Plugin/interop-level-zero.cpp b/sycl/test-e2e/Adapters/interop-level-zero.cpp similarity index 100% rename from sycl/test-e2e/Plugin/interop-level-zero.cpp rename to sycl/test-e2e/Adapters/interop-level-zero.cpp diff --git a/sycl/test-e2e/Plugin/interop-opencl-make-kernel-bundle.cpp b/sycl/test-e2e/Adapters/interop-opencl-make-kernel-bundle.cpp similarity index 100% rename from sycl/test-e2e/Plugin/interop-opencl-make-kernel-bundle.cpp rename to sycl/test-e2e/Adapters/interop-opencl-make-kernel-bundle.cpp diff --git a/sycl/test-e2e/Plugin/interop-opencl-make-kernel.cpp b/sycl/test-e2e/Adapters/interop-opencl-make-kernel.cpp similarity index 100% rename from sycl/test-e2e/Plugin/interop-opencl-make-kernel.cpp rename to sycl/test-e2e/Adapters/interop-opencl-make-kernel.cpp diff --git a/sycl/test-e2e/Plugin/interop-opencl.cpp b/sycl/test-e2e/Adapters/interop-opencl.cpp similarity index 100% rename from sycl/test-e2e/Plugin/interop-opencl.cpp rename to sycl/test-e2e/Adapters/interop-opencl.cpp diff --git a/sycl/test-e2e/Plugin/level-zero-event-leak.cpp b/sycl/test-e2e/Adapters/level-zero-event-leak.cpp similarity index 100% rename from sycl/test-e2e/Plugin/level-zero-event-leak.cpp rename to sycl/test-e2e/Adapters/level-zero-event-leak.cpp diff --git a/sycl/test-e2e/Plugin/level-zero-usm-capabilities.cpp b/sycl/test-e2e/Adapters/level-zero-usm-capabilities.cpp similarity 
index 100% rename from sycl/test-e2e/Plugin/level-zero-usm-capabilities.cpp rename to sycl/test-e2e/Adapters/level-zero-usm-capabilities.cpp diff --git a/sycl/test-e2e/Plugin/level_zero_barrier_optimization.cpp b/sycl/test-e2e/Adapters/level_zero_barrier_optimization.cpp similarity index 100% rename from sycl/test-e2e/Plugin/level_zero_barrier_optimization.cpp rename to sycl/test-e2e/Adapters/level_zero_barrier_optimization.cpp diff --git a/sycl/test-e2e/Plugin/level_zero_batch_barrier.cpp b/sycl/test-e2e/Adapters/level_zero_batch_barrier.cpp similarity index 100% rename from sycl/test-e2e/Plugin/level_zero_batch_barrier.cpp rename to sycl/test-e2e/Adapters/level_zero_batch_barrier.cpp diff --git a/sycl/test-e2e/Plugin/level_zero_batch_event_status.cpp b/sycl/test-e2e/Adapters/level_zero_batch_event_status.cpp similarity index 97% rename from sycl/test-e2e/Plugin/level_zero_batch_event_status.cpp rename to sycl/test-e2e/Adapters/level_zero_batch_event_status.cpp index 57ba0134c88e1..91eefba54793c 100644 --- a/sycl/test-e2e/Plugin/level_zero_batch_event_status.cpp +++ b/sycl/test-e2e/Adapters/level_zero_batch_event_status.cpp @@ -8,7 +8,7 @@ // level_zero_batch_test.cpp // -// This tests the level zero plugin's kernel batching code. It specifically +// This tests the level zero adapter's kernel batching code. It specifically // tests that the current batch is submitted when an Event execution status // request is made. 
This test uses explicit SYCL_PI_LEVEL_ZERO_BATCH_SIZE=4 // to make sure that the batching is submitted when the urEventGetInfo is diff --git a/sycl/test-e2e/Plugin/level_zero_batch_test.cpp b/sycl/test-e2e/Adapters/level_zero_batch_test.cpp similarity index 99% rename from sycl/test-e2e/Plugin/level_zero_batch_test.cpp rename to sycl/test-e2e/Adapters/level_zero_batch_test.cpp index 792090a35e481..0a45743fd074f 100644 --- a/sycl/test-e2e/Plugin/level_zero_batch_test.cpp +++ b/sycl/test-e2e/Adapters/level_zero_batch_test.cpp @@ -73,7 +73,7 @@ // level_zero_batch_test.cpp // -// This tests the level zero plugin's kernel batching code. The default +// This tests the level zero adapter's kernel batching code. The default // batching is 4, and exact batch size can be controlled with environment // variable SYCL_PI_LEVEL_ZEOR+BATCH_SIZE=N. // This test enqueues 8 kernels and then does a wait. And it does this 3 times. diff --git a/sycl/test-e2e/Plugin/level_zero_batch_test_copy_with_compute.cpp b/sycl/test-e2e/Adapters/level_zero_batch_test_copy_with_compute.cpp similarity index 99% rename from sycl/test-e2e/Plugin/level_zero_batch_test_copy_with_compute.cpp rename to sycl/test-e2e/Adapters/level_zero_batch_test_copy_with_compute.cpp index 2ebe6aa0c748a..6742000dd2f84 100644 --- a/sycl/test-e2e/Plugin/level_zero_batch_test_copy_with_compute.cpp +++ b/sycl/test-e2e/Adapters/level_zero_batch_test_copy_with_compute.cpp @@ -26,7 +26,7 @@ // level_zero_batch_test_copy_with_compute.cpp // -// This tests the level zero plugin's kernel batching code. The default +// This tests the level zero adapter's kernel batching code. The default // batching is 4, and exact batch size can be controlled with environment // variable SYCL_PI_LEVEL_ZERO_{COPY_}BATCH_SIZE=N. // This test enqueues 8 kernels and then does a wait. And it does this 3 times. 
diff --git a/sycl/test-e2e/Plugin/level_zero_device_free_mem.cpp b/sycl/test-e2e/Adapters/level_zero_device_free_mem.cpp similarity index 97% rename from sycl/test-e2e/Plugin/level_zero_device_free_mem.cpp rename to sycl/test-e2e/Adapters/level_zero_device_free_mem.cpp index 62b32c90544b6..d6301bf8b93af 100644 --- a/sycl/test-e2e/Plugin/level_zero_device_free_mem.cpp +++ b/sycl/test-e2e/Adapters/level_zero_device_free_mem.cpp @@ -1,6 +1,6 @@ // https://github.com/intel/llvm/issues/14244 -// sycl-ls --verbose shows the "ext_intel_free_memory" aspect if -// ZES_ENABLE_SYSMAN=1 is set, but also is sysman init is supported, +// sycl-ls --verbose shows the "ext_intel_free_memory" aspect if +// ZES_ENABLE_SYSMAN=1 is set, but also is sysman init is supported, // so this test is missed if it requires aspect-ext_intel_free_memory. // Since gen9 and get12 don't support this query, diff --git a/sycl/test-e2e/Plugin/level_zero_device_memory_clock_rate_and_bus_width.cpp b/sycl/test-e2e/Adapters/level_zero_device_memory_clock_rate_and_bus_width.cpp similarity index 100% rename from sycl/test-e2e/Plugin/level_zero_device_memory_clock_rate_and_bus_width.cpp rename to sycl/test-e2e/Adapters/level_zero_device_memory_clock_rate_and_bus_width.cpp diff --git a/sycl/test-e2e/Plugin/level_zero_device_scope_events.cpp b/sycl/test-e2e/Adapters/level_zero_device_scope_events.cpp similarity index 100% rename from sycl/test-e2e/Plugin/level_zero_device_scope_events.cpp rename to sycl/test-e2e/Adapters/level_zero_device_scope_events.cpp diff --git a/sycl/test-e2e/Plugin/level_zero_dynamic_batch_test.cpp b/sycl/test-e2e/Adapters/level_zero_dynamic_batch_test.cpp similarity index 99% rename from sycl/test-e2e/Plugin/level_zero_dynamic_batch_test.cpp rename to sycl/test-e2e/Adapters/level_zero_dynamic_batch_test.cpp index fbe7d8e27cb55..106043eddc3bf 100644 --- a/sycl/test-e2e/Plugin/level_zero_dynamic_batch_test.cpp +++ b/sycl/test-e2e/Adapters/level_zero_dynamic_batch_test.cpp @@ -12,7 +12,7 
@@ // level_zero_dynamic_batch_test.cpp // -// This tests the level zero plugin's kernel dynamic batch size adjustment +// This tests the level zero adapter's kernel dynamic batch size adjustment // code. // It starts out by enqueing 40 kernels before it does a wait, and it does // this 5 times. That should cause the dynamic batch size adjustment to diff --git a/sycl/test-e2e/Plugin/level_zero_eager_init.cpp b/sycl/test-e2e/Adapters/level_zero_eager_init.cpp similarity index 100% rename from sycl/test-e2e/Plugin/level_zero_eager_init.cpp rename to sycl/test-e2e/Adapters/level_zero_eager_init.cpp diff --git a/sycl/test-e2e/Plugin/level_zero_events_caching.cpp b/sycl/test-e2e/Adapters/level_zero_events_caching.cpp similarity index 95% rename from sycl/test-e2e/Plugin/level_zero_events_caching.cpp rename to sycl/test-e2e/Adapters/level_zero_events_caching.cpp index caa107a2eaf6f..40e0349f5d356 100644 --- a/sycl/test-e2e/Plugin/level_zero_events_caching.cpp +++ b/sycl/test-e2e/Adapters/level_zero_events_caching.cpp @@ -13,7 +13,7 @@ // CACHING-ENABLED: zeEventCreate = {{[1-9]}} // CACHING-DISABLED: zeEventCreate = 256 -// Check event caching modes in the L0 plugin. +// Check event caching modes in the L0 adapter. 
#include diff --git a/sycl/test-e2e/Plugin/level_zero_events_caching_leak.cpp b/sycl/test-e2e/Adapters/level_zero_events_caching_leak.cpp similarity index 100% rename from sycl/test-e2e/Plugin/level_zero_events_caching_leak.cpp rename to sycl/test-e2e/Adapters/level_zero_events_caching_leak.cpp diff --git a/sycl/test-e2e/Plugin/level_zero_ext_intel_cslice.cpp b/sycl/test-e2e/Adapters/level_zero_ext_intel_cslice.cpp similarity index 100% rename from sycl/test-e2e/Plugin/level_zero_ext_intel_cslice.cpp rename to sycl/test-e2e/Adapters/level_zero_ext_intel_cslice.cpp diff --git a/sycl/test-e2e/Plugin/level_zero_ext_intel_queue_index.cpp b/sycl/test-e2e/Adapters/level_zero_ext_intel_queue_index.cpp similarity index 100% rename from sycl/test-e2e/Plugin/level_zero_ext_intel_queue_index.cpp rename to sycl/test-e2e/Adapters/level_zero_ext_intel_queue_index.cpp diff --git a/sycl/test-e2e/Plugin/level_zero_imm_cmdlist.cpp b/sycl/test-e2e/Adapters/level_zero_imm_cmdlist.cpp similarity index 100% rename from sycl/test-e2e/Plugin/level_zero_imm_cmdlist.cpp rename to sycl/test-e2e/Adapters/level_zero_imm_cmdlist.cpp diff --git a/sycl/test-e2e/Plugin/level_zero_imm_cmdlist_per_thread.cpp b/sycl/test-e2e/Adapters/level_zero_imm_cmdlist_per_thread.cpp similarity index 93% rename from sycl/test-e2e/Plugin/level_zero_imm_cmdlist_per_thread.cpp rename to sycl/test-e2e/Adapters/level_zero_imm_cmdlist_per_thread.cpp index 153af5c5a1f9d..6fe8d3c98ff64 100644 --- a/sycl/test-e2e/Plugin/level_zero_imm_cmdlist_per_thread.cpp +++ b/sycl/test-e2e/Adapters/level_zero_imm_cmdlist_per_thread.cpp @@ -74,10 +74,10 @@ int main() { // Create one queue auto D = Queue.get_device(); const char *devType = D.is_cpu() ? 
"CPU" : "GPU"; - std::string pluginName = D.get_platform().get_info(); + std::string adapterName = D.get_platform().get_info(); std::cout << "Running on device " << devType << " (" - << D.get_info() << ") " << pluginName - << " plugin\n"; + << D.get_info() << ") " << adapterName + << " adapter\n"; // Use queue in multiple threads std::thread T1(run_sample, Queue, 0); diff --git a/sycl/test-e2e/Plugin/level_zero_inorder.cpp b/sycl/test-e2e/Adapters/level_zero_inorder.cpp similarity index 100% rename from sycl/test-e2e/Plugin/level_zero_inorder.cpp rename to sycl/test-e2e/Adapters/level_zero_inorder.cpp diff --git a/sycl/test-e2e/Plugin/level_zero_inorder_interleaving_kernel_copy.cpp b/sycl/test-e2e/Adapters/level_zero_inorder_interleaving_kernel_copy.cpp similarity index 100% rename from sycl/test-e2e/Plugin/level_zero_inorder_interleaving_kernel_copy.cpp rename to sycl/test-e2e/Adapters/level_zero_inorder_interleaving_kernel_copy.cpp diff --git a/sycl/test-e2e/Plugin/level_zero_interop_memcpy.cpp b/sycl/test-e2e/Adapters/level_zero_interop_memcpy.cpp similarity index 100% rename from sycl/test-e2e/Plugin/level_zero_interop_memcpy.cpp rename to sycl/test-e2e/Adapters/level_zero_interop_memcpy.cpp diff --git a/sycl/test-e2e/Plugin/level_zero_memory_fill.cpp b/sycl/test-e2e/Adapters/level_zero_memory_fill.cpp similarity index 100% rename from sycl/test-e2e/Plugin/level_zero_memory_fill.cpp rename to sycl/test-e2e/Adapters/level_zero_memory_fill.cpp diff --git a/sycl/test-e2e/Plugin/level_zero_queue_create.cpp b/sycl/test-e2e/Adapters/level_zero_queue_create.cpp similarity index 100% rename from sycl/test-e2e/Plugin/level_zero_queue_create.cpp rename to sycl/test-e2e/Adapters/level_zero_queue_create.cpp diff --git a/sycl/test-e2e/Plugin/level_zero_queue_priority.cpp b/sycl/test-e2e/Adapters/level_zero_queue_priority.cpp similarity index 100% rename from sycl/test-e2e/Plugin/level_zero_queue_priority.cpp rename to sycl/test-e2e/Adapters/level_zero_queue_priority.cpp 
diff --git a/sycl/test-e2e/Plugin/level_zero_queue_profiling.cpp b/sycl/test-e2e/Adapters/level_zero_queue_profiling.cpp similarity index 100% rename from sycl/test-e2e/Plugin/level_zero_queue_profiling.cpp rename to sycl/test-e2e/Adapters/level_zero_queue_profiling.cpp diff --git a/sycl/test-e2e/Plugin/level_zero_sub_sub_device.cpp b/sycl/test-e2e/Adapters/level_zero_sub_sub_device.cpp similarity index 100% rename from sycl/test-e2e/Plugin/level_zero_sub_sub_device.cpp rename to sycl/test-e2e/Adapters/level_zero_sub_sub_device.cpp diff --git a/sycl/test-e2e/Plugin/level_zero_track_indirect_access_memory.cpp b/sycl/test-e2e/Adapters/level_zero_track_indirect_access_memory.cpp similarity index 97% rename from sycl/test-e2e/Plugin/level_zero_track_indirect_access_memory.cpp rename to sycl/test-e2e/Adapters/level_zero_track_indirect_access_memory.cpp index 40cfee5b66c0e..12ac686b2f4db 100644 --- a/sycl/test-e2e/Plugin/level_zero_track_indirect_access_memory.cpp +++ b/sycl/test-e2e/Adapters/level_zero_track_indirect_access_memory.cpp @@ -9,7 +9,7 @@ // Tracking and deferred release is necessary for memory which can be indirectly // accessed because such memory can't be released as soon as someone calls free. // It can be released only after completion of all kernels which can possibly -// access this memory indirectly. Currently the Level Zero plugin marks all +// access this memory indirectly. Currently the Level Zero adapter marks all // kernels with indirect access flag conservatively. This flag means that kernel // starts to reference all existing memory allocations (even if not explicitly // used in the kernel) as soon as it is submitted. 
That's why basically all diff --git a/sycl/test-e2e/Plugin/level_zero_usm_device_read_only.cpp b/sycl/test-e2e/Adapters/level_zero_usm_device_read_only.cpp similarity index 100% rename from sycl/test-e2e/Plugin/level_zero_usm_device_read_only.cpp rename to sycl/test-e2e/Adapters/level_zero_usm_device_read_only.cpp diff --git a/sycl/test-e2e/Plugin/level_zero_usm_residency.cpp b/sycl/test-e2e/Adapters/level_zero_usm_residency.cpp similarity index 100% rename from sycl/test-e2e/Plugin/level_zero_usm_residency.cpp rename to sycl/test-e2e/Adapters/level_zero_usm_residency.cpp diff --git a/sycl/test-e2e/Plugin/level_zero_uuid.cpp b/sycl/test-e2e/Adapters/level_zero_uuid.cpp similarity index 100% rename from sycl/test-e2e/Plugin/level_zero_uuid.cpp rename to sycl/test-e2e/Adapters/level_zero_uuid.cpp diff --git a/sycl/test-e2e/Plugin/max_malloc.cpp b/sycl/test-e2e/Adapters/max_malloc.cpp similarity index 100% rename from sycl/test-e2e/Plugin/max_malloc.cpp rename to sycl/test-e2e/Adapters/max_malloc.cpp diff --git a/sycl/test-e2e/Plugin/queue_submit_mode.cpp b/sycl/test-e2e/Adapters/queue_submit_mode.cpp similarity index 100% rename from sycl/test-e2e/Plugin/queue_submit_mode.cpp rename to sycl/test-e2e/Adapters/queue_submit_mode.cpp diff --git a/sycl/test-e2e/Plugin/retain_events.cpp b/sycl/test-e2e/Adapters/retain_events.cpp similarity index 100% rename from sycl/test-e2e/Plugin/retain_events.cpp rename to sycl/test-e2e/Adapters/retain_events.cpp diff --git a/sycl/test-e2e/Plugin/sycl-ls-gpu-cuda.cpp b/sycl/test-e2e/Adapters/sycl-ls-gpu-cuda.cpp similarity index 100% rename from sycl/test-e2e/Plugin/sycl-ls-gpu-cuda.cpp rename to sycl/test-e2e/Adapters/sycl-ls-gpu-cuda.cpp diff --git a/sycl/test-e2e/Plugin/sycl-ls-gpu-default-any.cpp b/sycl/test-e2e/Adapters/sycl-ls-gpu-default-any.cpp similarity index 100% rename from sycl/test-e2e/Plugin/sycl-ls-gpu-default-any.cpp rename to sycl/test-e2e/Adapters/sycl-ls-gpu-default-any.cpp diff --git 
a/sycl/test-e2e/Plugin/sycl-ls-gpu-default-level-zero.cpp b/sycl/test-e2e/Adapters/sycl-ls-gpu-default-level-zero.cpp similarity index 100% rename from sycl/test-e2e/Plugin/sycl-ls-gpu-default-level-zero.cpp rename to sycl/test-e2e/Adapters/sycl-ls-gpu-default-level-zero.cpp diff --git a/sycl/test-e2e/Plugin/sycl-ls-gpu-hip.cpp b/sycl/test-e2e/Adapters/sycl-ls-gpu-hip.cpp similarity index 100% rename from sycl/test-e2e/Plugin/sycl-ls-gpu-hip.cpp rename to sycl/test-e2e/Adapters/sycl-ls-gpu-hip.cpp diff --git a/sycl/test-e2e/Plugin/sycl-ls-gpu-level-zero.cpp b/sycl/test-e2e/Adapters/sycl-ls-gpu-level-zero.cpp similarity index 100% rename from sycl/test-e2e/Plugin/sycl-ls-gpu-level-zero.cpp rename to sycl/test-e2e/Adapters/sycl-ls-gpu-level-zero.cpp diff --git a/sycl/test-e2e/Plugin/sycl-ls-gpu-opencl.cpp b/sycl/test-e2e/Adapters/sycl-ls-gpu-opencl.cpp similarity index 100% rename from sycl/test-e2e/Plugin/sycl-ls-gpu-opencl.cpp rename to sycl/test-e2e/Adapters/sycl-ls-gpu-opencl.cpp diff --git a/sycl/test-e2e/Plugin/sycl-ls-gpu-sycl-be.cpp b/sycl/test-e2e/Adapters/sycl-ls-gpu-sycl-be.cpp similarity index 100% rename from sycl/test-e2e/Plugin/sycl-ls-gpu-sycl-be.cpp rename to sycl/test-e2e/Adapters/sycl-ls-gpu-sycl-be.cpp diff --git a/sycl/test-e2e/Plugin/sycl-ls-uuid-subdevs.cpp b/sycl/test-e2e/Adapters/sycl-ls-uuid-subdevs.cpp similarity index 100% rename from sycl/test-e2e/Plugin/sycl-ls-uuid-subdevs.cpp rename to sycl/test-e2e/Adapters/sycl-ls-uuid-subdevs.cpp diff --git a/sycl/test-e2e/Plugin/sycl-ls.cpp b/sycl/test-e2e/Adapters/sycl-ls.cpp similarity index 100% rename from sycl/test-e2e/Plugin/sycl-ls.cpp rename to sycl/test-e2e/Adapters/sycl-ls.cpp diff --git a/sycl/test-e2e/Plugin/sycl-partition-info.cpp b/sycl/test-e2e/Adapters/sycl-partition-info.cpp similarity index 89% rename from sycl/test-e2e/Plugin/sycl-partition-info.cpp rename to sycl/test-e2e/Adapters/sycl-partition-info.cpp index e6f911f541d3e..8274c2bb0459f 100644 --- 
a/sycl/test-e2e/Plugin/sycl-partition-info.cpp +++ b/sycl/test-e2e/Adapters/sycl-partition-info.cpp @@ -1,7 +1,7 @@ // RUN: %{build} -o %t.out // RUN: %{run} %t.out -// various plugins may return a larger choice of partition properties than SYCL +// various adapters may return a larger choice of partition properties than SYCL // supports ensure we are only returning SYCL standard partition properties. #include diff --git a/sycl/test-e2e/Plugin/sycl-targets-order.cpp b/sycl/test-e2e/Adapters/sycl-targets-order.cpp similarity index 100% rename from sycl/test-e2e/Plugin/sycl-targets-order.cpp rename to sycl/test-e2e/Adapters/sycl-targets-order.cpp diff --git a/sycl/test-e2e/AddressSanitizer/common/ignorelist.txt b/sycl/test-e2e/AddressSanitizer/common/ignorelist.txt new file mode 100644 index 0000000000000..7251d5e24767d --- /dev/null +++ b/sycl/test-e2e/AddressSanitizer/common/ignorelist.txt @@ -0,0 +1 @@ +fun:*MyKernel* diff --git a/sycl/test-e2e/AddressSanitizer/common/kernel-filter.cpp b/sycl/test-e2e/AddressSanitizer/common/kernel-filter.cpp new file mode 100644 index 0000000000000..2830241eda489 --- /dev/null +++ b/sycl/test-e2e/AddressSanitizer/common/kernel-filter.cpp @@ -0,0 +1,27 @@ +// REQUIRES: linux +// RUN: %{build} %device_asan_flags -O2 -fsanitize-ignorelist=%p/ignorelist.txt -o %t +// RUN: %{run} %t 2>&1 | FileCheck %s +// RUN: %{build} %device_asan_flags %if cpu %{ -fsycl-targets=spir64_x86_64 %} %if gpu %{ -fsycl-targets=spir64_gen -Xsycl-target-backend=spir64_gen %gpu_aot_target_opts %} -O2 -fsanitize-ignorelist=%p/ignorelist.txt -o %t2 +// RUN: %{run} %t2 2>&1 | FileCheck %s + +#include +#include + +int main() { + sycl::queue Q; + constexpr std::size_t N = 16; + auto *array = sycl::malloc_device(N, Q); + + Q.submit([&](sycl::handler &h) { + h.parallel_for( + sycl::nd_range<1>(N + 1, 1), + [=](sycl::nd_item<1> item) { ++array[item.get_global_id(0)]; }); + }); + Q.wait(); + + sycl::free(array, Q); + std::cout << "PASS" << std::endl; + return 0; +} + 
+// CHECK: PASS diff --git a/sycl/test-e2e/Assert/assert_in_simultaneous_kernels.cpp b/sycl/test-e2e/Assert/assert_in_simultaneous_kernels.cpp index 69b6197921322..15854048d275b 100644 --- a/sycl/test-e2e/Assert/assert_in_simultaneous_kernels.cpp +++ b/sycl/test-e2e/Assert/assert_in_simultaneous_kernels.cpp @@ -7,7 +7,7 @@ // XFAIL-TRACKER: https://github.com/intel/llvm/issues/11364 // // Since this is a multi-threaded application enable memory tracking and -// deferred release feature in the Level Zero plugin to avoid releasing memory +// deferred release feature in the Level Zero adapter to avoid releasing memory // too early. This is necessary because currently SYCL RT sets indirect access // flag for all kernels and the Level Zero runtime doesn't support deferred // release yet. diff --git a/sycl/test-e2e/Assert/assert_in_simultaneous_kernels_win.cpp b/sycl/test-e2e/Assert/assert_in_simultaneous_kernels_win.cpp index f3b9bd9fc645f..8ca6db17ae348 100644 --- a/sycl/test-e2e/Assert/assert_in_simultaneous_kernels_win.cpp +++ b/sycl/test-e2e/Assert/assert_in_simultaneous_kernels_win.cpp @@ -2,7 +2,7 @@ // RUN: %{build} -DSYCL_FALLBACK_ASSERT=1 -o %t.out %threads_lib // // Since this is a multi-threaded application enable memory tracking and -// deferred release feature in the Level Zero plugin to avoid releasing memory +// deferred release feature in the Level Zero adapter to avoid releasing memory // too early. This is necessary because currently SYCL RT sets indirect access // flag for all kernels and the Level Zero runtime doesn't support deferred // release yet. 
diff --git a/sycl/test-e2e/Assert/assert_in_simultaneously_multiple_tus.cpp b/sycl/test-e2e/Assert/assert_in_simultaneously_multiple_tus.cpp index 0ee0b2f7a4ad0..d43c806952990 100644 --- a/sycl/test-e2e/Assert/assert_in_simultaneously_multiple_tus.cpp +++ b/sycl/test-e2e/Assert/assert_in_simultaneously_multiple_tus.cpp @@ -7,7 +7,7 @@ // RUN: %{build} -DSYCL_FALLBACK_ASSERT=1 -I %S/Inputs %S/Inputs/kernels_in_file2.cpp -o %t.out %threads_lib // // Since this is a multi-threaded application enable memory tracking and -// deferred release feature in the Level Zero plugin to avoid releasing memory +// deferred release feature in the Level Zero adapter to avoid releasing memory // too early. This is necessary because currently SYCL RT sets indirect access // flag for all kernels and the Level Zero runtime doesn't support deferred // release yet. diff --git a/sycl/test-e2e/Assert/assert_in_simultaneously_multiple_tus_one_ndebug.cpp b/sycl/test-e2e/Assert/assert_in_simultaneously_multiple_tus_one_ndebug.cpp index 6cc3b6c85d283..731187fe4cdf7 100644 --- a/sycl/test-e2e/Assert/assert_in_simultaneously_multiple_tus_one_ndebug.cpp +++ b/sycl/test-e2e/Assert/assert_in_simultaneously_multiple_tus_one_ndebug.cpp @@ -9,7 +9,7 @@ // RUN: %if cpu %{ %{run} %t.out &> %t.cpu.txt ; FileCheck %s --input-file %t.cpu.txt %} // // Since this is a multi-threaded application enable memory tracking and -// deferred release feature in the Level Zero plugin to avoid releasing memory +// deferred release feature in the Level Zero adapter to avoid releasing memory // too early. This is necessary because currently SYCL RT sets indirect access // flag for all kernels and the Level Zero runtime doesn't support deferred // release yet. 
diff --git a/sycl/test-e2e/Basic/built-ins/marray_common.cpp b/sycl/test-e2e/Basic/built-ins/marray_common.cpp index 921e5fed0f6b4..66f0c02b01427 100644 --- a/sycl/test-e2e/Basic/built-ins/marray_common.cpp +++ b/sycl/test-e2e/Basic/built-ins/marray_common.cpp @@ -2,7 +2,7 @@ // RUN: %{build} %{mathflags} -o %t.out // RUN: %{run} %t.out -// RUN: %if preview-breaking-changes-supported %{ %{build} -fpreview-breaking-changes -o %t_preview.out %} +// RUN: %if preview-breaking-changes-supported %{ %{build} %{mathflags} -fpreview-breaking-changes -o %t_preview.out %} // RUN: %if preview-breaking-changes-supported %{ %{run} %t_preview.out%} #ifdef _WIN32 diff --git a/sycl/test-e2e/Basic/empty_command.cpp b/sycl/test-e2e/Basic/empty_command.cpp index 313ca81743c36..dac5865ae8d72 100644 --- a/sycl/test-e2e/Basic/empty_command.cpp +++ b/sycl/test-e2e/Basic/empty_command.cpp @@ -26,18 +26,11 @@ void test_host_task_dep() { auto empty_cg_event = q.submit([&](handler &cgh) { cgh.depends_on(host_event); }); - // FIXME: This should deadlock, but the dependency is ignored currently. - empty_cg_event.wait(); - assert(x == 0); start_execution.count_down(); empty_cg_event.wait(); - // FIXME: uncomment once the bug mentioned above is fixed. - // assert(x == 42); - - // I'm seeing some weird hang without this: - host_event.wait(); + assert(x == 42); } void test_device_event_dep() { @@ -53,17 +46,12 @@ void test_device_event_dep() { auto empty_cg_event = q.submit([&](handler &cgh) { cgh.depends_on(device_event); }); - // FIXME: This should deadlock, but the dependency is ignored currently. - empty_cg_event.wait(); - assert(*p == 0); start_execution.count_down(); empty_cg_event.wait(); - // FIXME: uncomment once the bug mentioned above is fixed. 
- // assert(*p == 42); + assert(*p == 42); - q.wait(); sycl::free(p, q); } @@ -90,17 +78,12 @@ void test_accessor_dep() { auto empty_cg_event = q.submit([&](handler &cgh) { sycl::accessor a{b, cgh}; }); - // FIXME: This should deadlock, but the dependency is ignored currently. - empty_cg_event.wait(); - assert(*p == 0); start_execution.count_down(); empty_cg_event.wait(); - // FIXME: uncomment once the bug mentioned above is fixed. - // assert(*p == 42); + assert(*p == 42); - q.wait(); sycl::free(p, q); } diff --git a/sycl/test-e2e/Basic/image/srgba-aspect-check.cpp b/sycl/test-e2e/Basic/image/srgba-aspect-check.cpp index 5c892d716fcc5..a93cae1e64d5a 100644 --- a/sycl/test-e2e/Basic/image/srgba-aspect-check.cpp +++ b/sycl/test-e2e/Basic/image/srgba-aspect-check.cpp @@ -5,7 +5,7 @@ int main() { sycl::queue q; - // Ensure that plugins that don't implement the support can still handle the + // Ensure that adapters that don't implement the support can still handle the // aspect query. std::ignore = q.get_device().has(sycl::aspect::ext_oneapi_srgb); return 0; diff --git a/sycl/test-e2e/Basic/info.cpp b/sycl/test-e2e/Basic/info.cpp index 3fd83be26e994..4a43817646340 100644 --- a/sycl/test-e2e/Basic/info.cpp +++ b/sycl/test-e2e/Basic/info.cpp @@ -208,6 +208,7 @@ int main() { std::string separator(std::string(80, '-') + "\n"); std::cout << separator << "Device information\n" << separator; device dev(default_selector_v); + backend backend{dev.get_backend()}; print_info(dev, "Device type"); print_info(dev, "Vendor ID"); @@ -322,7 +323,14 @@ int main() { print_info(dev, "Name"); print_info(dev, "Vendor"); print_info(dev, "Driver version"); - print_info(dev, "Profile"); + try { + print_info(dev, "Profile"); + assert(backend == sycl::backend::opencl && + "An exception is expected for non OpenCL backend"); + } catch (const sycl::exception &e) { + assert(e.code() == sycl::errc::invalid && + backend != sycl::backend::opencl && "Unexpected exception"); + } print_info(dev, 
"Version"); print_info(dev, "Backend version"); @@ -332,11 +340,18 @@ int main() { "Extensions"); print_info(dev, "Printf buffer size"); - print_info( - dev, "Preferred interop user sync"); + try { + print_info( + dev, "Preferred interop user sync"); + assert(backend == sycl::backend::opencl && + "An exception is expected for non OpenCL backend"); + } catch (const sycl::exception &e) { + assert(e.code() == sycl::errc::invalid && + backend != sycl::backend::opencl && "Unexpected exception"); + } try { print_info(dev, "Parent device"); - } catch (sycl::exception e) { + } catch (const sycl::exception &e) { std::cout << "Expected exception has been caught: " << e.what() << std::endl; } diff --git a/sycl/test-e2e/Basic/queue/release.cpp b/sycl/test-e2e/Basic/queue/release.cpp index aef5160123ce4..d241b742a0216 100644 --- a/sycl/test-e2e/Basic/queue/release.cpp +++ b/sycl/test-e2e/Basic/queue/release.cpp @@ -14,7 +14,7 @@ int main() { } // CHECK: <--- urEnqueueKernelLaunch( -// FIXME the order of these 2 varies between plugins due to a Level Zero +// FIXME the order of these 2 varies between adapters due to a Level Zero // specific queue workaround. 
// CHECK-DAG: <--- urEventRelease( // CHECK-DAG: <--- urQueueRelease( diff --git a/sycl/test-e2e/ESIMD/InlineAsm/asm_glb.cpp b/sycl/test-e2e/ESIMD/InlineAsm/asm_glb.cpp index 03fa6a39cea13..88285426ef900 100644 --- a/sycl/test-e2e/ESIMD/InlineAsm/asm_glb.cpp +++ b/sycl/test-e2e/ESIMD/InlineAsm/asm_glb.cpp @@ -43,7 +43,8 @@ int main(void) { queue q(esimd_test::ESIMDSelector, esimd_test::createExceptionHandler()); auto dev = q.get_device(); - std::cout << "Running on " << dev.get_info() << "\n"; + std::cout << "Running on " << dev.get_info() + << "\n"; auto e = q.submit([&](handler &cgh) { auto PA = bufa.get_access(cgh); diff --git a/sycl/test-e2e/ESIMD/addc.cpp b/sycl/test-e2e/ESIMD/addc.cpp index 30a73d900ea3f..956b860e97b53 100644 --- a/sycl/test-e2e/ESIMD/addc.cpp +++ b/sycl/test-e2e/ESIMD/addc.cpp @@ -138,7 +138,7 @@ template bool test(sycl::queue Q) { int main() { queue Q(esimd_test::ESIMDSelector, esimd_test::createExceptionHandler()); auto D = Q.get_device(); - std::cout << "Running on " << D.get_info() << "\n"; + std::cout << "Running on " << D.get_info() << "\n"; constexpr bool AIsVector = true; constexpr bool BIsVector = true; diff --git a/sycl/test-e2e/ESIMD/api/functional/operators/operator_assignment_glb.cpp b/sycl/test-e2e/ESIMD/api/functional/operators/operator_assignment_glb.cpp index 999ab2f91bfec..11a3a02aede09 100644 --- a/sycl/test-e2e/ESIMD/api/functional/operators/operator_assignment_glb.cpp +++ b/sycl/test-e2e/ESIMD/api/functional/operators/operator_assignment_glb.cpp @@ -46,7 +46,8 @@ int main(void) { queue q(esimd_test::ESIMDSelector, esimd_test::createExceptionHandler()); auto dev = q.get_device(); - std::cout << "Running on " << dev.get_info() << "\n"; + std::cout << "Running on " << dev.get_info() + << "\n"; auto e = q.submit([&](handler &cgh) { auto PA = bufa.get_access(cgh); diff --git a/sycl/test-e2e/ESIMD/api/functional/operators/operator_assignment_glb_mask.cpp 
b/sycl/test-e2e/ESIMD/api/functional/operators/operator_assignment_glb_mask.cpp index 8b2b46e046ee7..4537ddd210cb9 100644 --- a/sycl/test-e2e/ESIMD/api/functional/operators/operator_assignment_glb_mask.cpp +++ b/sycl/test-e2e/ESIMD/api/functional/operators/operator_assignment_glb_mask.cpp @@ -44,7 +44,8 @@ int main(void) { queue q(esimd_test::ESIMDSelector, esimd_test::createExceptionHandler()); auto dev = q.get_device(); - std::cout << "Running on " << dev.get_info() << "\n"; + std::cout << "Running on " << dev.get_info() + << "\n"; auto e = q.submit([&](handler &cgh) { auto PA = bufa.get_access(cgh); diff --git a/sycl/test-e2e/ESIMD/dpas/dpas_bf16.cpp b/sycl/test-e2e/ESIMD/dpas/dpas_bf16.cpp index 876fe7c9466cf..578cca3eff771 100644 --- a/sycl/test-e2e/ESIMD/dpas/dpas_bf16.cpp +++ b/sycl/test-e2e/ESIMD/dpas/dpas_bf16.cpp @@ -16,7 +16,8 @@ int main(int argc, const char *argv[]) { queue Q(esimd_test::ESIMDSelector, esimd_test::createExceptionHandler()); auto Dev = Q.get_device(); - std::cout << "Running on " << Dev.get_info() << std::endl; + std::cout << "Running on " << Dev.get_info() + << std::endl; bool Print = argc > 1 && std::string(argv[1]) == "-debug"; bool Passed = true; diff --git a/sycl/test-e2e/ESIMD/dpas/dpas_fp16.cpp b/sycl/test-e2e/ESIMD/dpas/dpas_fp16.cpp index 278f8397018d3..601dc107ed9f8 100644 --- a/sycl/test-e2e/ESIMD/dpas/dpas_fp16.cpp +++ b/sycl/test-e2e/ESIMD/dpas/dpas_fp16.cpp @@ -16,7 +16,8 @@ int main(int argc, const char *argv[]) { queue Q(esimd_test::ESIMDSelector, esimd_test::createExceptionHandler()); auto Dev = Q.get_device(); - std::cout << "Running on " << Dev.get_info() << std::endl; + std::cout << "Running on " << Dev.get_info() + << std::endl; bool Print = argc > 1 && std::string(argv[1]) == "-debug"; bool Passed = true; diff --git a/sycl/test-e2e/ESIMD/dpas/dpas_int.cpp b/sycl/test-e2e/ESIMD/dpas/dpas_int.cpp index 80cd9a0f4be5f..6156456d1b387 100644 --- a/sycl/test-e2e/ESIMD/dpas/dpas_int.cpp +++ 
b/sycl/test-e2e/ESIMD/dpas/dpas_int.cpp @@ -16,7 +16,8 @@ int main(int argc, const char *argv[]) { queue Q(esimd_test::ESIMDSelector, esimd_test::createExceptionHandler()); auto Dev = Q.get_device(); - std::cout << "Running on " << Dev.get_info() << std::endl; + std::cout << "Running on " << Dev.get_info() + << std::endl; bool Print = argc > 1 && std::string(argv[1]) == "-debug"; bool Passed = true; diff --git a/sycl/test-e2e/ESIMD/imulh_umulh.cpp b/sycl/test-e2e/ESIMD/imulh_umulh.cpp index 9fba387948f07..fe74bf72d80a9 100644 --- a/sycl/test-e2e/ESIMD/imulh_umulh.cpp +++ b/sycl/test-e2e/ESIMD/imulh_umulh.cpp @@ -155,7 +155,7 @@ template bool tests(sycl::queue Q) { int main() { queue Q(esimd_test::ESIMDSelector, esimd_test::createExceptionHandler()); auto D = Q.get_device(); - std::cout << "Running on " << D.get_info() << "\n"; + std::cout << "Running on " << D.get_info() << "\n"; constexpr bool AIsVector = true; constexpr bool BIsVector = true; diff --git a/sycl/test-e2e/ESIMD/local_accessor_copy_to_from.cpp b/sycl/test-e2e/ESIMD/local_accessor_copy_to_from.cpp index 59c50e0547bfb..52318cb483688 100644 --- a/sycl/test-e2e/ESIMD/local_accessor_copy_to_from.cpp +++ b/sycl/test-e2e/ESIMD/local_accessor_copy_to_from.cpp @@ -96,7 +96,8 @@ int main() { queue q(esimd_test::ESIMDSelector, esimd_test::createExceptionHandler()); auto dev = q.get_device(); - std::cout << "Running on " << dev.get_info() << "\n"; + std::cout << "Running on " << dev.get_info() + << "\n"; bool passed = true; passed &= test(q); diff --git a/sycl/test-e2e/ESIMD/lsc/atomic_smoke.cpp b/sycl/test-e2e/ESIMD/lsc/atomic_smoke.cpp index 6fcbcef600557..78f40e4377be3 100644 --- a/sycl/test-e2e/ESIMD/lsc/atomic_smoke.cpp +++ b/sycl/test-e2e/ESIMD/lsc/atomic_smoke.cpp @@ -848,7 +848,8 @@ int main(void) { queue q(esimd_test::ESIMDSelector, esimd_test::createExceptionHandler()); auto dev = q.get_device(); - std::cout << "Running on " << dev.get_info() << "\n"; + std::cout << "Running on " << dev.get_info() + << 
"\n"; Config cfg{ 11, // int threads_per_group; diff --git a/sycl/test-e2e/ESIMD/lsc/local_accessor_atomic_smoke.cpp b/sycl/test-e2e/ESIMD/lsc/local_accessor_atomic_smoke.cpp index bb54ca891b4c7..e2eb04d883db3 100644 --- a/sycl/test-e2e/ESIMD/lsc/local_accessor_atomic_smoke.cpp +++ b/sycl/test-e2e/ESIMD/lsc/local_accessor_atomic_smoke.cpp @@ -627,7 +627,8 @@ int main(void) { queue q(esimd_test::ESIMDSelector, esimd_test::createExceptionHandler()); auto dev = q.get_device(); - std::cout << "Running on " << dev.get_info() << "\n"; + std::cout << "Running on " << dev.get_info() + << "\n"; bool passed = true; #ifndef CMPXCHG_TEST diff --git a/sycl/test-e2e/ESIMD/lsc/lsc_argument_type_deduction.cpp b/sycl/test-e2e/ESIMD/lsc/lsc_argument_type_deduction.cpp index 8067859151762..c289b7a61da92 100644 --- a/sycl/test-e2e/ESIMD/lsc/lsc_argument_type_deduction.cpp +++ b/sycl/test-e2e/ESIMD/lsc/lsc_argument_type_deduction.cpp @@ -102,7 +102,7 @@ int main() { queue q(esimd_test::ESIMDSelector, esimd_test::createExceptionHandler()); auto device = q.get_device(); - std::cout << "Device name: " << device.get_info() + std::cout << "Device name: " << device.get_info() << std::endl; int error = testUSM<8>(q); diff --git a/sycl/test-e2e/ESIMD/lsc/lsc_predicate.cpp b/sycl/test-e2e/ESIMD/lsc/lsc_predicate.cpp index f1e124dcf8a7e..af91f470002a1 100644 --- a/sycl/test-e2e/ESIMD/lsc/lsc_predicate.cpp +++ b/sycl/test-e2e/ESIMD/lsc/lsc_predicate.cpp @@ -142,7 +142,7 @@ int main() { auto q = queue{esimd_test::ESIMDSelector, esimd_test::createExceptionHandler()}; auto device = q.get_device(); - std::cout << "Device name: " << device.get_info() + std::cout << "Device name: " << device.get_info() << std::endl; int error = testUSM<8>(q); diff --git a/sycl/test-e2e/ESIMD/lsc/lsc_predicate_stateless.cpp b/sycl/test-e2e/ESIMD/lsc/lsc_predicate_stateless.cpp index 2a978721e070b..eba3f4cba5663 100644 --- a/sycl/test-e2e/ESIMD/lsc/lsc_predicate_stateless.cpp +++ 
b/sycl/test-e2e/ESIMD/lsc/lsc_predicate_stateless.cpp @@ -89,7 +89,7 @@ int main() { auto q = queue{esimd_test::ESIMDSelector, esimd_test::createExceptionHandler()}; auto device = q.get_device(); - std::cout << "Device name: " << device.get_info() + std::cout << "Device name: " << device.get_info() << std::endl; int error = testAccessor<8>(q); diff --git a/sycl/test-e2e/ESIMD/lsc/lsc_slm_atomic_smoke.cpp b/sycl/test-e2e/ESIMD/lsc/lsc_slm_atomic_smoke.cpp index 85b052eaf5f3a..93db1fef805ac 100644 --- a/sycl/test-e2e/ESIMD/lsc/lsc_slm_atomic_smoke.cpp +++ b/sycl/test-e2e/ESIMD/lsc/lsc_slm_atomic_smoke.cpp @@ -473,7 +473,8 @@ int main(void) { queue q(esimd_test::ESIMDSelector, esimd_test::createExceptionHandler()); auto dev = q.get_device(); - std::cout << "Running on " << dev.get_info() << "\n"; + std::cout << "Running on " << dev.get_info() + << "\n"; bool passed = true; #ifndef CMPXCHG_TEST diff --git a/sycl/test-e2e/ESIMD/private_memory/private_memory.cpp b/sycl/test-e2e/ESIMD/private_memory/private_memory.cpp index d98490c871eef..c3b9282d9bdf0 100644 --- a/sycl/test-e2e/ESIMD/private_memory/private_memory.cpp +++ b/sycl/test-e2e/ESIMD/private_memory/private_memory.cpp @@ -149,8 +149,8 @@ template bool tests(queue Q) { int main() { queue Q; - std::cout << "Running on " << Q.get_device().get_info() - << "\n"; + std::cout << "Running on " + << Q.get_device().get_info() << "\n"; bool Passed = true; Passed &= tests(Q); diff --git a/sycl/test-e2e/ESIMD/radix_sort.cpp b/sycl/test-e2e/ESIMD/radix_sort.cpp index ca62ff61a88b8..bfc3dfa53ae00 100644 --- a/sycl/test-e2e/ESIMD/radix_sort.cpp +++ b/sycl/test-e2e/ESIMD/radix_sort.cpp @@ -588,7 +588,8 @@ int main(int argc, char *argv[]) { property::queue::in_order()); auto dev = q.get_device(); - std::cout << "Running on " << dev.get_info() << "\n"; + std::cout << "Running on " << dev.get_info() + << "\n"; auto ctxt = q.get_context(); // allocate and initialized input diff --git a/sycl/test-e2e/ESIMD/slm_alloc.cpp 
b/sycl/test-e2e/ESIMD/slm_alloc.cpp index ef5d191ad56d7..e0c4f8c102151 100644 --- a/sycl/test-e2e/ESIMD/slm_alloc.cpp +++ b/sycl/test-e2e/ESIMD/slm_alloc.cpp @@ -92,7 +92,8 @@ __attribute__((noinline)) int main(void) { queue q; auto dev = q.get_device(); - std::cout << "Running on " << dev.get_info() << "\n"; + std::cout << "Running on " << dev.get_info() + << "\n"; std::cout << "force_inline=" << force_inline << "\n"; auto ctxt = q.get_context(); uint32_t size = SLM_TOTAL * NUM_WGS / ELEM_SIZE; diff --git a/sycl/test-e2e/ESIMD/slm_alloc_many_kernels_many_funcs.cpp b/sycl/test-e2e/ESIMD/slm_alloc_many_kernels_many_funcs.cpp index c58423d747ecd..a00d77e338ea7 100644 --- a/sycl/test-e2e/ESIMD/slm_alloc_many_kernels_many_funcs.cpp +++ b/sycl/test-e2e/ESIMD/slm_alloc_many_kernels_many_funcs.cpp @@ -111,7 +111,8 @@ INLINE_CTL void foo(int local_id, T *out, unsigned base) { int main(void) { queue q; auto dev = q.get_device(); - std::cout << "Running on " << dev.get_info() << "\n"; + std::cout << "Running on " << dev.get_info() + << "\n"; std::cout << "force_inline=" << force_inline << "\n"; auto ctxt = q.get_context(); diff --git a/sycl/test-e2e/ESIMD/slm_alloc_many_kernels_one_func.cpp b/sycl/test-e2e/ESIMD/slm_alloc_many_kernels_one_func.cpp index 818990eb2d13f..3a9dea2949c8e 100644 --- a/sycl/test-e2e/ESIMD/slm_alloc_many_kernels_one_func.cpp +++ b/sycl/test-e2e/ESIMD/slm_alloc_many_kernels_one_func.cpp @@ -59,7 +59,8 @@ __attribute__((noinline)) int main(void) { queue q; auto dev = q.get_device(); - std::cout << "Running on " << dev.get_info() << "\n"; + std::cout << "Running on " << dev.get_info() + << "\n"; std::cout << "force_inline=" << force_inline << "\n"; auto ctxt = q.get_context(); diff --git a/sycl/test-e2e/ESIMD/subb.cpp b/sycl/test-e2e/ESIMD/subb.cpp index d577be1662860..0174501b6cf02 100644 --- a/sycl/test-e2e/ESIMD/subb.cpp +++ b/sycl/test-e2e/ESIMD/subb.cpp @@ -140,7 +140,7 @@ template bool test(sycl::queue Q) { int main() { queue 
Q(esimd_test::ESIMDSelector, esimd_test::createExceptionHandler()); auto D = Q.get_device(); - std::cout << "Running on " << D.get_info() << "\n"; + std::cout << "Running on " << D.get_info() << "\n"; constexpr bool AIsVector = true; constexpr bool BIsVector = true; diff --git a/sycl/test-e2e/ESIMD/unified_memory_api/atomic_update_usm.cpp b/sycl/test-e2e/ESIMD/unified_memory_api/atomic_update_usm.cpp index 430b834efe8a7..52aa751c887e7 100644 --- a/sycl/test-e2e/ESIMD/unified_memory_api/atomic_update_usm.cpp +++ b/sycl/test-e2e/ESIMD/unified_memory_api/atomic_update_usm.cpp @@ -14,7 +14,8 @@ int main(void) { queue q(esimd_test::ESIMDSelector, esimd_test::createExceptionHandler()); auto dev = q.get_device(); - std::cout << "Running on " << dev.get_info() << "\n"; + std::cout << "Running on " << dev.get_info() + << "\n"; bool passed = true; diff --git a/sycl/test-e2e/ESIMD/unified_memory_api/atomic_update_usm_dg2_pvc.cpp b/sycl/test-e2e/ESIMD/unified_memory_api/atomic_update_usm_dg2_pvc.cpp index cc38560f5de57..281727c610184 100644 --- a/sycl/test-e2e/ESIMD/unified_memory_api/atomic_update_usm_dg2_pvc.cpp +++ b/sycl/test-e2e/ESIMD/unified_memory_api/atomic_update_usm_dg2_pvc.cpp @@ -17,7 +17,8 @@ int main(void) { queue q(esimd_test::ESIMDSelector, esimd_test::createExceptionHandler()); auto dev = q.get_device(); - std::cout << "Running on " << dev.get_info() << "\n"; + std::cout << "Running on " << dev.get_info() + << "\n"; bool passed = true; diff --git a/sycl/test-e2e/ESIMD/unified_memory_api/device_info_descriptors.cpp b/sycl/test-e2e/ESIMD/unified_memory_api/device_info_descriptors.cpp new file mode 100644 index 0000000000000..f7e85a9407693 --- /dev/null +++ b/sycl/test-e2e/ESIMD/unified_memory_api/device_info_descriptors.cpp @@ -0,0 +1,39 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out + +// Test has_2d_block_io_supported device descriptor for some known +// architectures. 
+ +#include +#include + +namespace syclex = sycl::ext::oneapi::experimental; + +int main() { + sycl::queue Q; + auto Arch = Q.get_device().get_info(); + bool Has2DBlockIOSupport = + Q.get_device() + .get_info< + sycl::ext::intel::esimd::info::device::has_2d_block_io_support>(); + if (Arch == syclex::architecture::intel_gpu_pvc) { + if (!Has2DBlockIOSupport) { + std::cerr << "Error: has_2d_block_io_support is expected to be true for " + "PVC architecture" + << std::endl; + return 1; + } + } + if (Arch == syclex::architecture::intel_gpu_tgllp || + Arch == syclex::architecture::intel_gpu_dg2_g10 || + Arch == syclex::architecture::intel_gpu_dg2_g11 || + Arch == syclex::architecture::intel_gpu_dg2_g12) { + if (Has2DBlockIOSupport) { + std::cerr << "Error: has_2d_block_io_support is expected to be false for " + "Tiger Lake and DG2" + << std::endl; + return 1; + } + } + return 0; +} diff --git a/sycl/test-e2e/ESIMD/vadd_raw_send_gen12.cpp b/sycl/test-e2e/ESIMD/vadd_raw_send_gen12.cpp index 7e5d8fff00e8e..d46907dae5bdc 100644 --- a/sycl/test-e2e/ESIMD/vadd_raw_send_gen12.cpp +++ b/sycl/test-e2e/ESIMD/vadd_raw_send_gen12.cpp @@ -35,7 +35,8 @@ int main(void) { queue q(esimd_test::ESIMDSelector, esimd_test::createExceptionHandler()); auto dev = q.get_device(); - std::cout << "Running on " << dev.get_info() << "\n"; + std::cout << "Running on " << dev.get_info() + << "\n"; int *A = malloc_shared(Size, q); int *B = malloc_shared(Size, q); diff --git a/sycl/test-e2e/ESIMD/wait.cpp b/sycl/test-e2e/ESIMD/wait.cpp index 0f37ab61caae0..df0c2dfb955c9 100644 --- a/sycl/test-e2e/ESIMD/wait.cpp +++ b/sycl/test-e2e/ESIMD/wait.cpp @@ -57,7 +57,8 @@ bool test(sycl::queue Q, int IArg = 128) { int main() { queue Q(esimd_test::ESIMDSelector, esimd_test::createExceptionHandler()); auto Dev = Q.get_device(); - std::cout << "Running on " << Dev.get_info() << std::endl; + std::cout << "Running on " << Dev.get_info() + << std::endl; bool Passed = true; Passed &= test(Q); diff --git 
a/sycl/test-e2e/Graph/Explicit/free_function_kernels.cpp b/sycl/test-e2e/Graph/Explicit/free_function_kernels.cpp new file mode 100644 index 0000000000000..3784ebbd800a6 --- /dev/null +++ b/sycl/test-e2e/Graph/Explicit/free_function_kernels.cpp @@ -0,0 +1,13 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// +// XFAIL: cuda +// XFAIL-TRACKER: https://github.com/intel/llvm/issues/16004 + +#define GRAPH_E2E_EXPLICIT + +#include "../Inputs/free_function_kernels.cpp" diff --git a/sycl/test-e2e/Graph/Explicit/interop-level-zero-get-native-mem.cpp b/sycl/test-e2e/Graph/Explicit/interop-level-zero-get-native-mem.cpp index 1d8fcdf963ab5..adf4b8bc06b2b 100644 --- a/sycl/test-e2e/Graph/Explicit/interop-level-zero-get-native-mem.cpp +++ b/sycl/test-e2e/Graph/Explicit/interop-level-zero-get-native-mem.cpp @@ -1,5 +1,5 @@ // REQUIRES: level_zero, level_zero_dev_kit -// L0 plugin incorrectly reports memory leaks because it doesn't take into +// L0 adapter incorrectly reports memory leaks because it doesn't take into // account direct calls to L0 API. 
// UNSUPPORTED: ze_debug // RUN: %{build} %level_zero_options -o %t.out diff --git a/sycl/test-e2e/Graph/Explicit/interop-level-zero-launch-kernel.cpp b/sycl/test-e2e/Graph/Explicit/interop-level-zero-launch-kernel.cpp index b283697720201..ce1aac6c121dd 100644 --- a/sycl/test-e2e/Graph/Explicit/interop-level-zero-launch-kernel.cpp +++ b/sycl/test-e2e/Graph/Explicit/interop-level-zero-launch-kernel.cpp @@ -1,5 +1,5 @@ // REQUIRES: level_zero, level_zero_dev_kit -// L0 plugin incorrectly reports memory leaks because it doesn't take into +// L0 adapter incorrectly reports memory leaks because it doesn't take into // account direct calls to the L0 API. // UNSUPPORTED: ze_debug // RUN: %{build} %level_zero_options -o %t.out diff --git a/sycl/test-e2e/Graph/Inputs/Kernels/dyn_cgf_accessor.spv b/sycl/test-e2e/Graph/Inputs/Kernels/dyn_cgf_accessor.spv new file mode 100644 index 0000000000000..e683e8f9c5364 Binary files /dev/null and b/sycl/test-e2e/Graph/Inputs/Kernels/dyn_cgf_accessor.spv differ diff --git a/sycl/test-e2e/Graph/Inputs/free_function_kernels.cpp b/sycl/test-e2e/Graph/Inputs/free_function_kernels.cpp new file mode 100644 index 0000000000000..5a3e9e3304d01 --- /dev/null +++ b/sycl/test-e2e/Graph/Inputs/free_function_kernels.cpp @@ -0,0 +1,45 @@ +// Tests compatibility with free function kernels extension + +#include "../graph_common.hpp" + +SYCL_EXT_ONEAPI_FUNCTION_PROPERTY((exp_ext::single_task_kernel)) +void ff_0(int *Ptr) { + for (size_t i{0}; i < Size; ++i) { + Ptr[i] = i; + } +} + +int main() { + queue Queue{}; + context Ctxt{Queue.get_context()}; + + exp_ext::command_graph Graph{Ctxt, Queue.get_device()}; + + int *PtrA = malloc_device(Size, Queue); + + std::vector HostDataA(Size); + + Queue.memset(PtrA, 0, Size * sizeof(int)).wait(); + +#ifndef __SYCL_DEVICE_ONLY__ + kernel_bundle Bundle = get_kernel_bundle(Ctxt); + kernel_id Kernel_id = exp_ext::get_kernel_id(); + kernel Kernel = Bundle.get_kernel(Kernel_id); + auto KernelNode = Graph.add([&](handler 
&cgh) { + cgh.set_arg(0, PtrA); + cgh.single_task(Kernel); + }); + + auto ExecGraph = Graph.finalize(); + + Queue.ext_oneapi_graph(ExecGraph).wait(); + + Queue.copy(PtrA, HostDataA.data(), Size).wait(); + for (size_t i = 0; i < Size; i++) { + assert(HostDataA[i] == i); + } +#endif + sycl::free(PtrA, Queue); + + return 0; +} diff --git a/sycl/test-e2e/Graph/Inputs/interop-level-zero-get-native-mem.cpp b/sycl/test-e2e/Graph/Inputs/interop-level-zero-get-native-mem.cpp index 56ce712b8d410..59ca0390a80ab 100644 --- a/sycl/test-e2e/Graph/Inputs/interop-level-zero-get-native-mem.cpp +++ b/sycl/test-e2e/Graph/Inputs/interop-level-zero-get-native-mem.cpp @@ -1,6 +1,6 @@ // Tests interop with host_task in graph // This test was taken from -// `sycl/test-e2e/Plugin/interop-level-zero-get-native-mem.cpp` This test has +// `sycl/test-e2e/Adapters/interop-level-zero-get-native-mem.cpp` This test has // been simplified to only work with signle device. #include "../graph_common.hpp" diff --git a/sycl/test-e2e/Graph/RecordReplay/free_function_kernels.cpp b/sycl/test-e2e/Graph/RecordReplay/free_function_kernels.cpp new file mode 100644 index 0000000000000..ad003e575e044 --- /dev/null +++ b/sycl/test-e2e/Graph/RecordReplay/free_function_kernels.cpp @@ -0,0 +1,13 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// +// XFAIL: cuda +// XFAIL-TRACKER: https://github.com/intel/llvm/issues/16004 + +#define GRAPH_E2E_RECORD_REPLAY + +#include "../Inputs/free_function_kernels.cpp" diff --git
a/sycl/test-e2e/Graph/RecordReplay/interop-level-zero-get-native-mem.cpp b/sycl/test-e2e/Graph/RecordReplay/interop-level-zero-get-native-mem.cpp index 551e463f6d887..a41645599b66e 100644 --- a/sycl/test-e2e/Graph/RecordReplay/interop-level-zero-get-native-mem.cpp +++ b/sycl/test-e2e/Graph/RecordReplay/interop-level-zero-get-native-mem.cpp @@ -1,5 +1,5 @@ // REQUIRES: level_zero, level_zero_dev_kit -// L0 plugin incorrectly reports memory leaks because it doesn't take into +// L0 adapter incorrectly reports memory leaks because it doesn't take into // account direct calls to L0 API. // UNSUPPORTED: ze_debug // RUN: %{build} %level_zero_options -o %t.out diff --git a/sycl/test-e2e/Graph/RecordReplay/interop-level-zero-launch-kernel.cpp b/sycl/test-e2e/Graph/RecordReplay/interop-level-zero-launch-kernel.cpp index 894c35e995152..77bd7ab9fa1b3 100644 --- a/sycl/test-e2e/Graph/RecordReplay/interop-level-zero-launch-kernel.cpp +++ b/sycl/test-e2e/Graph/RecordReplay/interop-level-zero-launch-kernel.cpp @@ -1,5 +1,5 @@ // REQUIRES: level_zero, level_zero_dev_kit -// L0 plugin incorrectly reports memory leaks because it doesn't take into +// L0 adapter incorrectly reports memory leaks because it doesn't take into // account direct calls to the L0 API. 
// UNSUPPORTED: ze_debug // RUN: %{build} %level_zero_options -o %t.out diff --git a/sycl/test-e2e/Graph/Update/FreeFunctionKernels/free_function_kernels.hpp b/sycl/test-e2e/Graph/Update/FreeFunctionKernels/free_function_kernels.hpp new file mode 100644 index 0000000000000..d319d819abdad --- /dev/null +++ b/sycl/test-e2e/Graph/Update/FreeFunctionKernels/free_function_kernels.hpp @@ -0,0 +1,57 @@ +#pragma once + +#include "../../graph_common.hpp" +#include "sycl/ext/oneapi/kernel_properties/properties.hpp" +#include "sycl/kernel_bundle.hpp" +#include + +SYCL_EXT_ONEAPI_FUNCTION_PROPERTY((exp_ext::single_task_kernel)) +void ff_0(int *Ptr) { + for (size_t i{0}; i < Size; ++i) { + Ptr[i] = i; + } +} + +SYCL_EXT_ONEAPI_FUNCTION_PROPERTY((exp_ext::single_task_kernel)) +void ff_1(int *Ptr) { + for (size_t i{0}; i < Size; ++i) { + Ptr[i] += i; + } +} + +SYCL_EXT_ONEAPI_FUNCTION_PROPERTY((exp_ext::single_task_kernel)) +void ff_2(int *Ptr, size_t Size, size_t NumKernelLoops) { + for (size_t j{0}; j < NumKernelLoops; j++) { + for (size_t i{0}; i < Size; i++) { + Ptr[i] += i; + } + } +} + +SYCL_EXT_ONEAPI_FUNCTION_PROPERTY((exp_ext::nd_range_kernel<3>)) +void ff_3(int *Ptr) { + size_t GlobalID = + ext::oneapi::this_work_item::get_nd_item<3>().get_global_linear_id(); + Ptr[GlobalID] = GlobalID; +} + +SYCL_EXT_ONEAPI_FUNCTION_PROPERTY((exp_ext::nd_range_kernel<3>)) +void ff_4(int *Ptr) { + size_t GlobalID = + ext::oneapi::this_work_item::get_nd_item<3>().get_global_linear_id(); + Ptr[GlobalID] *= 2; +} + +SYCL_EXT_ONEAPI_FUNCTION_PROPERTY((exp_ext::nd_range_kernel<1>)) +void ff_5(int *PtrA, int *PtrB, int *PtrC) { + size_t GlobalID = + ext::oneapi::this_work_item::get_nd_item<1>().get_global_id(); + PtrC[GlobalID] += PtrA[GlobalID] * PtrB[GlobalID]; +} + +SYCL_EXT_ONEAPI_FUNCTION_PROPERTY((exp_ext::single_task_kernel)) +void ff_6(int *Ptr, int ScalarValue) { + for (size_t i{0}; i < Size; ++i) { + Ptr[i] = ScalarValue; + } +} diff --git 
a/sycl/test-e2e/Graph/Update/FreeFunctionKernels/update_before_finalize.cpp b/sycl/test-e2e/Graph/Update/FreeFunctionKernels/update_before_finalize.cpp new file mode 100644 index 0000000000000..5d0ac58cf0de9 --- /dev/null +++ b/sycl/test-e2e/Graph/Update/FreeFunctionKernels/update_before_finalize.cpp @@ -0,0 +1,60 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// +// XFAIL: cuda +// XFAIL-TRACKER: https://github.com/intel/llvm/issues/16004 + +// Tests updating a graph node before finalization + +#include "../../graph_common.hpp" +#include "free_function_kernels.hpp" + +int main() { + queue Queue{}; + context Ctxt{Queue.get_context()}; + + exp_ext::command_graph Graph{Ctxt, Queue.get_device()}; + + int *PtrA = malloc_device(Size, Queue); + int *PtrB = malloc_device(Size, Queue); + + std::vector HostDataA(Size); + std::vector HostDataB(Size); + + Queue.memset(PtrA, 0, Size * sizeof(int)).wait(); + Queue.memset(PtrB, 0, Size * sizeof(int)).wait(); + + exp_ext::dynamic_parameter InputParam(Graph, PtrA); + +#ifndef __SYCL_DEVICE_ONLY__ + kernel_bundle Bundle = get_kernel_bundle(Ctxt); + kernel_id Kernel_id = exp_ext::get_kernel_id(); + kernel Kernel = Bundle.get_kernel(Kernel_id); + auto KernelNode = Graph.add([&](handler &cgh) { + cgh.set_arg(0, InputParam); + cgh.single_task(Kernel); + }); + // Swap PtrB to be the input + InputParam.update(PtrB); + + auto ExecGraph = Graph.finalize(exp_ext::property::graph::updatable{}); + + // Only PtrB should be filled with values + Queue.ext_oneapi_graph(ExecGraph).wait(); + + 
Queue.copy(PtrA, HostDataA.data(), Size).wait(); + Queue.copy(PtrB, HostDataB.data(), Size).wait(); + for (size_t i = 0; i < Size; i++) { + assert(HostDataA[i] == 0); + assert(HostDataB[i] == i); + } +#endif + sycl::free(PtrA, Queue); + sycl::free(PtrB, Queue); + + return 0; +} diff --git a/sycl/test-e2e/Graph/Update/FreeFunctionKernels/update_with_indices_multiple_exec_graphs.cpp b/sycl/test-e2e/Graph/Update/FreeFunctionKernels/update_with_indices_multiple_exec_graphs.cpp new file mode 100644 index 0000000000000..269a5b5373638 --- /dev/null +++ b/sycl/test-e2e/Graph/Update/FreeFunctionKernels/update_with_indices_multiple_exec_graphs.cpp @@ -0,0 +1,77 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// +// XFAIL: cuda +// XFAIL-TRACKER: https://github.com/intel/llvm/issues/16004 + +// Tests creating multiple executable graphs from the same modifiable graph and +// only updating one of them. 
+ +#include "../../graph_common.hpp" +#include "free_function_kernels.hpp" + +int main() { + queue Queue{}; + context Ctxt{Queue.get_context()}; + + exp_ext::command_graph Graph{Ctxt, Queue.get_device()}; + + int *PtrA = malloc_device(Size, Queue); + int *PtrB = malloc_device(Size, Queue); + + std::vector HostDataA(Size); + std::vector HostDataB(Size); + + Queue.memset(PtrA, 0, Size * sizeof(int)).wait(); + Queue.memset(PtrB, 0, Size * sizeof(int)).wait(); + + exp_ext::dynamic_parameter InputParam(Graph, PtrA); + +#ifndef __SYCL_DEVICE_ONLY__ + kernel_bundle Bundle = get_kernel_bundle(Ctxt); + kernel_id Kernel_id = exp_ext::get_kernel_id(); + kernel Kernel = Bundle.get_kernel(Kernel_id); + auto KernelNode = Graph.add([&](handler &cgh) { + cgh.set_arg(0, InputParam); + cgh.single_task(Kernel); + }); + + auto ExecGraph = Graph.finalize(exp_ext::property::graph::updatable{}); + auto ExecGraph2 = Graph.finalize(exp_ext::property::graph::updatable{}); + + // PtrA values should be modified twice + Queue.ext_oneapi_graph(ExecGraph).wait(); + Queue.ext_oneapi_graph(ExecGraph2).wait(); + + Queue.copy(PtrA, HostDataA.data(), Size).wait(); + Queue.copy(PtrB, HostDataB.data(), Size).wait(); + for (size_t i = 0; i < Size; i++) { + assert(HostDataA[i] == i * 2); + assert(HostDataB[i] == 0); + } + + // Swap PtrB to be the input + InputParam.update(PtrB); + // Only update ExecGraph, which should now modify PtrB while ExecGraph2 + // modifies PtrA still + ExecGraph.update(KernelNode); + Queue.ext_oneapi_graph(ExecGraph).wait(); + Queue.ext_oneapi_graph(ExecGraph2).wait(); + + Queue.copy(PtrA, HostDataA.data(), Size).wait(); + Queue.copy(PtrB, HostDataB.data(), Size).wait(); + for (size_t i = 0; i < Size; i++) { + // A should have been modified 3 times by now, B only once + assert(HostDataA[i] == i * 3); + assert(HostDataB[i] == i); + } +#endif + sycl::free(PtrA, Queue); + sycl::free(PtrB, Queue); + + return 0; +} diff --git 
a/sycl/test-e2e/Graph/Update/FreeFunctionKernels/update_with_indices_ordering.cpp b/sycl/test-e2e/Graph/Update/FreeFunctionKernels/update_with_indices_ordering.cpp new file mode 100644 index 0000000000000..4d6aa6445cd0e --- /dev/null +++ b/sycl/test-e2e/Graph/Update/FreeFunctionKernels/update_with_indices_ordering.cpp @@ -0,0 +1,80 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// +// XFAIL: cuda +// XFAIL-TRACKER: https://github.com/intel/llvm/issues/16004 + +// Tests that updating a graph is ordered with respect to previous executions of +// the graph which may be in flight. 
+ +#include "../../graph_common.hpp" +#include "free_function_kernels.hpp" + +int main() { + queue Queue{}; + context Ctxt{Queue.get_context()}; + + // Use a large N to try and make the kernel slow + const size_t N = 1 << 16; + // Loop inside kernel to make even slower (too large N runs out of memory) + const size_t NumKernelLoops = 4; + const size_t NumSubmitLoops = 8; + + exp_ext::command_graph Graph{Ctxt, Queue.get_device()}; + + int *PtrA = malloc_device(N, Queue); + int *PtrB = malloc_device(N, Queue); + + std::vector HostDataA(N); + std::vector HostDataB(N); + + Queue.memset(PtrA, 0, N * sizeof(int)).wait(); + Queue.memset(PtrB, 0, N * sizeof(int)).wait(); + + exp_ext::dynamic_parameter InputParam(Graph, PtrA); + +#ifndef __SYCL_DEVICE_ONLY__ + kernel_bundle Bundle = get_kernel_bundle(Ctxt); + kernel_id Kernel_id = exp_ext::get_kernel_id(); + kernel Kernel = Bundle.get_kernel(Kernel_id); + auto KernelNode = Graph.add([&](handler &cgh) { + cgh.set_arg(0, InputParam); + cgh.set_arg(1, N); + cgh.set_arg(2, NumKernelLoops); + cgh.single_task(Kernel); + }); + + auto ExecGraph = Graph.finalize(exp_ext::property::graph::updatable{}); + + // Submit a bunch of graphs without waiting + for (size_t i = 0; i < NumSubmitLoops; i++) { + Queue.ext_oneapi_graph(ExecGraph); + } + + // Swap PtrB to be the input + InputParam.update(PtrB); + + ExecGraph.update(KernelNode); + + // Submit another set of graphs then wait on all submissions + for (size_t i = 0; i < NumSubmitLoops; i++) { + Queue.ext_oneapi_graph(ExecGraph); + } + Queue.wait_and_throw(); + + Queue.copy(PtrA, HostDataA.data(), N).wait(); + Queue.copy(PtrB, HostDataB.data(), N).wait(); + for (size_t i = 0; i < N; i++) { + assert(HostDataA[i] == i * NumKernelLoops * NumSubmitLoops); + assert(HostDataB[i] == i * NumKernelLoops * NumSubmitLoops); + } +#endif + sycl::free(PtrA, Queue); + sycl::free(PtrB, Queue); + + return 0; +} diff --git a/sycl/test-e2e/Graph/Update/FreeFunctionKernels/update_with_indices_ptr.cpp 
b/sycl/test-e2e/Graph/Update/FreeFunctionKernels/update_with_indices_ptr.cpp new file mode 100644 index 0000000000000..64fc1408cee4a --- /dev/null +++ b/sycl/test-e2e/Graph/Update/FreeFunctionKernels/update_with_indices_ptr.cpp @@ -0,0 +1,70 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// +// XFAIL: cuda +// XFAIL-TRACKER: https://github.com/intel/llvm/issues/16004 + +// Tests updating a graph node using index-based explicit update + +#include "../../graph_common.hpp" +#include "free_function_kernels.hpp" + +int main() { + queue Queue{}; + context Ctxt{Queue.get_context()}; + + exp_ext::command_graph Graph{Ctxt, Queue.get_device()}; + + int *PtrA = malloc_device(Size, Queue); + int *PtrB = malloc_device(Size, Queue); + + std::vector HostDataA(Size); + std::vector HostDataB(Size); + + Queue.memset(PtrA, 0, Size * sizeof(int)).wait(); + Queue.memset(PtrB, 0, Size * sizeof(int)).wait(); + + exp_ext::dynamic_parameter InputParam(Graph, PtrA); + +#ifndef __SYCL_DEVICE_ONLY__ + kernel_bundle Bundle = get_kernel_bundle(Ctxt); + kernel_id Kernel_id = exp_ext::get_kernel_id(); + kernel Kernel = Bundle.get_kernel(Kernel_id); + auto KernelNode = Graph.add([&](handler &cgh) { + cgh.set_arg(0, InputParam); + cgh.single_task(Kernel); + }); + + auto ExecGraph = Graph.finalize(exp_ext::property::graph::updatable{}); + + // PtrA should be filled with values + Queue.ext_oneapi_graph(ExecGraph).wait(); + + Queue.copy(PtrA, HostDataA.data(), Size).wait(); + Queue.copy(PtrB, HostDataB.data(), Size).wait(); + for (size_t i = 0; i < 
Size; i++) { + assert(HostDataA[i] == i); + assert(HostDataB[i] == 0); + } + + // Swap PtrB to be the input + InputParam.update(PtrB); + ExecGraph.update(KernelNode); + Queue.ext_oneapi_graph(ExecGraph).wait(); + + Queue.copy(PtrA, HostDataA.data(), Size).wait(); + Queue.copy(PtrB, HostDataB.data(), Size).wait(); + for (size_t i = 0; i < Size; i++) { + assert(HostDataA[i] == i); + assert(HostDataB[i] == i); + } +#endif + sycl::free(PtrA, Queue); + sycl::free(PtrB, Queue); + + return 0; +} diff --git a/sycl/test-e2e/Graph/Update/FreeFunctionKernels/update_with_indices_ptr_3D.cpp b/sycl/test-e2e/Graph/Update/FreeFunctionKernels/update_with_indices_ptr_3D.cpp new file mode 100644 index 0000000000000..644478f9ddfb4 --- /dev/null +++ b/sycl/test-e2e/Graph/Update/FreeFunctionKernels/update_with_indices_ptr_3D.cpp @@ -0,0 +1,87 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// +// XFAIL: cuda +// XFAIL-TRACKER: https://github.com/intel/llvm/issues/16004 + +// Tests updating a 3D ND-Range graph kernel node using index-based explicit +// update + +#include "../../graph_common.hpp" +#include "free_function_kernels.hpp" + +int main() { + queue Queue{}; + context Ctxt{Queue.get_context()}; + + const range<3> GlobalWorkSize(1, 2, 2); + const range<3> LocalWorkSize(1, 2, 2); + const size_t N = GlobalWorkSize[0] * GlobalWorkSize[1] * GlobalWorkSize[2]; + + exp_ext::command_graph Graph{Ctxt, Queue.get_device()}; + + int *PtrA = malloc_device(N, Queue); + int *PtrB = malloc_device(N, Queue); + + std::vector HostDataA(N); + 
std::vector HostDataB(N); + + Queue.memset(PtrA, 0, N * sizeof(int)).wait(); + Queue.memset(PtrB, 0, N * sizeof(int)).wait(); + + exp_ext::dynamic_parameter DynParam(Graph, PtrA); + + nd_range<3> NDRange{GlobalWorkSize, LocalWorkSize}; + +#ifndef __SYCL_DEVICE_ONLY__ + kernel_bundle Bundle = get_kernel_bundle(Ctxt); + kernel_id Kernel_id_A = exp_ext::get_kernel_id(); + kernel Kernel_A = Bundle.get_kernel(Kernel_id_A); + auto NodeA = Graph.add([&](handler &cgh) { + cgh.set_arg(0, DynParam); + cgh.parallel_for(NDRange, Kernel_A); + }); + + kernel_id Kernel_id_B = exp_ext::get_kernel_id(); + kernel Kernel_B = Bundle.get_kernel(Kernel_id_B); + auto NodeB = Graph.add( + [&](handler &cgh) { + cgh.set_arg(0, DynParam); + cgh.parallel_for(NDRange, Kernel_B); + }, + exp_ext::property::node::depends_on{NodeA}); + + auto ExecGraph = Graph.finalize(exp_ext::property::graph::updatable{}); + + // PtrA should be filled with values + Queue.ext_oneapi_graph(ExecGraph).wait(); + + Queue.copy(PtrA, HostDataA.data(), N).wait(); + Queue.copy(PtrB, HostDataB.data(), N).wait(); + for (size_t i = 0; i < N; i++) { + assert(HostDataA[i] == (i * 2)); + assert(HostDataB[i] == 0); + } + + // Swap PtrB to be the input/output + DynParam.update(PtrB); + ExecGraph.update({NodeA, NodeB}); + Queue.ext_oneapi_graph(ExecGraph).wait(); + + Queue.copy(PtrA, HostDataA.data(), N).wait(); + Queue.copy(PtrB, HostDataB.data(), N).wait(); + for (size_t i = 0; i < N; i++) { + const size_t Ref = i * 2; + assert(HostDataA[i] == Ref); + assert(HostDataB[i] == Ref); + } +#endif + sycl::free(PtrA, Queue); + sycl::free(PtrB, Queue); + + return 0; +} diff --git a/sycl/test-e2e/Graph/Update/FreeFunctionKernels/update_with_indices_ptr_double_update.cpp b/sycl/test-e2e/Graph/Update/FreeFunctionKernels/update_with_indices_ptr_double_update.cpp new file mode 100644 index 0000000000000..1c66a03eac4d3 --- /dev/null +++ b/sycl/test-e2e/Graph/Update/FreeFunctionKernels/update_with_indices_ptr_double_update.cpp @@ -0,0 +1,81 
@@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// +// XFAIL: cuda +// XFAIL-TRACKER: https://github.com/intel/llvm/issues/16004 + +// Tests updating a graph node using index-based explicit update + +#include "../../graph_common.hpp" +#include "free_function_kernels.hpp" + +int main() { + queue Queue{}; + context Ctxt{Queue.get_context()}; + + exp_ext::command_graph Graph{Ctxt, Queue.get_device()}; + + int *PtrA = malloc_device(Size, Queue); + int *PtrB = malloc_device(Size, Queue); + int *PtrUnused = malloc_device(Size, Queue); + + std::vector HostDataA(Size); + std::vector HostDataB(Size); + std::vector HostDataUnused(Size); + + Queue.memset(PtrA, 0, Size * sizeof(int)).wait(); + Queue.memset(PtrB, 0, Size * sizeof(int)).wait(); + Queue.memset(PtrUnused, 0, Size * sizeof(int)).wait(); + + exp_ext::dynamic_parameter InputParam(Graph, PtrA); + +#ifndef __SYCL_DEVICE_ONLY__ + kernel_bundle Bundle = get_kernel_bundle(Ctxt); + kernel_id Kernel_id = exp_ext::get_kernel_id(); + kernel Kernel = Bundle.get_kernel(Kernel_id); + auto KernelNode = Graph.add([&](handler &cgh) { + cgh.set_arg(0, InputParam); + cgh.single_task(Kernel); + }); + + auto ExecGraph = Graph.finalize(exp_ext::property::graph::updatable{}); + + // PtrA should be filled with values + Queue.ext_oneapi_graph(ExecGraph).wait(); + + Queue.copy(PtrA, HostDataA.data(), Size).wait(); + Queue.copy(PtrB, HostDataB.data(), Size).wait(); + Queue.copy(PtrUnused, HostDataUnused.data(), Size).wait(); + for (size_t i = 0; i < Size; i++) { + assert(HostDataA[i] == i); + 
assert(HostDataB[i] == 0); + assert(HostDataUnused[i] == 0); + } + + // Swap PtrUnused to be the input, then swap to PtrB without executing + InputParam.update(PtrUnused); + InputParam.update(PtrB); + + ExecGraph.update(KernelNode); + Queue.ext_oneapi_graph(ExecGraph).wait(); + + Queue.copy(PtrA, HostDataA.data(), Size).wait(); + Queue.copy(PtrB, HostDataB.data(), Size).wait(); + Queue.copy(PtrUnused, HostDataUnused.data(), Size).wait(); + for (size_t i = 0; i < Size; i++) { + assert(HostDataA[i] == i); + assert(HostDataB[i] == i); + // Check that PtrUnused was never actually used in a kernel + assert(HostDataUnused[i] == 0); + } +#endif + sycl::free(PtrA, Queue); + sycl::free(PtrB, Queue); + sycl::free(PtrUnused, Queue); + + return 0; +} diff --git a/sycl/test-e2e/Graph/Update/FreeFunctionKernels/update_with_indices_ptr_multiple_nodes.cpp b/sycl/test-e2e/Graph/Update/FreeFunctionKernels/update_with_indices_ptr_multiple_nodes.cpp new file mode 100644 index 0000000000000..1143b1edf855a --- /dev/null +++ b/sycl/test-e2e/Graph/Update/FreeFunctionKernels/update_with_indices_ptr_multiple_nodes.cpp @@ -0,0 +1,80 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// +// XFAIL: cuda +// XFAIL-TRACKER: https://github.com/intel/llvm/issues/16004 + +// Tests updating a single dynamic parameter which is registered with multiple +// graph nodes + +#include "../../graph_common.hpp" +#include "free_function_kernels.hpp" + +int main() { + queue Queue{}; + context Ctxt{Queue.get_context()}; + + exp_ext::command_graph 
Graph{Queue.get_context(), Queue.get_device()}; + + int *PtrA = malloc_device(Size, Queue); + int *PtrB = malloc_device(Size, Queue); + + std::vector HostDataA(Size); + std::vector HostDataB(Size); + + Queue.memset(PtrA, 0, Size * sizeof(int)).wait(); + Queue.memset(PtrB, 0, Size * sizeof(int)).wait(); + + exp_ext::dynamic_parameter InputParam(Graph, PtrA); + +#ifndef __SYCL_DEVICE_ONLY__ + kernel_bundle Bundle = get_kernel_bundle(Ctxt); + kernel_id Kernel_id_A = exp_ext::get_kernel_id(); + kernel Kernel_A = Bundle.get_kernel(Kernel_id_A); + auto KernelNodeA = Graph.add([&](handler &cgh) { + cgh.set_arg(0, InputParam); + cgh.single_task(Kernel_A); + }); + + kernel_id Kernel_id_B = exp_ext::get_kernel_id(); + kernel Kernel_B = Bundle.get_kernel(Kernel_id_B); + auto KernelNodeB = Graph.add( + [&](handler &cgh) { + cgh.set_arg(0, InputParam); + cgh.single_task(Kernel_B); + }, + exp_ext::property::node::depends_on{KernelNodeA}); + + auto ExecGraph = Graph.finalize(exp_ext::property::graph::updatable{}); + + // PtrA should be filled with values + Queue.ext_oneapi_graph(ExecGraph).wait(); + + Queue.copy(PtrA, HostDataA.data(), Size).wait(); + Queue.copy(PtrB, HostDataB.data(), Size).wait(); + for (size_t i = 0; i < Size; i++) { + assert(HostDataA[i] == i * 2); + assert(HostDataB[i] == 0); + } + + // Swap PtrB to be the input + InputParam.update(PtrB); + ExecGraph.update({KernelNodeA, KernelNodeB}); + Queue.ext_oneapi_graph(ExecGraph).wait(); + + Queue.copy(PtrA, HostDataA.data(), Size).wait(); + Queue.copy(PtrB, HostDataB.data(), Size).wait(); + for (size_t i = 0; i < Size; i++) { + assert(HostDataA[i] == i * 2); + assert(HostDataB[i] == i * 2); + } +#endif + sycl::free(PtrA, Queue); + sycl::free(PtrB, Queue); + + return 0; +} diff --git a/sycl/test-e2e/Graph/Update/FreeFunctionKernels/update_with_indices_ptr_multiple_params.cpp b/sycl/test-e2e/Graph/Update/FreeFunctionKernels/update_with_indices_ptr_multiple_params.cpp new file mode 100644 index 
0000000000000..76c66a27838e1 --- /dev/null +++ b/sycl/test-e2e/Graph/Update/FreeFunctionKernels/update_with_indices_ptr_multiple_params.cpp @@ -0,0 +1,88 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// +// XFAIL: cuda +// XFAIL-TRACKER: https://github.com/intel/llvm/issues/16004 + +// Tests updating multiple parameters to a single graph node using index-based +// explicit update + +#include "../../graph_common.hpp" +#include "free_function_kernels.hpp" + +int main() { + queue Queue{}; + context Ctxt{Queue.get_context()}; + + exp_ext::command_graph Graph{Ctxt, Queue.get_device()}; + + int *PtrA = malloc_device(Size, Queue); + int *PtrB = malloc_device(Size, Queue); + int *PtrC = malloc_device(Size, Queue); + + std::vector HostDataA(Size); + std::vector HostDataB(Size); + std::vector HostDataC(Size); + std::vector OutData(Size); + + std::iota(HostDataA.begin(), HostDataA.end(), 10); + std::iota(HostDataB.begin(), HostDataB.end(), 100); + + Queue.memcpy(PtrA, HostDataA.data(), Size * sizeof(int)).wait(); + Queue.memcpy(PtrB, HostDataB.data(), Size * sizeof(int)).wait(); + Queue.memset(PtrC, 0, Size * sizeof(int)).wait(); + + exp_ext::dynamic_parameter ParamA(Graph, PtrA); + exp_ext::dynamic_parameter ParamB(Graph, PtrB); + exp_ext::dynamic_parameter ParamOut(Graph, PtrC); + + nd_range<1> NDRange{Size, 32}; + +#ifndef __SYCL_DEVICE_ONLY__ + kernel_bundle Bundle = get_kernel_bundle(Ctxt); + kernel_id Kernel_id = exp_ext::get_kernel_id(); + kernel Kernel = Bundle.get_kernel(Kernel_id); + auto KernelNode =
Graph.add([&](handler &cgh) { + cgh.set_arg(0, ParamA); + cgh.set_arg(1, ParamB); + cgh.set_arg(2, ParamOut); + cgh.parallel_for(NDRange, Kernel); + }); + + auto ExecGraph = Graph.finalize(exp_ext::property::graph::updatable{}); + + // PtrC should be filled with values + Queue.ext_oneapi_graph(ExecGraph).wait(); + + // Copy to output data to preserve original data for verifying += op + Queue.copy(PtrC, OutData.data(), Size).wait(); + for (size_t i = 0; i < Size; i++) { + assert(OutData[i] == HostDataC[i] + (HostDataA[i] * HostDataB[i])); + } + + // Update C's host data + HostDataC = OutData; + + // Swap PtrB and PtrC: PtrB becomes the output, PtrC an input + ParamOut.update(PtrB); + ParamB.update(PtrC); + + ExecGraph.update(KernelNode); + Queue.ext_oneapi_graph(ExecGraph).wait(); + + // Copy to output data to preserve original data for verifying += op + Queue.copy(PtrB, OutData.data(), Size).wait(); + for (size_t i = 0; i < Size; i++) { + assert(OutData[i] == HostDataB[i] + (HostDataA[i] * HostDataC[i])); + } +#endif + sycl::free(PtrA, Queue); + sycl::free(PtrB, Queue); + sycl::free(PtrC, Queue); + + return 0; +} diff --git a/sycl/test-e2e/Graph/Update/FreeFunctionKernels/update_with_indices_ptr_subgraph.cpp b/sycl/test-e2e/Graph/Update/FreeFunctionKernels/update_with_indices_ptr_subgraph.cpp new file mode 100644 index 0000000000000..d8b4083fed1e7 --- /dev/null +++ b/sycl/test-e2e/Graph/Update/FreeFunctionKernels/update_with_indices_ptr_subgraph.cpp @@ -0,0 +1,79 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// +// XFAIL: cuda +// XFAIL-TRACKER:
https://github.com/intel/llvm/issues/16004 + +// Tests that updating a graph node in an executable graph that was used as a +// subgraph node in another executable graph is not reflected in the graph +// containing the subgraph node. + +#include "../../graph_common.hpp" +#include "free_function_kernels.hpp" + +int main() { + queue Queue{}; + context Ctxt{Queue.get_context()}; + + exp_ext::command_graph Graph{Ctxt, Queue.get_device()}; + exp_ext::command_graph SubGraph{Ctxt, Queue.get_device()}; + + int *PtrA = malloc_device(Size, Queue); + int *PtrB = malloc_device(Size, Queue); + + std::vector HostDataA(Size); + std::vector HostDataB(Size); + + Queue.memset(PtrA, 0, Size * sizeof(int)).wait(); + Queue.memset(PtrB, 0, Size * sizeof(int)).wait(); + + exp_ext::dynamic_parameter InputParam(SubGraph, PtrA); + +#ifndef __SYCL_DEVICE_ONLY__ + kernel_bundle Bundle = get_kernel_bundle(Ctxt); + kernel_id SubKernel_id = exp_ext::get_kernel_id(); + kernel SubKernel = Bundle.get_kernel(SubKernel_id); + auto SubKernelNode = SubGraph.add([&](handler &cgh) { + cgh.set_arg(0, InputParam); + cgh.single_task(SubKernel); + }); + + auto SubExecGraph = SubGraph.finalize(exp_ext::property::graph::updatable{}); + + kernel_id Kernel_id = exp_ext::get_kernel_id(); + kernel Kernel = Bundle.get_kernel(Kernel_id); + auto KernelNode = Graph.add([&](handler &cgh) { + cgh.set_arg(0, PtrA); + cgh.single_task(Kernel); + }); + + Graph.add([&](handler &cgh) { cgh.ext_oneapi_graph(SubExecGraph); }, + exp_ext::property::node::depends_on{KernelNode}); + + // Finalize the parent graph with the original values + auto ExecGraph = Graph.finalize(); + + // Swap PtrB to be the input + InputParam.update(PtrB); + // Update the executable graph that was used as a subgraph with the new value, + // this should not affect ExecGraph + SubExecGraph.update(SubKernelNode); + // Only PtrA should be filled with values + Queue.ext_oneapi_graph(ExecGraph).wait(); + + Queue.copy(PtrA, HostDataA.data(), Size).wait(); +
Queue.copy(PtrB, HostDataB.data(), Size).wait(); + for (size_t i = 0; i < Size; i++) { + assert(HostDataA[i] == i * 2); + assert(HostDataB[i] == 0); + } +#endif + sycl::free(PtrA, Queue); + sycl::free(PtrB, Queue); + + return 0; +} diff --git a/sycl/test-e2e/Graph/Update/FreeFunctionKernels/update_with_indices_scalar.cpp b/sycl/test-e2e/Graph/Update/FreeFunctionKernels/update_with_indices_scalar.cpp new file mode 100644 index 0000000000000..e69e4097e4b50 --- /dev/null +++ b/sycl/test-e2e/Graph/Update/FreeFunctionKernels/update_with_indices_scalar.cpp @@ -0,0 +1,65 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// +// XFAIL: cuda +// XFAIL-TRACKER: https://github.com/intel/llvm/issues/16004 + +// Tests updating a graph node scalar argument using index-based explicit update + +#include "../../graph_common.hpp" +#include "free_function_kernels.hpp" + +int main() { + queue Queue{}; + context Ctxt{Queue.get_context()}; + + exp_ext::command_graph Graph{Ctxt, Queue.get_device()}; + + int *DeviceData = malloc_device(Size, Queue); + + int ScalarValue = 17; + + std::vector HostData(Size); + + Queue.memset(DeviceData, 0, Size * sizeof(int)).wait(); + + exp_ext::dynamic_parameter InputParam(Graph, ScalarValue); + +#ifndef __SYCL_DEVICE_ONLY__ + kernel_bundle Bundle = get_kernel_bundle(Ctxt); + kernel_id Kernel_id = exp_ext::get_kernel_id(); + kernel Kernel = Bundle.get_kernel(Kernel_id); + auto KernelNode = Graph.add([&](handler &cgh) { + cgh.set_arg(0, DeviceData); + cgh.set_arg(1, InputParam); + cgh.single_task(Kernel); + 
}); + + auto ExecGraph = Graph.finalize(exp_ext::property::graph::updatable{}); + + // DeviceData should be filled with current ScalarValue (17) + Queue.ext_oneapi_graph(ExecGraph).wait(); + + Queue.copy(DeviceData, HostData.data(), Size).wait(); + for (size_t i = 0; i < Size; i++) { + assert(HostData[i] == 17); + } + + // Update ScalarValue to be 99 instead + InputParam.update(99); + ExecGraph.update(KernelNode); + Queue.ext_oneapi_graph(ExecGraph).wait(); + + Queue.copy(DeviceData, HostData.data(), Size).wait(); + for (size_t i = 0; i < Size; i++) { + assert(HostData[i] == 99); + } +#endif + sycl::free(DeviceData, Queue); + + return 0; +} diff --git a/sycl/test-e2e/Graph/Update/dyn_cgf_accessor.cpp b/sycl/test-e2e/Graph/Update/dyn_cgf_accessor.cpp index a5e5a1ea78b87..803f296d9f71a 100644 --- a/sycl/test-e2e/Graph/Update/dyn_cgf_accessor.cpp +++ b/sycl/test-e2e/Graph/Update/dyn_cgf_accessor.cpp @@ -5,9 +5,6 @@ // Extra run to check for immediate-command-list in Level Zero // RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} -// XFAIL: level_zero -// XFAIL-TRACKER: OFNAAO-307 - // Tests using dynamic command-group objects with buffer accessors #include "../graph_common.hpp" @@ -23,14 +20,14 @@ int main() { Queue.get_device(), {exp_ext::property::graph::assume_buffer_outlives_graph{}}}; - int PatternA = 42; + const int PatternA = 42; auto CGFA = [&](handler &CGH) { CGH.require(Acc); CGH.parallel_for(Size, [=](item<1> Item) { Acc[Item.get_id()] = PatternA; }); }; - int PatternB = 0xA; + const int PatternB = 0xA; auto CGFB = [&](handler &CGH) { CGH.require(Acc); CGH.parallel_for(Size, diff --git a/sycl/test-e2e/Graph/Update/dyn_cgf_accessor_deps.cpp b/sycl/test-e2e/Graph/Update/dyn_cgf_accessor_deps.cpp index 7b477edacff98..a8018190ab741 100644 --- a/sycl/test-e2e/Graph/Update/dyn_cgf_accessor_deps.cpp +++ b/sycl/test-e2e/Graph/Update/dyn_cgf_accessor_deps.cpp @@ 
-5,9 +5,6 @@ // Extra run to check for immediate-command-list in Level Zero // RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} -// XFAIL: level_zero -// XFAIL-TRACKER: OFNAAO-307 - // Tests adding a dynamic command-group node to a graph using buffer // accessors for the node edges. diff --git a/sycl/test-e2e/Graph/Update/dyn_cgf_accessor_deps2.cpp b/sycl/test-e2e/Graph/Update/dyn_cgf_accessor_deps2.cpp index a420d7deb58de..4e9ada8a3c246 100644 --- a/sycl/test-e2e/Graph/Update/dyn_cgf_accessor_deps2.cpp +++ b/sycl/test-e2e/Graph/Update/dyn_cgf_accessor_deps2.cpp @@ -5,9 +5,6 @@ // Extra run to check for immediate-command-list in Level Zero // RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} -// XFAIL: level_zero -// XFAIL-TRACKER: OFNAAO-307 - // Tests adding a dynamic command-group node to a graph using buffer // accessors for the node edges, but where different command-groups // use different buffers that create identical edges. 
diff --git a/sycl/test-e2e/Graph/Update/dyn_cgf_accessor_spv.cpp b/sycl/test-e2e/Graph/Update/dyn_cgf_accessor_spv.cpp index 0eaa714463670..08b5fa293cf80 100644 --- a/sycl/test-e2e/Graph/Update/dyn_cgf_accessor_spv.cpp +++ b/sycl/test-e2e/Graph/Update/dyn_cgf_accessor_spv.cpp @@ -1,13 +1,11 @@ // RUN: %{build} -o %t.out -// RUN: %{run} %t.out %S/../Inputs/Kernels/update_with_indices_accessor.spv +// RUN: %{run} %t.out %S/../Inputs/Kernels/dyn_cgf_accessor.spv // Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG -// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out %S/../Inputs/Kernels/update_with_indices_accessor.spv 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out %S/../Inputs/Kernels/dyn_cgf_accessor.spv 2>&1 | FileCheck %s --implicit-check-not=LEAK %} // Extra run to check for immediate-command-list in Level Zero -// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out %S/../Inputs/Kernels/update_with_indices_accessor.spv 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out %S/../Inputs/Kernels/dyn_cgf_accessor.spv 2>&1 | FileCheck %s --implicit-check-not=LEAK %} // REQUIRES: level_zero -// XFAIL: level_zero -// XFAIL-TRACKER: OFNAAO-307 // Tests updating an accessor argument to a graph node created from SPIR-V // using dynamic command-groups. 
@@ -23,8 +21,12 @@ int main(int, char **argv) { return bundle.ext_oneapi_get_kernel(name); }; - kernel kernel = getKernel( - KernelBundle, "_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_EUlvE_"); + kernel kernelA = getKernel( + KernelBundle, + "_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_EUlNS0_4itemILi1ELb1EEEE_"); + kernel kernelB = getKernel( + KernelBundle, + "_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE0_clES2_EUlNS0_4itemILi1ELb1EEEE_"); exp_ext::command_graph Graph{ Queue.get_context(), @@ -36,22 +38,27 @@ int main(int, char **argv) { BufA.set_write_back(false); BufB.set_write_back(false); + int PatternA = 42; + int PatternB = 0xA; + auto AccA = BufA.get_access(); auto AccB = BufB.get_access(); auto CGFA = [&](handler &CGH) { CGH.require(AccA); CGH.set_arg(0, AccA); - CGH.single_task(kernel); + CGH.set_arg(2, PatternA); + CGH.parallel_for(sycl::range<1>(Size), kernelA); }; auto CGFB = [&](handler &CGH) { CGH.require(AccB); CGH.set_arg(0, AccB); - CGH.single_task(kernel); + CGH.set_arg(2, PatternB); + CGH.parallel_for(sycl::range<1>(Size), kernelB); }; - auto DynamicCG = exp_ext::dynamic_command_group(Queue, {CGFA, CGFB}); + auto DynamicCG = exp_ext::dynamic_command_group(Graph, {CGFA, CGFB}); auto DynamicCGNode = Graph.add(DynamicCG); auto ExecGraph = Graph.finalize(exp_ext::property::graph::updatable{}); @@ -62,8 +69,8 @@ int main(int, char **argv) { Queue.copy(BufA.get_access(), HostDataA.data()).wait(); Queue.copy(BufB.get_access(), HostDataB.data()).wait(); for (size_t i = 0; i < Size; i++) { - assert(HostDataA[i] == i); - assert(HostDataB[i] == 0); + assert(check_value(i, PatternA, HostDataA[i], "HostDataA")); + assert(check_value(i, 0, HostDataB[i], "HostDataB")); } DynamicCG.set_active_cgf(1); @@ -74,8 +81,8 @@ int main(int, char **argv) { Queue.copy(BufA.get_access(), HostDataA.data()).wait(); Queue.copy(BufB.get_access(), HostDataB.data()).wait(); for (size_t i = 0; i < Size; i++) { - assert(HostDataA[i] == i); - assert(HostDataB[i] == i); + 
assert(check_value(i, PatternA, HostDataA[i], "HostDataA")); + assert(check_value(i, PatternB, HostDataB[i], "HostDataB")); } return 0; } diff --git a/sycl/test-e2e/Graph/Update/dyn_cgf_different_arg_nums.cpp b/sycl/test-e2e/Graph/Update/dyn_cgf_different_arg_nums.cpp index 9b3bea4bceaff..7288fba3a73d1 100644 --- a/sycl/test-e2e/Graph/Update/dyn_cgf_different_arg_nums.cpp +++ b/sycl/test-e2e/Graph/Update/dyn_cgf_different_arg_nums.cpp @@ -1,12 +1,9 @@ // RUN: %{build} -o %t.out // RUN: env SYCL_UR_TRACE=2 %{run} %t.out | FileCheck %s // Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG -// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 env SYCL_UR_TRACE=2 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} // Extra run to check for immediate-command-list in Level Zero -// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} - -// XFAIL: level_zero -// XFAIL-TRACKER: OFNAAO-307 +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 env SYCL_UR_TRACE=2 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} // Tests updating a dynamic command-group with command-groups containing a // different number of arguments. 
diff --git a/sycl/test-e2e/Graph/Update/dyn_cgf_event_deps.cpp b/sycl/test-e2e/Graph/Update/dyn_cgf_event_deps.cpp index 0964f6e0c354e..9556f97de69f1 100644 --- a/sycl/test-e2e/Graph/Update/dyn_cgf_event_deps.cpp +++ b/sycl/test-e2e/Graph/Update/dyn_cgf_event_deps.cpp @@ -5,9 +5,6 @@ // Extra run to check for immediate-command-list in Level Zero // RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} -// XFAIL: level_zero -// XFAIL-TRACKER: OFNAAO-307 - // Tests adding a dynamic command-group node to a graph using graph limited // events for dependencies. diff --git a/sycl/test-e2e/Graph/Update/dyn_cgf_ndrange.cpp b/sycl/test-e2e/Graph/Update/dyn_cgf_ndrange.cpp index 237e9173f253e..cbe1c2c3e117a 100644 --- a/sycl/test-e2e/Graph/Update/dyn_cgf_ndrange.cpp +++ b/sycl/test-e2e/Graph/Update/dyn_cgf_ndrange.cpp @@ -5,9 +5,6 @@ // Extra run to check for immediate-command-list in Level Zero // RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} -// XFAIL: level_zero -// XFAIL-TRACKER: OFNAAO-307 - // Tests updating a dynamic command-group node where the dynamic command-groups // have different ranges/nd-ranges diff --git a/sycl/test-e2e/Graph/Update/dyn_cgf_ndrange_3D.cpp b/sycl/test-e2e/Graph/Update/dyn_cgf_ndrange_3D.cpp index 261ac6ecf5c3b..3fd32ef575cf4 100644 --- a/sycl/test-e2e/Graph/Update/dyn_cgf_ndrange_3D.cpp +++ b/sycl/test-e2e/Graph/Update/dyn_cgf_ndrange_3D.cpp @@ -5,9 +5,6 @@ // Extra run to check for immediate-command-list in Level Zero // RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} -// XFAIL: level_zero -// XFAIL-TRACKER: OFNAAO-307 - // Tests updating a dynamic command-group node where the dynamic command-groups // have different range/nd-range 
dimensions diff --git a/sycl/test-e2e/Graph/Update/dyn_cgf_parameters.cpp b/sycl/test-e2e/Graph/Update/dyn_cgf_parameters.cpp index 7049b5bdde305..7f00d0f8750ce 100644 --- a/sycl/test-e2e/Graph/Update/dyn_cgf_parameters.cpp +++ b/sycl/test-e2e/Graph/Update/dyn_cgf_parameters.cpp @@ -5,9 +5,6 @@ // Extra run to check for immediate-command-list in Level Zero // RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} -// XFAIL: level_zero -// XFAIL-TRACKER: OFNAAO-307 - // Tests updating kernel code using dynamic command-groups that have different // parameters in each command-group. diff --git a/sycl/test-e2e/Graph/Update/dyn_cgf_shared_nodes.cpp b/sycl/test-e2e/Graph/Update/dyn_cgf_shared_nodes.cpp index 1f98200791b6c..eab640b45b258 100644 --- a/sycl/test-e2e/Graph/Update/dyn_cgf_shared_nodes.cpp +++ b/sycl/test-e2e/Graph/Update/dyn_cgf_shared_nodes.cpp @@ -5,9 +5,6 @@ // Extra run to check for immediate-command-list in Level Zero // RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} -// XFAIL: level_zero -// XFAIL-TRACKER: OFNAAO-307 - // Tests using the same dynamic command-group in more than one graph node. 
#include "../graph_common.hpp" diff --git a/sycl/test-e2e/Graph/Update/dyn_cgf_usm.cpp b/sycl/test-e2e/Graph/Update/dyn_cgf_usm.cpp index 100701f7b62aa..97c454b6db92a 100644 --- a/sycl/test-e2e/Graph/Update/dyn_cgf_usm.cpp +++ b/sycl/test-e2e/Graph/Update/dyn_cgf_usm.cpp @@ -5,9 +5,6 @@ // Extra run to check for immediate-command-list in Level Zero // RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} -// XFAIL: level_zero -// XFAIL-TRACKER: OFNAAO-307 - // Tests updating usm kernel code using dynamic command-groups #include "../graph_common.hpp" diff --git a/sycl/test-e2e/Graph/Update/dyn_cgf_with_all_dyn_params.cpp b/sycl/test-e2e/Graph/Update/dyn_cgf_with_all_dyn_params.cpp index 53b34d1add289..28a55ecfeceeb 100644 --- a/sycl/test-e2e/Graph/Update/dyn_cgf_with_all_dyn_params.cpp +++ b/sycl/test-e2e/Graph/Update/dyn_cgf_with_all_dyn_params.cpp @@ -5,9 +5,6 @@ // Extra run to check for immediate-command-list in Level Zero // RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} -// XFAIL: level_zero -// XFAIL-TRACKER: OFNAAO-307 - // Tests using a dynamic command-group object with dynamic parameters inside it #include "../graph_common.hpp" diff --git a/sycl/test-e2e/Graph/Update/dyn_cgf_with_different_type_dyn_params.cpp b/sycl/test-e2e/Graph/Update/dyn_cgf_with_different_type_dyn_params.cpp index 00482185ebc27..925839729cce8 100644 --- a/sycl/test-e2e/Graph/Update/dyn_cgf_with_different_type_dyn_params.cpp +++ b/sycl/test-e2e/Graph/Update/dyn_cgf_with_different_type_dyn_params.cpp @@ -5,9 +5,6 @@ // Extra run to check for immediate-command-list in Level Zero // RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} -// XFAIL: level_zero -// XFAIL-TRACKER: OFNAAO-307 
- // Tests using a dynamic command-group object with dynamic parameters of // different types diff --git a/sycl/test-e2e/Graph/Update/dyn_cgf_with_some_dyn_params.cpp b/sycl/test-e2e/Graph/Update/dyn_cgf_with_some_dyn_params.cpp index 3213fc4eec2fe..6ee6dafaaea60 100644 --- a/sycl/test-e2e/Graph/Update/dyn_cgf_with_some_dyn_params.cpp +++ b/sycl/test-e2e/Graph/Update/dyn_cgf_with_some_dyn_params.cpp @@ -5,9 +5,6 @@ // Extra run to check for immediate-command-list in Level Zero // RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} -// XFAIL: level_zero -// XFAIL-TRACKER: OFNAAO-307 - // Tests using a dynamic command-group object where some but not all the // command-groups use dynamic parameters. diff --git a/sycl/test-e2e/Graph/Update/whole_update_dynamic_cgf.cpp b/sycl/test-e2e/Graph/Update/whole_update_dynamic_cgf.cpp index 0e8b87c0725f2..bf40f1baf7661 100644 --- a/sycl/test-e2e/Graph/Update/whole_update_dynamic_cgf.cpp +++ b/sycl/test-e2e/Graph/Update/whole_update_dynamic_cgf.cpp @@ -5,9 +5,6 @@ // Extra run to check for immediate-command-list in Level Zero // RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} -// XFAIL: level_zero -// XFAIL-TRACKER: OFNAAO-307 - // Tests interaction of whole graph update and dynamic command-groups #include "../graph_common.hpp" diff --git a/sycl/test-e2e/Graph/graph_common.hpp b/sycl/test-e2e/Graph/graph_common.hpp index d60049a71cca3..4b5a4c6c9b41f 100644 --- a/sycl/test-e2e/Graph/graph_common.hpp +++ b/sycl/test-e2e/Graph/graph_common.hpp @@ -1,3 +1,5 @@ +#pragma once + #include #include diff --git a/sycl/test-e2e/KernelAndProgram/level-zero-static-link-flow.cpp b/sycl/test-e2e/KernelAndProgram/level-zero-static-link-flow.cpp index 2360554318092..55402a0f44a5b 100644 --- 
a/sycl/test-e2e/KernelAndProgram/level-zero-static-link-flow.cpp +++ b/sycl/test-e2e/KernelAndProgram/level-zero-static-link-flow.cpp @@ -6,7 +6,7 @@ //==--- level-zero-static-link-flow.cpp.cpp - Check L0 static link flow --==// // // Run a simple program that uses online linking and verify that the sequence -// of calls to the plugin and to the Level Zero driver are consistent with the +// of calls to the adapter and to the Level Zero driver are consistent with the // "static linking" implementation. // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. diff --git a/sycl/test-e2e/KernelCompiler/kernel_compiler_sycl_jit.cpp b/sycl/test-e2e/KernelCompiler/kernel_compiler_sycl_jit.cpp index 3402691453287..563f75d313e95 100644 --- a/sycl/test-e2e/KernelCompiler/kernel_compiler_sycl_jit.cpp +++ b/sycl/test-e2e/KernelCompiler/kernel_compiler_sycl_jit.cpp @@ -56,7 +56,7 @@ auto constexpr SYCLSource = R"===( // use extern "C" to avoid name mangling extern "C" SYCL_EXTERNAL SYCL_EXT_ONEAPI_FUNCTION_PROPERTY((sycl::ext::oneapi::experimental::nd_range_kernel<1>)) -void ff_cp(int *ptr) { +void ff_cp(int *ptr, int *unused) { // intentionally using deprecated routine, as opposed to this_work_item::get_nd_item<1>() sycl::nd_item<1> Item = sycl::ext::oneapi::experimental::this_nd_item<1>(); @@ -64,6 +64,17 @@ void ff_cp(int *ptr) { sycl::id<1> GId = Item.get_global_id(); ptr[GId.get(0)] = AddEm(GId.get(0), 37); } + +// this name will be mangled +template +SYCL_EXTERNAL SYCL_EXT_ONEAPI_FUNCTION_PROPERTY((sycl::ext::oneapi::experimental::nd_range_kernel<1>)) +void ff_templated(T *ptr, T *unused) { + + sycl::nd_item<1> Item = sycl::ext::oneapi::this_work_item::get_nd_item<1>(); + + sycl::id<1> GId = Item.get_global_id(); + ptr[GId.get(0)] = PlusEm(GId.get(0), 38); +} )==="; void test_1(sycl::queue &Queue, sycl::kernel &Kernel, int seed) { @@ -78,6 +89,7 @@ void test_1(sycl::queue &Queue, sycl::kernel &Kernel, int seed) { memset(usmPtr, 0, Range * 
sizeof(int)); Queue.submit([&](sycl::handler &Handler) { Handler.set_arg(0, usmPtr); + Handler.set_arg(1, usmPtr); Handler.parallel_for(R1, Kernel); }); Queue.wait(); @@ -125,19 +137,32 @@ int test_build_and_run() { // Compilation of empty prop list, no devices. exe_kb kbExe1 = syclex::build(kbSrc); - // // Compilation with props and devices + // Compilation with props and devices std::string log; std::vector flags{"-g", "-fno-fast-math", "-fsycl-instrument-device-code"}; std::vector devs = kbSrc.get_devices(); exe_kb kbExe2 = syclex::build( - kbSrc, devs, syclex::properties{syclex::build_options{flags}}); + kbSrc, devs, + syclex::properties{syclex::build_options{flags}, syclex::save_log{&log}, + syclex::registered_kernel_names{"ff_templated"}}); - // extern "C" was used, so the name "ff_cp" is not mangled. + // extern "C" was used, so the name "ff_cp" is not mangled and can be used + // directly. sycl::kernel k = kbExe2.ext_oneapi_get_kernel("ff_cp"); + // The templated function name will have been mangled. Mapping from original + // name to mangled is not yet supported. So we cannot yet do this: + // sycl::kernel k2 = kbExe2.ext_oneapi_get_kernel("ff_templated"); + + // Instead, we can TEMPORARILY use the mangled name. Once demangling is + // supported this might no longer work. + sycl::kernel k2 = + kbExe2.ext_oneapi_get_kernel("_Z26__sycl_kernel_ff_templatedIiEvPT_S1_"); + // Test the kernels. - test_1(q, k, 37 + 5); // ff_cp seeds 37. AddEm will add 5 more. + test_1(q, k, 37 + 5); // ff_cp seeds 37. AddEm will add 5 more. + test_1(q, k2, 38 + 6); // ff_templated seeds 38. PlusEm adds 6 more. 
return 0; } @@ -181,6 +206,7 @@ int test_unsupported_options() { CheckUnsupported({"-Xarch_device", "-fsanitize=address"}); CheckUnsupported({"-fsycl-device-code-split=kernel"}); CheckUnsupported({"-fsycl-device-code-split-esimd"}); + CheckUnsupported({"-fsycl-dead-args-optimization"}); return 0; } diff --git a/sycl/test-e2e/OneapiDeviceSelector/illegal_input_hang.cpp b/sycl/test-e2e/OneapiDeviceSelector/illegal_input_hang.cpp index 1c763fef3a8ba..b89b51c3c0266 100644 --- a/sycl/test-e2e/OneapiDeviceSelector/illegal_input_hang.cpp +++ b/sycl/test-e2e/OneapiDeviceSelector/illegal_input_hang.cpp @@ -4,7 +4,7 @@ #include // Check that the application does not hang when we attempt -// to initialize plugins multiple times with invalid values +// to initialize adapters multiple times with invalid values // of ONEAPI_DEVICE_SELECTOR. int main() { for (int I = 0; I < 3; ++I) { diff --git a/sycl/test-e2e/Scheduler/InOrderQueueDeps.cpp b/sycl/test-e2e/Scheduler/InOrderQueueDeps.cpp index 27365433248a0..e1309ee0edb1a 100644 --- a/sycl/test-e2e/Scheduler/InOrderQueueDeps.cpp +++ b/sycl/test-e2e/Scheduler/InOrderQueueDeps.cpp @@ -4,7 +4,7 @@ // XFAIL: hip_nvidia // The tested functionality is disabled with Level Zero until it is supported by -// the plugin. +// the adapter. // UNSUPPORTED: level_zero //==----------------------- InOrderQueueDeps.cpp ---------------------------==// // diff --git a/sycl/test-e2e/Tracing/buffer_printers.cpp b/sycl/test-e2e/Tracing/buffer_printers.cpp index f5c8521ce6ec5..4e29cbb02ff6c 100644 --- a/sycl/test-e2e/Tracing/buffer_printers.cpp +++ b/sycl/test-e2e/Tracing/buffer_printers.cpp @@ -6,7 +6,7 @@ #include #include -// Test image-specific printers of the Plugin Interace +// Test the buffer-specific printers of UR tracing. 
// //CHECK: <--- urEnqueueMemBufferCopyRect( //CHECK-SAME: .srcOrigin = (struct ur_rect_offset_t){.x = 64, .y = 5, .z = 0} diff --git a/sycl/test-e2e/Tracing/image_printers.cpp b/sycl/test-e2e/Tracing/image_printers.cpp index c176a699d5e1a..3556fa4c0c7b4 100644 --- a/sycl/test-e2e/Tracing/image_printers.cpp +++ b/sycl/test-e2e/Tracing/image_printers.cpp @@ -3,7 +3,7 @@ // RUN: %{build} -o %t.out // RUN: %{run} %t.out -// Test image-specific printers of the Plugin Interace +// Test the image-specific printers of UR tracing. // // CHECK: <--- urMemImageCreate( // CHECK-SAME: image_desc w/h/d : 4 / 4 / 1 -- arrSz/row/slice : 0 / 64 / 256 -- num_mip_lvls/num_smpls/image_type : 0 / 0 / 4337 diff --git a/sycl/test-e2e/USM/usm_pooling.cpp b/sycl/test-e2e/USM/usm_pooling.cpp index 63a083e5d47a3..258d97420a665 100644 --- a/sycl/test-e2e/USM/usm_pooling.cpp +++ b/sycl/test-e2e/USM/usm_pooling.cpp @@ -88,11 +88,11 @@ int main(int argc, char *argv[]) { context C = Q.get_context(); const char *devType = D.is_cpu() ? "CPU" : "GPU"; - std::string pluginName = + std::string adapterName = D.get_platform().get_info(); std::cout << "Running on device " << devType << " (" - << D.get_info() << ") " << pluginName - << " plugin\n"; + << D.get_info() << ") " << adapterName + << " adapter\n"; if (*argv[1] == 'h') { std::cerr << "Test zeMemAllocHost\n"; diff --git a/sycl/test-e2e/WeakObject/weak_object_utils.hpp b/sycl/test-e2e/WeakObject/weak_object_utils.hpp index d0fdaf5870d9f..e4b2a3edfbf3e 100644 --- a/sycl/test-e2e/WeakObject/weak_object_utils.hpp +++ b/sycl/test-e2e/WeakObject/weak_object_utils.hpp @@ -104,21 +104,6 @@ template