From 4fe3c4b5ffa287d6c12496137df495191986ae99 Mon Sep 17 00:00:00 2001
From: Eric Schweitz
Date: Fri, 18 Oct 2024 10:08:40 -0700
Subject: [PATCH] [core, runtime] Modify the launchers to support returning
 results. (#2277)

* Modify the launchers to support returning results.

The launchers ceased really supporting dynamic kernel results at some
point. This PR adds that functionality back into the mix. This support
is added pervasively across the runtime library calls.

Some notes:

- Return values of static size continue to be supported as they were
  before. Specifically, these values are stored into the data buffer by
  the thunk adaptor so they can be returned, ultimately, to the
  original caller.
- Return values of dynamic size follow exactly 1 of 2 possible calling
  conventions. The convention must be selected by the runtime layers.
  1. Everything is running within a single process; i.e., this is a
     simulation. In this case, the kernel will create a span of data,
     and that span will be returned to the original caller, which will
     use it to construct the std::vector result object.
  2. There are multiple processes and/or memory spaces involved. The
     result span will be appended to the original data packet, and the
     new data packet will be returned as a new span by the runtime. The
     calling code will follow a similar process, but the data will be
     passed through the runtime in a pointer-free encoding.

Make the cast more robust to sneak it past -Werror.

Update another launchKernel override.

Add some doxygen goop to try and evade CI issues.

Fix the python builder errors.

Signed-off-by: Eric Schweitz

* Sachin's fix.

Signed-off-by: Eric Schweitz

* Fix build.

Signed-off-by: Eric Schweitz

---------

Signed-off-by: Eric Schweitz
---
 docs/sphinx/api/languages/cpp_api.rst         |   4 +
 lib/Optimizer/Builder/Intrinsics.cpp          |  14 +-
 .../Transforms/GenKernelExecution.cpp         | 146 +++++++++----
 lib/Optimizer/Transforms/LiftArrayAlloc.cpp   |  73 ++++---
 .../cudaq/platform/py_alt_launch_kernel.cpp   |  20 +-
 python/runtime/utils/PyRemoteSimulatorQPU.cpp |  30 +--
 runtime/common/BaseRemoteRESTQPU.h            |  12 +-
 runtime/common/BaseRemoteSimulatorQPU.h       |  32 +--
 runtime/common/KernelWrapper.h                |   8 +-
 runtime/common/ThunkInterface.h               |  44 ++++
 .../default/DefaultQuantumPlatform.cpp        |   9 +-
 .../mqpu/custatevec/GPUEmulatedQPU.cpp        |   9 +-
 runtime/cudaq/platform/orca/OrcaQPU.cpp       |   8 +-
 .../cudaq/platform/orca/OrcaRemoteRESTQPU.cpp |  16 +-
 .../cudaq/platform/orca/OrcaRemoteRESTQPU.h   |   9 +-
 runtime/cudaq/platform/qpu.h                  |   9 +-
 runtime/cudaq/platform/quantum_platform.cpp   |  78 +++----
 runtime/cudaq/platform/quantum_platform.h     |  27 +--
 runtime/cudaq/qis/remote_state.cpp            |   3 +-
 targettests/execution/vector_result.cpp       |  52 +++++
 test/Quake-QIR/argument.qke                   |   8 +-
 test/Quake-QIR/return_values.qke              |  35 ++--
 test/Quake/kernel_exec-1.qke                  | 130 +++++++-----
 test/Quake/kernel_exec-2.qke                  |   4 +-
 test/Quake/return_vector.qke                  | 196 ++++++++++--------
 25 files changed, 627 insertions(+), 349 deletions(-)
 create mode 100644 runtime/common/ThunkInterface.h
 create mode 100644 targettests/execution/vector_result.cpp

diff --git a/docs/sphinx/api/languages/cpp_api.rst b/docs/sphinx/api/languages/cpp_api.rst
index 1a26fb2d41..7f4728199e 100644
--- a/docs/sphinx/api/languages/cpp_api.rst
+++ b/docs/sphinx/api/languages/cpp_api.rst
@@ -190,6 +190,10 @@ Platform
 
 .. doxygentypedef:: cudaq::KernelExecutionTask
 
+.. doxygenstruct:: cudaq::KernelThunkResultType
+
+.. 
doxygentypedef:: cudaq::KernelThunkType + Utilities ========= diff --git a/lib/Optimizer/Builder/Intrinsics.cpp b/lib/Optimizer/Builder/Intrinsics.cpp index 12e430dc03..a6cc0ae477 100644 --- a/lib/Optimizer/Builder/Intrinsics.cpp +++ b/lib/Optimizer/Builder/Intrinsics.cpp @@ -232,9 +232,14 @@ static constexpr IntrinsicCode intrinsicTable[] = { })#"}, {"__nvqpp_createDynamicResult", + /* arguments: + arg0: original buffer ptr + arg1: original buffer size + arg2: ptr to span of the return data: {ptr, bytes} + arg3: offset to result slot in buffer */ {cudaq::llvmMemCopyIntrinsic, "malloc"}, R"#( - func.func private @__nvqpp_createDynamicResult(%arg0: !cc.ptr, %arg1: i64, %arg2: !cc.ptr, i64}>>) -> !cc.struct<{!cc.ptr, i64}> { + func.func private @__nvqpp_createDynamicResult(%arg0: !cc.ptr, %arg1: i64, %arg2: !cc.ptr, i64}>>, %arg3: i64) -> !cc.struct<{!cc.ptr, i64}> { %0 = cc.compute_ptr %arg2[1] : (!cc.ptr, i64}>>) -> !cc.ptr %1 = cc.load %0 : !cc.ptr %2 = arith.addi %arg1, %1 : i64 @@ -249,6 +254,9 @@ static constexpr IntrinsicCode intrinsicTable[] = { %7 = cc.undef !cc.struct<{!cc.ptr, i64}> %8 = cc.insert_value %3, %7[0] : (!cc.struct<{!cc.ptr, i64}>, !cc.ptr) -> !cc.struct<{!cc.ptr, i64}> %9 = cc.insert_value %2, %8[1] : (!cc.struct<{!cc.ptr, i64}>, i64) -> !cc.struct<{!cc.ptr, i64}> + %11 = cc.compute_ptr %10[%arg3] : (!cc.ptr>, i64) -> !cc.ptr + %12 = cc.cast %11 : (!cc.ptr) -> !cc.ptr> + cc.store %6, %12 : !cc.ptr> return %9 : !cc.struct<{!cc.ptr, i64}> })#"}, @@ -319,7 +327,7 @@ static constexpr IntrinsicCode intrinsicTable[] = { {cudaq::runtime::launchKernelFuncName, {}, R"#( - func.func private @altLaunchKernel(!cc.ptr, !cc.ptr, !cc.ptr, i64, i64) -> ())#"}, + func.func private @altLaunchKernel(!cc.ptr, !cc.ptr, !cc.ptr, i64, i64) -> !cc.struct<{!cc.ptr, i64}>)#"}, {cudaq::runtime::CudaqRegisterArgsCreator, {}, @@ -346,7 +354,7 @@ static constexpr IntrinsicCode intrinsicTable[] = { {cudaq::runtime::launchKernelHybridFuncName, {}, R"#( - func.func private @hybridLaunchKernel(!cc.ptr, !cc.ptr, !cc.ptr, i64, i64, !cc.ptr) -> ())#"}, + func.func private @hybridLaunchKernel(!cc.ptr, !cc.ptr, !cc.ptr, i64, i64, !cc.ptr) -> !cc.struct<{!cc.ptr, i64}>)#"}, {cudaq::llvmMemCopyIntrinsic, // llvm.memcpy.p0i8.p0i8.i64 {}, diff --git a/lib/Optimizer/Transforms/GenKernelExecution.cpp b/lib/Optimizer/Transforms/GenKernelExecution.cpp index a4667ce7b5..2e45c8df96 100644 --- a/lib/Optimizer/Transforms/GenKernelExecution.cpp +++ b/lib/Optimizer/Transforms/GenKernelExecution.cpp @@ -48,6 +48,12 @@ static bool isCodegenArgumentGather(std::size_t kind) { return kind == 0 || kind == 2; } +/// This pass adds a `.thunk` function and a rewritten C++ host +/// side (mangled) stub to the code for every entry-point kernel in the module. +/// It may also generate a `.argsCreator` function. Finally, it +/// creates registration hooks for the CUDA-Q runtime to be able to find the +/// kernel by name and, as appropriate, the `.argsCreator` +/// function. namespace { class GenerateKernelExecution : public cudaq::opt::impl::GenerateKernelExecutionBase< @@ -57,6 +63,19 @@ class GenerateKernelExecution /// Creates the function signature for a thunk function. The signature is /// always the same for all thunk functions. + /// + /// Every thunk function has an identical signature, making it callable from a + /// generic "kernel launcher" in the CUDA-Q runtime. + /// + /// This signature is defined as: `(ptr, bool) -> {ptr, i64}`. 
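+  ///
+  /// In C++ terms, the thunk for a kernel named `ghz` would be declared
+  /// roughly as follows (a sketch; thunks are generated by this pass, and
+  /// the name and linkage shown here are for illustration only):
+  ///
+  ///   extern "C" cudaq::KernelThunkResultType
+  ///   ghz_thunk(void *packedArgsAndResult, bool isClientServer);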
+ /// + /// The first argument is a pointer to a data buffer that encodes all the + /// arguments (and static return) values to (and from) the kernel in the + /// pointer-free encoding. The second argument indicates if this call is to a + /// remote process (if true). The result is a pointer and size (span) if the + /// kernel returns a dynamically sized result, otherwise it will be + /// `{nullptr, 0}`. It is the responsibility of calling code to free any + /// dynamic result buffer(s) and convert those to `std::vector` objects. FunctionType getThunkType(MLIRContext *ctx) { auto ptrTy = cudaq::cc::PointerType::get(IntegerType::get(ctx, 8)); return FunctionType::get(ctx, {ptrTy, IntegerType::get(ctx, 1)}, @@ -769,31 +788,32 @@ class GenerateKernelExecution auto *thenBlock = builder.createBlock(reg); auto *elseBlock = builder.createBlock(reg); builder.setInsertionPointToEnd(currentBlock); + auto eleTy = structTy.getMember(offset); + auto memTy = cudaq::cc::PointerType::get(eleTy); + auto mem = builder.create( + loc, memTy, castOp, SmallVector{offset}); + auto resPtrTy = cudaq::cc::PointerType::get(call.getResult(0).getType()); + auto castMem = builder.create(loc, resPtrTy, mem); + builder.create(loc, call.getResult(0), castMem); builder.create(loc, isClientServer, thenBlock, elseBlock); builder.setInsertionPointToEnd(thenBlock); - auto gepRes = builder.create( - loc, cudaq::cc::PointerType::get(structTy.getMember(offset)), castOp, - ArrayRef{offset}); - auto resAsVec = builder.create( - loc, cudaq::cc::PointerType::get(funcTy.getResult(0)), gepRes); - builder.create(loc, call.getResult(0), resAsVec); auto resAsArg = builder.create( - loc, cudaq::cc::PointerType::get(thunkTy.getResults()[0]), gepRes); - // createDynamicResult packs the input values and the dynamic results - // into a single buffer to pass back as a message. + loc, cudaq::cc::PointerType::get(thunkTy.getResults()[0]), mem); + auto retOffset = genComputeReturnOffset(loc, builder, funcTy, structTy); + // createDynamicResult allocates a new buffer and packs the input values + // and the dynamic results into this single new buffer to pass back as a + // message. + // NB: This code only handles one dimensional vectors of static types. It + // will have to be changed if there is a need to return recursively + // dynamic structures, i.e., vectors of vectors. auto res = builder.create( loc, thunkTy.getResults()[0], "__nvqpp_createDynamicResult", - ValueRange{thunkEntry->getArgument(0), structSize, resAsArg}); + ValueRange{thunkEntry->getArgument(0), structSize, resAsArg, + retOffset}); builder.create(loc, res.getResult(0)); builder.setInsertionPointToEnd(elseBlock); - auto eleTy = structTy.getMember(offset); - auto memTy = cudaq::cc::PointerType::get(eleTy); - auto mem = builder.create( - loc, memTy, castOp, SmallVector{offset}); - auto resPtrTy = cudaq::cc::PointerType::get(call.getResult(0).getType()); - auto castMem = builder.create(loc, resPtrTy, mem); - builder.create(loc, call.getResult(0), castMem); + // For the else case, the span was already copied to the block. } else { // FIXME: Should check for recursive vector case. 
// If the kernel returns non-dynamic results (no spans), then take those @@ -854,8 +874,6 @@ class GenerateKernelExecution auto ptrPtrTy = cudaq::cc::PointerType::get(ptrTy); auto sret0 = builder.create( loc, ptrPtrTy, castSret, SmallVector{0}); - Value vecPtr = builder.create(loc, ptrTy, sret0); - builder.create(loc, std::nullopt, "free", ValueRange{vecPtr}); auto arrI8Ty = cudaq::cc::ArrayType::get(i8Ty); auto ptrArrTy = cudaq::cc::PointerType::get(arrI8Ty); auto buffPtr0 = builder.create(loc, ptrTy, data); @@ -1338,21 +1356,72 @@ class GenerateKernelExecution auto castLoadKernName = builder.create(loc, ptrI8Ty, loadKernName); + auto hostFuncTy = hostFunc.getFunctionType(); + assert((hostFuncTy.getResults().empty() || + (hostFuncTy.getNumResults() == 1)) && + "C++ function expected to have 0 or 1 return value"); + const bool resultVal = !hostFuncTy.getResults().empty(); + const bool kernelReturnsValue = + resultVal || cudaq::opt::factory::hasSRet(hostFunc); + Value launchResult; + Value launchResultToFree; + auto decodeLaunchResults = [&](Value spanReturned) { + if (!kernelReturnsValue) + return; + Type res0Ty = structTy.getMember(offset); + auto ptrResTy = cudaq::cc::PointerType::get(res0Ty); + auto rptr = builder.create(loc, ptrI8Ty, + spanReturned, 0); + launchResultToFree = rptr; + auto rIntPtr = builder.create(loc, i64Ty, rptr); + auto zero = builder.create(loc, 0, 64); + auto cmp = builder.create(loc, arith::CmpIPredicate::ne, + rIntPtr, zero); + auto *currentBlock = builder.getBlock(); + auto *reg = currentBlock->getParent(); + auto *thenBlock = builder.createBlock(reg); + auto *elseBlock = builder.createBlock(reg); + auto *endifBlock = builder.createBlock( + reg, reg->end(), TypeRange{ptrResTy}, SmallVector(1, loc)); + builder.setInsertionPointToEnd(currentBlock); + builder.create(loc, cmp, thenBlock, elseBlock); + builder.setInsertionPointToEnd(thenBlock); + // dynamic result was returned. + // We need to free() this buffer before the end of this function. + auto rStructPtr = + builder.create(loc, structPtrTy, rptr); + Value lRes = builder.create( + loc, ptrResTy, rStructPtr, + ArrayRef{offset}); + builder.create(loc, endifBlock, ArrayRef{lRes}); + builder.setInsertionPointToEnd(elseBlock); + // span was returned in the original buffer. + Value mRes = builder.create( + loc, ptrResTy, temp, ArrayRef{0, offset}); + builder.create(loc, endifBlock, ArrayRef{mRes}); + builder.setInsertionPointToEnd(endifBlock); + launchResult = endifBlock->getArgument(0); + }; + // Generate the call to `launchKernel`. 
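+    // At run time, the code generated below behaves like the following
+    // sketch (a hypothetical C++ rendering of the IR built here):
+    //
+    //   cudaq::KernelThunkResultType span = altLaunchKernel(
+    //       kernelName, thunk, buffer, bufferSize, resultOffset);
+    //   char *resultSlot = span.data_buffer
+    //       ? (char *)span.data_buffer + resultOffset // dynamic: new buffer
+    //       : (char *)buffer + resultOffset;          // static: original
+    //
+    // A non-null span pointer means the launch returned a freshly allocated
+    // buffer, which must be freed once the result has been decoded.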
switch (codegenKind) { case 0: { assert(vecArgPtrs && castLoadThunk); - builder.create( - loc, std::nullopt, cudaq::runtime::launchKernelHybridFuncName, + auto launch = builder.create( + loc, cudaq::opt::factory::getDynamicBufferType(ctx), + cudaq::runtime::launchKernelHybridFuncName, ArrayRef{castLoadKernName, castLoadThunk, castTemp, extendedStructSize, resultOffset, vecArgPtrs}); + decodeLaunchResults(launch.getResult(0)); } break; case 1: { assert(!vecArgPtrs && castLoadThunk); - builder.create( - loc, std::nullopt, cudaq::runtime::launchKernelFuncName, + auto launch = builder.create( + loc, cudaq::opt::factory::getDynamicBufferType(ctx), + cudaq::runtime::launchKernelFuncName, ArrayRef{castLoadKernName, castLoadThunk, castTemp, extendedStructSize, resultOffset}); + decodeLaunchResults(launch.getResult(0)); } break; case 2: { assert(vecArgPtrs && !castLoadThunk); @@ -1377,17 +1446,13 @@ class GenerateKernelExecution // result value(s) from the struct returned by `launchKernel` and return // them to our caller. SmallVector results; - auto hostFuncTy = hostFunc.getFunctionType(); - assert((hostFuncTy.getResults().empty() || - (hostFuncTy.getNumResults() == 1)) && - "C++ function expected to have 0 or 1 return value"); - const bool resultVal = !hostFuncTy.getResults().empty(); - if (resultVal || cudaq::opt::factory::hasSRet(hostFunc)) { + if (kernelReturnsValue) { + Type res0Ty = structTy.getMember(offset); + auto ptrResTy = cudaq::cc::PointerType::get(res0Ty); // Host function returns a value. Either returning by value or via an sret // reference. if (resultVal) { - Type res0Ty = structTy.getMember(offset); - auto ptrResTy = cudaq::cc::PointerType::get(res0Ty); + // Static values. std::vector are necessarily sret, see below. auto resPtr = builder.create( loc, ptrResTy, temp, ArrayRef{0, offset}); Type castToTy = cudaq::cc::PointerType::get(hostFuncTy.getResult(0)); @@ -1398,22 +1463,22 @@ class GenerateKernelExecution }(); results.push_back(builder.create(loc, castResPtr)); } else { - // Check if device is returning a span. If it is, then we will need to - // convert it to a std::vector here. The vector is constructed in-place - // on the sret memory block. + // This is an sret return. Check if device is returning a span. If it + // is, then we will need to convert it to a std::vector here. The vector + // is constructed in-place on the sret memory block. Value arg0 = hostFuncEntryBlock->getArguments().front(); if (auto spanTy = dyn_cast(devFuncTy.getResult(0))) { auto eleTy = spanTy.getElementType(); auto ptrTy = cudaq::cc::PointerType::get(eleTy); auto gep0 = builder.create( - loc, cudaq::cc::PointerType::get(ptrTy), temp, - SmallVector{0, offset, 0}); + loc, cudaq::cc::PointerType::get(ptrTy), launchResult, + SmallVector{0}); auto dataPtr = builder.create(loc, gep0); auto lenPtrTy = cudaq::cc::PointerType::get(i64Ty); auto gep1 = builder.create( - loc, lenPtrTy, temp, - SmallVector{0, offset, 1}); + loc, lenPtrTy, launchResult, + SmallVector{1}); auto vecLen = builder.create(loc, gep1); if (spanTy.getElementType() == builder.getI1Type()) { genStdvecBoolFromInitList(loc, builder, arg0, dataPtr, vecLen); @@ -1422,13 +1487,14 @@ class GenerateKernelExecution builder.create(loc, i64Ty, eleTy); genStdvecTFromInitList(loc, builder, arg0, dataPtr, tSize, vecLen); } + // free(nullptr) is defined to be a nop in the standard. + builder.create(loc, std::nullopt, "free", + ArrayRef{launchResultToFree}); } else { // Otherwise, we can just copy the aggregate into the sret memory // block. 
Uses the size of the host function's sret pointer element // type for the memcpy, so the device should return an (aggregate) // value of suitable size. - Type res0Ty = structTy.getMember(offset); - auto ptrResTy = cudaq::cc::PointerType::get(res0Ty); auto resPtr = builder.create( loc, ptrResTy, temp, ArrayRef{0, offset}); diff --git a/lib/Optimizer/Transforms/LiftArrayAlloc.cpp b/lib/Optimizer/Transforms/LiftArrayAlloc.cpp index 9328b78896..689be49998 100644 --- a/lib/Optimizer/Transforms/LiftArrayAlloc.cpp +++ b/lib/Optimizer/Transforms/LiftArrayAlloc.cpp @@ -72,19 +72,17 @@ LogicalResult genVectorOfConstantsFromAttributes(cudaq::IRBuilder irBuilder, return success(); } } - } else if (auto floatTy = dyn_cast(eleTy)) { - if (floatTy == irBuilder.getF64Type()) { - auto vals = readConstantValues(values, floatTy); - if (vals.size() == values.size()) { - irBuilder.genVectorOfConstants(loc, module, name, vals); - return success(); - } - } else if (floatTy == irBuilder.getF32Type()) { - auto vals = readConstantValues(values, floatTy); - if (vals.size() == values.size()) { - irBuilder.genVectorOfConstants(loc, module, name, vals); - return success(); - } + } else if (eleTy == irBuilder.getF64Type()) { + auto vals = readConstantValues(values, eleTy); + if (vals.size() == values.size()) { + irBuilder.genVectorOfConstants(loc, module, name, vals); + return success(); + } + } else if (eleTy == irBuilder.getF32Type()) { + auto vals = readConstantValues(values, eleTy); + if (vals.size() == values.size()) { + irBuilder.genVectorOfConstants(loc, module, name, vals); + return success(); } } return failure(); @@ -147,7 +145,9 @@ class AllocaPattern : public OpRewritePattern { rewriter.create(loc, arrTy, valuesAttr); } - SmallVector toErase; + assert(conArr && "must have created the constant array"); + LLVM_DEBUG(llvm::dbgs() << "constant array is:\n" << conArr << '\n'); + bool cannotEraseAlloc = false; // Rewalk all the uses of alloc, u, which must be cc.cast or cc.compute_ptr. // For each,u, remove a store and replace a load with a cc.extract_value. @@ -176,8 +176,9 @@ class AllocaPattern : public OpRewritePattern { continue; } if (isa(useuser)) - toErase.push_back(useuser); - isLive = true; + rewriter.eraseOp(useuser); + LLVM_DEBUG(llvm::dbgs() << "alloc is live\n"); + cannotEraseAlloc = isLive = true; } if (auto ist = dyn_cast(user)) { rewriter.setInsertionPointAfter(user); @@ -188,20 +189,20 @@ class AllocaPattern : public OpRewritePattern { continue; } if (!isLive) - toErase.push_back(user); - } - if (toGlobal) { - if (conGlobal) { - rewriter.setInsertionPointAfter(alloc); - rewriter.replaceOp(alloc, conGlobal); - } - } else { - toErase.push_back(alloc); + rewriter.eraseOp(user); } - for (auto *op : toErase) - rewriter.eraseOp(op); - + if (toGlobal && conGlobal) { + rewriter.setInsertionPointAfter(alloc); + rewriter.replaceOp(alloc, conGlobal); + return success(); + } + if (cannotEraseAlloc) { + rewriter.setInsertionPointAfter(alloc); + rewriter.create(loc, conArr, alloc); + return success(); + } + rewriter.eraseOp(alloc); return success(); } @@ -305,12 +306,16 @@ class AllocaPattern : public OpRewritePattern { } // Process casts that are used in quake.init_state. 
if (cast.getType() == ptrUnsizedArrTy) { - if (getWriteOp(cast, 0)) - LLVM_DEBUG( - llvm::dbgs() - << "unexpected use of array size removing cast in a store" - << *op << '\n'); - continue; + if (cast->hasOneUse()) { + auto &use = *cast->getUses().begin(); + Operation *u = use.getOwner(); + if (isa_and_present(u)) { + toGlobalUses.push_back(op); + toGlobal = true; + continue; + } + } + return false; } LLVM_DEBUG(llvm::dbgs() << "unexpected cast: " << *op << '\n'); toGlobalUses.push_back(op); diff --git a/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp b/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp index b91627de9f..5a197f97a6 100644 --- a/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp +++ b/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp @@ -254,7 +254,7 @@ pyAltLaunchKernelBase(const std::string &name, MlirModule module, if (!thunkPtr) throw std::runtime_error("cudaq::builder failed to get thunk function"); - auto thunk = reinterpret_cast(*thunkPtr); + auto thunk = reinterpret_cast(*thunkPtr); std::string properName = name; @@ -327,15 +327,21 @@ pyAltLaunchKernelBase(const std::string &name, MlirModule module, if (launch) { auto &platform = cudaq::get_platform(); + auto uReturnOffset = static_cast(returnOffset); if (platform.is_remote() || platform.is_emulated()) { auto *wrapper = new cudaq::ArgWrapper{mod, names, rawArgs}; - cudaq::altLaunchKernel(name.c_str(), thunk, - reinterpret_cast(wrapper), size, - (uint64_t)returnOffset); + auto dynamicResult = cudaq::altLaunchKernel( + name.c_str(), thunk, reinterpret_cast(wrapper), size, + uReturnOffset); + if (dynamicResult.data_buffer || dynamicResult.size) + throw std::runtime_error("not implemented: support dynamic results"); delete wrapper; - } else - cudaq::altLaunchKernel(name.c_str(), thunk, rawArgs, size, - (uint64_t)returnOffset); + } else { + auto dynamicResult = cudaq::altLaunchKernel(name.c_str(), thunk, rawArgs, + size, uReturnOffset); + if (dynamicResult.data_buffer || dynamicResult.size) + throw std::runtime_error("not implemented: support dynamic results"); + } } return std::make_tuple(rawArgs, size, returnOffset); diff --git a/python/runtime/utils/PyRemoteSimulatorQPU.cpp b/python/runtime/utils/PyRemoteSimulatorQPU.cpp index 4cc998c363..f767bb652a 100644 --- a/python/runtime/utils/PyRemoteSimulatorQPU.cpp +++ b/python/runtime/utils/PyRemoteSimulatorQPU.cpp @@ -130,16 +130,19 @@ class PyRemoteSimulatorQPU : public cudaq::BaseRemoteSimulatorQPU { kernelArgs, gradient, H, optimizer, n_params, shots); } - void launchKernel(const std::string &name, void (*kernelFunc)(void *), - void *args, std::uint64_t voidStarSize, - std::uint64_t resultOffset, - const std::vector &rawArgs) override { + cudaq::KernelThunkResultType + launchKernel(const std::string &name, cudaq::KernelThunkType kernelFunc, + void *args, std::uint64_t voidStarSize, + std::uint64_t resultOffset, + const std::vector &rawArgs) override { cudaq::info("PyRemoteSimulatorQPU: Launch kernel named '{}' remote QPU {} " "(simulator = {})", name, qpu_id, m_simName); ::launchKernelImpl(getExecutionContextForMyThread(), m_client, m_simName, - name, kernelFunc, args, voidStarSize, resultOffset, - rawArgs); + name, make_degenerate_kernel_type(kernelFunc), args, + voidStarSize, resultOffset, rawArgs); + // TODO: Python should probably support return values too. 
+ return {}; } void launchKernel(const std::string &name, @@ -178,16 +181,19 @@ class PyNvcfSimulatorQPU : public cudaq::BaseNvcfSimulatorQPU { kernelArgs, gradient, H, optimizer, n_params, shots); } - void launchKernel(const std::string &name, void (*kernelFunc)(void *), - void *args, std::uint64_t voidStarSize, - std::uint64_t resultOffset, - const std::vector &rawArgs) override { + cudaq::KernelThunkResultType + launchKernel(const std::string &name, cudaq::KernelThunkType kernelFunc, + void *args, std::uint64_t voidStarSize, + std::uint64_t resultOffset, + const std::vector &rawArgs) override { cudaq::info("PyNvcfSimulatorQPU: Launch kernel named '{}' remote QPU {} " "(simulator = {})", name, qpu_id, m_simName); ::launchKernelImpl(getExecutionContextForMyThread(), m_client, m_simName, - name, kernelFunc, args, voidStarSize, resultOffset, - rawArgs); + name, make_degenerate_kernel_type(kernelFunc), args, + voidStarSize, resultOffset, rawArgs); + // TODO: Python should probably support return values too. + return {}; } void launchKernel(const std::string &name, diff --git a/runtime/common/BaseRemoteRESTQPU.h b/runtime/common/BaseRemoteRESTQPU.h index 61c26dc791..c65e94c3b6 100644 --- a/runtime/common/BaseRemoteRESTQPU.h +++ b/runtime/common/BaseRemoteRESTQPU.h @@ -578,10 +578,11 @@ class BaseRemoteRESTQPU : public cudaq::QPU { /// the representation required by the targeted backend. Handle all pertinent /// modifications for the execution context as well as asynchronous or /// synchronous invocation. - void launchKernel(const std::string &kernelName, void (*kernelFunc)(void *), - void *args, std::uint64_t voidStarSize, - std::uint64_t resultOffset, - const std::vector &rawArgs) override { + KernelThunkResultType + launchKernel(const std::string &kernelName, KernelThunkType kernelFunc, + void *args, std::uint64_t voidStarSize, + std::uint64_t resultOffset, + const std::vector &rawArgs) override { cudaq::info("launching remote rest kernel ({})", kernelName); // TODO future iterations of this should support non-void return types. @@ -597,6 +598,9 @@ class BaseRemoteRESTQPU : public cudaq::QPU { auto codes = rawArgs.empty() ? lowerQuakeCode(kernelName, args) : lowerQuakeCode(kernelName, rawArgs); completeLaunchKernel(kernelName, std::move(codes)); + + // NB: Kernel should/will never return dynamic results. + return {}; } void completeLaunchKernel(const std::string &kernelName, diff --git a/runtime/common/BaseRemoteSimulatorQPU.h b/runtime/common/BaseRemoteSimulatorQPU.h index 667fba5941..6260f334c1 100644 --- a/runtime/common/BaseRemoteSimulatorQPU.h +++ b/runtime/common/BaseRemoteSimulatorQPU.h @@ -107,22 +107,24 @@ class BaseRemoteSimulatorQPU : public cudaq::QPU { void launchKernel(const std::string &name, const std::vector &rawArgs) override { - launchKernelImpl(name, nullptr, nullptr, 0, 0, &rawArgs); + [[maybe_unused]] auto dynamicResult = + launchKernelImpl(name, nullptr, nullptr, 0, 0, &rawArgs); } - void launchKernel(const std::string &name, void (*kernelFunc)(void *), - void *args, std::uint64_t voidStarSize, - std::uint64_t resultOffset, - const std::vector &rawArgs) override { + KernelThunkResultType + launchKernel(const std::string &name, KernelThunkType kernelFunc, void *args, + std::uint64_t voidStarSize, std::uint64_t resultOffset, + const std::vector &rawArgs) override { // Remote simulation cannot deal with rawArgs. Drop them on the floor. 
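    // Note the remote client still takes the legacy `void (*)(void *)` entry
    // point, so the thunk is adapted via make_degenerate_kernel_type (see
    // common/ThunkInterface.h). Roughly (a sketch):
    //
    //   cudaq::KernelDegenerateType legacy =
    //       cudaq::make_degenerate_kernel_type(kernelFunc);
    //   // `legacy` ignores the thunk's bool parameter and drops any returned
    //   // span; that is only safe because this path never uses either.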
- launchKernelImpl(name, kernelFunc, args, voidStarSize, resultOffset, - nullptr); + return launchKernelImpl(name, kernelFunc, args, voidStarSize, resultOffset, + nullptr); } - void launchKernelImpl(const std::string &name, void (*kernelFunc)(void *), - void *args, std::uint64_t voidStarSize, - std::uint64_t resultOffset, - const std::vector *rawArgs) { + [[nodiscard]] KernelThunkResultType + launchKernelImpl(const std::string &name, KernelThunkType kernelFunc, + void *args, std::uint64_t voidStarSize, + std::uint64_t resultOffset, + const std::vector *rawArgs) { cudaq::info( "BaseRemoteSimulatorQPU: Launch kernel named '{}' remote QPU {} " "(simulator = {})", @@ -132,7 +134,7 @@ class BaseRemoteSimulatorQPU : public cudaq::QPU { getExecutionContextForMyThread(); if (executionContextPtr && executionContextPtr->name == "tracer") { - return; + return {}; } // Default context for a 'fire-and-ignore' kernel launch; i.e., no context @@ -155,7 +157,8 @@ class BaseRemoteSimulatorQPU : public cudaq::QPU { const bool requestOkay = m_client->sendRequest( *m_mlirContext, executionContext, /*serializedCodeContext=*/nullptr, /*vqe_gradient=*/nullptr, /*vqe_optimizer=*/nullptr, /*vqe_n_params=*/0, - m_simName, name, kernelFunc, args, voidStarSize, &errorMsg, rawArgs); + m_simName, name, make_degenerate_kernel_type(kernelFunc), args, + voidStarSize, &errorMsg, rawArgs); if (!requestOkay) throw std::runtime_error("Failed to launch kernel. Error: " + errorMsg); if (isDirectInvocation && @@ -182,6 +185,9 @@ class BaseRemoteSimulatorQPU : public cudaq::QPU { executionContext.invocationResultBuffer.size()); executionContext.invocationResultBuffer.clear(); } + + // Assumes kernel has no dynamic results. (Static result handled above.) + return {}; } void diff --git a/runtime/common/KernelWrapper.h b/runtime/common/KernelWrapper.h index 6c30efa58d..2e82522d91 100644 --- a/runtime/common/KernelWrapper.h +++ b/runtime/common/KernelWrapper.h @@ -537,17 +537,19 @@ std::invoke_result_t invokeKernel(QuantumKernel &&fn, // For raw function pointers, i.e., kernels described as free functions, we // send on the function pointer to the platform to retrieve the symbol name // since the typeid of a function only contains signature info. - if constexpr (std::is_class_v>) + if constexpr (std::is_class_v>) { // FIXME: this shouldn't use the serialization code any longer. It should // build a vector of void* and pass that instead. cudaq::get_platform().launchKernel(cudaq::getKernelName(fn), nullptr, (void *)serializedArgsBuffer.data(), serializedArgsBuffer.size(), 0, {}); - else + } else { cudaq::get_platform().launchKernel( - cudaq::getKernelName(fn), reinterpret_cast(&fn), + cudaq::getKernelName(fn), + reinterpret_cast(&fn), (void *)serializedArgsBuffer.data(), serializedArgsBuffer.size(), 0, {}); + } } #else return fn(std::forward(args)...); diff --git a/runtime/common/ThunkInterface.h b/runtime/common/ThunkInterface.h new file mode 100644 index 0000000000..05aeec37a3 --- /dev/null +++ b/runtime/common/ThunkInterface.h @@ -0,0 +1,44 @@ +/****************************************************************-*- C++ -*-**** + * Copyright (c) 2022 - 2024 NVIDIA Corporation & Affiliates. * + * All rights reserved. * + * * + * This source code and the accompanying materials are made available under * + * the terms of the Apache License 2.0 which accompanies this distribution. 
* + ******************************************************************************/ + +#pragma once + +#include + +namespace cudaq { + +/// A kernel may return results dynamically if the size of the result is not a +/// constant at compile-time. +struct KernelThunkResultType { + void *data_buffer; ///< Pointer to the first element of an array. + std::uint64_t size; ///< The size of the buffer in bytes. +}; + +/// The universal signature of a kernel thunk. +using KernelThunkType = KernelThunkResultType (*)(void *, bool); + +/// The degenerate form of a kernel call. In some launch cases, it may be +/// predetermined that the kernel can be called without a thunk. +using KernelDegenerateType = void (*)(void *); + +/// In some cases, the launcher will bypass the thunk function and call a +/// degenerate stub. That means that the extra `bool` argument will be ignored +/// by the called kernel and the kernel will not return a dynamic result. +/// +/// This is a terrible idea, generally speaking. However, if the launcher +/// neither looks for nor attempts to use the second `bool` argument at all, and +/// the launcher will drop any results returned from the kernel (regardless of +/// type) on the floor anyway, then one may be able to get away with using a +/// degenerate kernel type. +inline KernelDegenerateType +make_degenerate_kernel_type(KernelThunkType func_type) { + return reinterpret_cast( + reinterpret_cast(func_type)); +} + +} // namespace cudaq diff --git a/runtime/cudaq/platform/default/DefaultQuantumPlatform.cpp b/runtime/cudaq/platform/default/DefaultQuantumPlatform.cpp index 1b8d0b1141..df8a89e6f4 100644 --- a/runtime/cudaq/platform/default/DefaultQuantumPlatform.cpp +++ b/runtime/cudaq/platform/default/DefaultQuantumPlatform.cpp @@ -33,11 +33,12 @@ class DefaultQPU : public cudaq::QPU { execution_queue->enqueue(task); } - void launchKernel(const std::string &name, void (*kernelFunc)(void *), - void *args, std::uint64_t, std::uint64_t, - const std::vector &rawArgs) override { + cudaq::KernelThunkResultType + launchKernel(const std::string &name, cudaq::KernelThunkType kernelFunc, + void *args, std::uint64_t argsSize, std::uint64_t resultOffset, + const std::vector &rawArgs) override { ScopedTraceWithContext(cudaq::TIMING_LAUNCH, "QPU::launchKernel"); - kernelFunc(args); + return kernelFunc(args, /*isRemote=*/false); } /// Overrides setExecutionContext to forward it to the ExecutionManager diff --git a/runtime/cudaq/platform/mqpu/custatevec/GPUEmulatedQPU.cpp b/runtime/cudaq/platform/mqpu/custatevec/GPUEmulatedQPU.cpp index 1243e9f480..38b26f2a98 100644 --- a/runtime/cudaq/platform/mqpu/custatevec/GPUEmulatedQPU.cpp +++ b/runtime/cudaq/platform/mqpu/custatevec/GPUEmulatedQPU.cpp @@ -37,12 +37,13 @@ class GPUEmulatedQPU : public cudaq::QPU { execution_queue->enqueue(task); } - void launchKernel(const std::string &name, void (*kernelFunc)(void *), - void *args, std::uint64_t, std::uint64_t, - const std::vector &rawArgs) override { + cudaq::KernelThunkResultType + launchKernel(const std::string &name, cudaq::KernelThunkType kernelFunc, + void *args, std::uint64_t, std::uint64_t, + const std::vector &rawArgs) override { cudaq::info("QPU::launchKernel GPU {}", qpu_id); cudaSetDevice(qpu_id); - kernelFunc(args); + return kernelFunc(args, /*differentMemorySpace=*/false); } /// Overrides setExecutionContext to forward it to the ExecutionManager diff --git a/runtime/cudaq/platform/orca/OrcaQPU.cpp b/runtime/cudaq/platform/orca/OrcaQPU.cpp index 63883a7af3..fdb053bf87 100644 --- 
a/runtime/cudaq/platform/orca/OrcaQPU.cpp +++ b/runtime/cudaq/platform/orca/OrcaQPU.cpp @@ -23,8 +23,8 @@ cudaq::sample_result runSampling(TBIParameters ¶meters, platform.set_exec_ctx(ctx.get(), qpu_id); platform.set_current_qpu(qpu_id); - cudaq::altLaunchKernel("orca_launch", nullptr, ¶meters, - sizeof(TBIParameters), 0); + [[maybe_unused]] auto dynamicResult = cudaq::altLaunchKernel( + "orca_launch", nullptr, ¶meters, sizeof(TBIParameters), 0); platform.reset_exec_ctx(qpu_id); return ctx->result; @@ -43,8 +43,8 @@ async_sample_result runAsyncSampling(TBIParameters ¶meters, platform.set_exec_ctx(ctx.get(), qpu_id); platform.set_current_qpu(qpu_id); - cudaq::altLaunchKernel("orca_launch", nullptr, ¶meters, - sizeof(TBIParameters), 0); + [[maybe_unused]] auto dynamicResult = cudaq::altLaunchKernel( + "orca_launch", nullptr, ¶meters, sizeof(TBIParameters), 0); // If we have a non-null future, set it futureResult = ctx->futureResult; diff --git a/runtime/cudaq/platform/orca/OrcaRemoteRESTQPU.cpp b/runtime/cudaq/platform/orca/OrcaRemoteRESTQPU.cpp index f834136fc4..1c63c92c2b 100644 --- a/runtime/cudaq/platform/orca/OrcaRemoteRESTQPU.cpp +++ b/runtime/cudaq/platform/orca/OrcaRemoteRESTQPU.cpp @@ -59,11 +59,10 @@ void OrcaRemoteRESTQPU::setTargetBackend(const std::string &backend) { } /// @brief Launch the experiment. -void OrcaRemoteRESTQPU::launchKernel(const std::string &kernelName, - void (*kernelFunc)(void *), void *args, - std::uint64_t voidStarSize, - std::uint64_t resultOffset, - const std::vector &rawArgs) { +KernelThunkResultType OrcaRemoteRESTQPU::launchKernel( + const std::string &kernelName, KernelThunkType kernelFunc, void *args, + std::uint64_t voidStarSize, std::uint64_t resultOffset, + const std::vector &rawArgs) { cudaq::info("OrcaRemoteRESTQPU: Launch kernel named '{}' remote QPU {}", kernelName, qpu_id); @@ -88,12 +87,15 @@ void OrcaRemoteRESTQPU::launchKernel(const std::string &kernelName, // Keep this asynchronous if requested if (ctx->asyncExec) { ctx->futureResult = future; - return; + return {}; } // Otherwise make this synchronous ctx->result = future.get(); + + // TODO: support dynamic result types. + return {}; } } // namespace cudaq -CUDAQ_REGISTER_TYPE(cudaq::QPU, cudaq::OrcaRemoteRESTQPU, orca) \ No newline at end of file +CUDAQ_REGISTER_TYPE(cudaq::QPU, cudaq::OrcaRemoteRESTQPU, orca) diff --git a/runtime/cudaq/platform/orca/OrcaRemoteRESTQPU.h b/runtime/cudaq/platform/orca/OrcaRemoteRESTQPU.h index 80d2df5726..d14a5f4e3c 100644 --- a/runtime/cudaq/platform/orca/OrcaRemoteRESTQPU.h +++ b/runtime/cudaq/platform/orca/OrcaRemoteRESTQPU.h @@ -120,10 +120,11 @@ class OrcaRemoteRESTQPU : public cudaq::QPU { /// @brief Launch the kernel. Handle all pertinent modifications for the /// execution context. 
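  ///
  /// For reference, a minimal override of this new interface has the shape
  /// of the following sketch (not this class's actual implementation, which
  /// lowers the kernel and submits it remotely):
  ///
  ///   cudaq::KernelThunkResultType
  ///   launchKernel(const std::string &name, cudaq::KernelThunkType thunk,
  ///                void *args, std::uint64_t argsSize,
  ///                std::uint64_t resultOffset,
  ///                const std::vector<void *> &rawArgs) override {
  ///     return thunk(args, /*isRemote=*/false); // direct, in-process call
  ///   }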
- void launchKernel(const std::string &kernelName, void (*kernelFunc)(void *), - void *args, std::uint64_t voidStarSize, - std::uint64_t resultOffset, - const std::vector &rawArgs) override; + KernelThunkResultType + launchKernel(const std::string &kernelName, KernelThunkType kernelFunc, + void *args, std::uint64_t voidStarSize, + std::uint64_t resultOffset, + const std::vector &rawArgs) override; void launchKernel(const std::string &kernelName, const std::vector &rawArgs) override { throw std::runtime_error("launch kernel on raw args not implemented"); diff --git a/runtime/cudaq/platform/qpu.h b/runtime/cudaq/platform/qpu.h index 13a6d7da25..d104094a1e 100644 --- a/runtime/cudaq/platform/qpu.h +++ b/runtime/cudaq/platform/qpu.h @@ -11,12 +11,12 @@ #include "QuantumExecutionQueue.h" #include "common/Logger.h" #include "common/Registry.h" +#include "common/ThunkInterface.h" #include "common/Timing.h" #include "cudaq/qis/execution_manager.h" #include "cudaq/qis/qubit_qis.h" #include "cudaq/remote_capabilities.h" #include "cudaq/utils/cudaq_utils.h" - #include namespace cudaq { @@ -172,9 +172,10 @@ class QPU : public registry::RegisteredType { /// Launch the kernel with given name (to extract its Quake representation). /// The raw function pointer is also provided, as are the runtime arguments, /// as a struct-packed void pointer and its corresponding size. - virtual void launchKernel(const std::string &name, void (*kernelFunc)(void *), - void *args, std::uint64_t, std::uint64_t, - const std::vector &rawArgs) = 0; + [[nodiscard]] virtual KernelThunkResultType + launchKernel(const std::string &name, KernelThunkType kernelFunc, void *args, + std::uint64_t, std::uint64_t, + const std::vector &rawArgs) = 0; /// Launch the kernel with given name and argument arrays. // This is intended for remote QPUs whereby we need to JIT-compile the kernel diff --git a/runtime/cudaq/platform/quantum_platform.cpp b/runtime/cudaq/platform/quantum_platform.cpp index 00e259c389..46f248c690 100644 --- a/runtime/cudaq/platform/quantum_platform.cpp +++ b/runtime/cudaq/platform/quantum_platform.cpp @@ -30,11 +30,11 @@ namespace cudaq { std::string get_quake(const std::string &); static quantum_platform *platform; -inline static constexpr std::string_view GetQuantumPlatformSymbol = +static constexpr std::string_view GetQuantumPlatformSymbol = "getQuantumPlatform"; void setQuantumPlatformInternal(quantum_platform *p) { - cudaq::info("external caller setting the platform."); + info("external caller setting the platform."); platform = p; } @@ -43,8 +43,8 @@ void setQuantumPlatformInternal(quantum_platform *p) { quantum_platform *getQuantumPlatformInternal() { if (platform) return platform; - platform = cudaq::getUniquePluginInstance( - GetQuantumPlatformSymbol); + platform = + getUniquePluginInstance(GetQuantumPlatformSymbol); return platform; } @@ -94,8 +94,7 @@ std::size_t quantum_platform::get_current_qpu() { return platformCurrentQPU; } // Specify the execution context for this platform. 
// This delegates to the targeted QPU -void quantum_platform::set_exec_ctx(cudaq::ExecutionContext *ctx, - std::size_t qid) { +void quantum_platform::set_exec_ctx(ExecutionContext *ctx, std::size_t qid) { executionContext = ctx; auto &platformQPU = platformQPUs[qid]; platformQPU->setExecutionContext(ctx); @@ -130,9 +129,8 @@ bool quantum_platform::supports_conditional_feedback( } void quantum_platform::launchVQE(const std::string kernelName, - const void *kernelArgs, - cudaq::gradient *gradient, cudaq::spin_op H, - cudaq::optimizer &optimizer, + const void *kernelArgs, gradient *gradient, + spin_op H, optimizer &optimizer, const int n_params, const std::size_t shots) { std::size_t qpu_id = 0; @@ -151,11 +149,10 @@ quantum_platform::get_remote_capabilities(const std::size_t qpu_id) const { return platformQPUs[qpu_id]->getRemoteCapabilities(); } -void quantum_platform::launchKernel(std::string kernelName, - void (*kernelFunc)(void *), void *args, - std::uint64_t voidStarSize, - std::uint64_t resultOffset, - const std::vector &rawArgs) { +KernelThunkResultType quantum_platform::launchKernel( + std::string kernelName, KernelThunkType kernelFunc, void *args, + std::uint64_t voidStarSize, std::uint64_t resultOffset, + const std::vector &rawArgs) { std::size_t qpu_id = 0; auto tid = std::hash{}(std::this_thread::get_id()); @@ -164,8 +161,8 @@ void quantum_platform::launchKernel(std::string kernelName, qpu_id = iter->second; auto &qpu = platformQPUs[qpu_id]; - qpu->launchKernel(kernelName, kernelFunc, args, voidStarSize, resultOffset, - rawArgs); + return qpu->launchKernel(kernelName, kernelFunc, args, voidStarSize, + resultOffset, rawArgs); } void quantum_platform::launchKernel(std::string kernelName, @@ -183,7 +180,7 @@ void quantum_platform::launchKernel(std::string kernelName, void quantum_platform::launchSerializedCodeExecution( const std::string &name, - cudaq::SerializedCodeExecutionContext &serializeCodeExecutionObject) { + SerializedCodeExecutionContext &serializeCodeExecutionObject) { std::size_t qpu_id = 0; auto tid = std::hash{}(std::this_thread::get_id()); @@ -208,37 +205,46 @@ std::ostream *quantum_platform::getLogStream() { return platformLogStream; } void quantum_platform::setLogStream(std::ostream &logStream) { platformLogStream = &logStream; } -} // namespace cudaq -void cudaq::altLaunchKernel(const char *kernelName, void (*kernelFunc)(void *), - void *kernelArgs, std::uint64_t argsSize, - std::uint64_t resultOffset) { +KernelThunkResultType altLaunchKernel(const char *kernelName, + KernelThunkType kernelFunc, + void *kernelArgs, std::uint64_t argsSize, + std::uint64_t resultOffset) { ScopedTraceWithContext("altLaunchKernel", kernelName, argsSize); - auto &platform = *cudaq::getQuantumPlatformInternal(); + auto &platform = *getQuantumPlatformInternal(); std::string kernName = kernelName; - platform.launchKernel(kernName, kernelFunc, kernelArgs, argsSize, - resultOffset, {}); + return platform.launchKernel(kernName, kernelFunc, kernelArgs, argsSize, + resultOffset, {}); } -void cudaq::streamlinedLaunchKernel(const char *kernelName, - const std::vector &rawArgs) { +KernelThunkResultType +streamlinedLaunchKernel(const char *kernelName, + const std::vector &rawArgs) { std::size_t argsSize = rawArgs.size(); ScopedTraceWithContext("streamlinedLaunchKernel", kernelName, argsSize); - auto &platform = *cudaq::getQuantumPlatformInternal(); + auto &platform = *getQuantumPlatformInternal(); std::string kernName = kernelName; platform.launchKernel(kernName, rawArgs); + // NB: The streamlined 
launch will never return results. Use alt or hybrid if + // the kernel returns results. + return {}; } -void cudaq::hybridLaunchKernel(const char *kernelName, void (*kernel)(void *), - void *args, std::uint64_t argsSize, - std::uint64_t resultOffset, - const std::vector &rawArgs) { +KernelThunkResultType hybridLaunchKernel(const char *kernelName, + KernelThunkType kernel, void *args, + std::uint64_t argsSize, + std::uint64_t resultOffset, + const std::vector &rawArgs) { ScopedTraceWithContext("hybridLaunchKernel", kernelName); - auto &platform = *cudaq::getQuantumPlatformInternal(); + auto &platform = *getQuantumPlatformInternal(); const std::string kernName = kernelName; - if (platform.is_remote(platform.get_current_qpu())) + if (platform.is_remote(platform.get_current_qpu())) { + // This path should never call a kernel that returns results. platform.launchKernel(kernName, rawArgs); - else - platform.launchKernel(kernName, kernel, args, argsSize, resultOffset, - rawArgs); + return {}; + } + return platform.launchKernel(kernName, kernel, args, argsSize, resultOffset, + rawArgs); } + +} // namespace cudaq diff --git a/runtime/cudaq/platform/quantum_platform.h b/runtime/cudaq/platform/quantum_platform.h index e9598bf051..e16071890a 100644 --- a/runtime/cudaq/platform/quantum_platform.h +++ b/runtime/cudaq/platform/quantum_platform.h @@ -11,6 +11,7 @@ #include "common/ExecutionContext.h" #include "common/NoiseModel.h" #include "common/ObserveResult.h" +#include "common/ThunkInterface.h" #include "cudaq/remote_capabilities.h" #include "cudaq/utils/cudaq_utils.h" #include @@ -142,10 +143,10 @@ class quantum_platform { // This method is the hook for the kernel rewrites to invoke // quantum kernels. - void launchKernel(std::string kernelName, void (*kernelFunc)(void *), - void *args, std::uint64_t voidStarSize, - std::uint64_t resultOffset, - const std::vector &rawArgs); + [[nodiscard]] KernelThunkResultType + launchKernel(std::string kernelName, KernelThunkType kernelFunc, void *args, + std::uint64_t voidStarSize, std::uint64_t resultOffset, + const std::vector &rawArgs); void launchKernel(std::string kernelName, const std::vector &); // This method is the hook for executing SerializedCodeExecutionContext @@ -215,19 +216,21 @@ class quantum_platform { /// provide that information. extern "C" { // Client-server (legacy) interface. -void altLaunchKernel(const char *kernelName, void (*kernel)(void *), void *args, - std::uint64_t argsSize, std::uint64_t resultOffset); +[[nodiscard]] KernelThunkResultType +altLaunchKernel(const char *kernelName, KernelThunkType kernel, void *args, + std::uint64_t argsSize, std::uint64_t resultOffset); // Streamlined interface for launching kernels. Argument synthesis and JIT // compilation *must* happen on the local machine. -void streamlinedLaunchKernel(const char *kernelName, - const std::vector &rawArgs); +[[nodiscard]] KernelThunkResultType +streamlinedLaunchKernel(const char *kernelName, + const std::vector &rawArgs); // Hybrid of the client-server and streamlined approaches. Letting JIT // compilation happen either early or late and can handle return values from // each kernel launch. 
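+// Callers of these entry points now own any dynamic result buffer, e.g.
+// (a sketch; how the payload is decoded depends on the kernel's return
+// type):
+//
+//   cudaq::KernelThunkResultType res = hybridLaunchKernel(
+//       kernelName, thunk, argsBuf, argsSize, resultOffset, rawArgs);
+//   if (res.data_buffer) {
+//     // ... copy the dynamic payload out of res.data_buffer ...
+//     free(res.data_buffer);
+//   }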
-void hybridLaunchKernel(const char *kernelName, void (*kernel)(void *), - void *args, std::uint64_t argsSize, - std::uint64_t resultOffset, - const std::vector &rawArgs); +[[nodiscard]] KernelThunkResultType +hybridLaunchKernel(const char *kernelName, KernelThunkType kernel, void *args, + std::uint64_t argsSize, std::uint64_t resultOffset, + const std::vector &rawArgs); } } // namespace cudaq diff --git a/runtime/cudaq/qis/remote_state.cpp b/runtime/cudaq/qis/remote_state.cpp index 713a462e46..97f907027a 100644 --- a/runtime/cudaq/qis/remote_state.cpp +++ b/runtime/cudaq/qis/remote_state.cpp @@ -184,7 +184,8 @@ RemoteSimulationState::overlap(const cudaq::SimulationState &other) { std::make_pair(static_cast(this), static_cast(&otherState)); platform.set_exec_ctx(&context); - platform.launchKernel(kernelName, nullptr, nullptr, 0, 0, {}); + [[maybe_unused]] auto dynamicResult = + platform.launchKernel(kernelName, nullptr, nullptr, 0, 0, {}); platform.reset_exec_ctx(); assert(context.overlapResult.has_value()); return context.overlapResult.value(); diff --git a/targettests/execution/vector_result.cpp b/targettests/execution/vector_result.cpp new file mode 100644 index 0000000000..ecc09fce5c --- /dev/null +++ b/targettests/execution/vector_result.cpp @@ -0,0 +1,52 @@ +/******************************************************************************* + * Copyright (c) 2022 - 2024 NVIDIA Corporation & Affiliates. * + * All rights reserved. * + * * + * This source code and the accompanying materials are made available under * + * the terms of the Apache License 2.0 which accompanies this distribution. * + ******************************************************************************/ + +// RUN: nvq++ %cpp_std --enable-mlir %s -o %t && %t + +#include "cudaq.h" +#include + +struct VectorBoolResult { + std::vector operator()() __qpu__ { + std::vector result(3); + result[0] = true; + result[1] = false; + result[2] = true; + return result; + } +}; + +struct VectorIntResult { + std::vector operator()() __qpu__ { + std::vector result(2); + result[0] = 42; + result[1] = -23479; + return result; + } +}; + +struct VectorDoubleResult { + std::vector operator()() __qpu__ { + std::vector result(2); + result[0] = 543.0; + result[1] = -234234.0; + return result; + } +}; + +int main() { + auto retb{VectorBoolResult{}()}; + printf("%d %d %d\n", static_cast(retb[0]), static_cast(retb[1]), + static_cast(retb[2])); + auto ret = VectorIntResult{}(); + printf("%d %d\n", ret[0], ret[1]); + std::vector retd{VectorDoubleResult{}()}; + printf("%f %f\n", retd[0], retd[1]); + return !(retb[0] && !retb[1] && retb[2] && ret[0] == 42 && ret[1] == -23479 && + retd[0] == 543.0 && retd[1] == -234234.0); +} diff --git a/test/Quake-QIR/argument.qke b/test/Quake-QIR/argument.qke index 61d737d5ce..6a3532805a 100644 --- a/test/Quake-QIR/argument.qke +++ b/test/Quake-QIR/argument.qke @@ -55,7 +55,7 @@ func.func @test_0(%0: !cc.ptr, %1: !cc.ptr, %1: !cc.ptr, %1: !cc.ptr, %1: !cc.ptr, !cc.ptr, !cc.ptr} // CHECK: %[[VAL_4:.*]] = bitcast { i32, { i1*, i64 } }* %[[VAL_3]] to i8* // CHECK: %[[VAL_5:.*]] = getelementptr inbounds { i32, { i1*, i64 } }, { i32, { i1*, i64 } }* %[[VAL_3]], i64 0, i32 0 // CHECK: store i32 %[[VAL_2]], i32* %[[VAL_5]], align 8 -// CHECK: call void @altLaunchKernel(i8* nonnull getelementptr inbounds ([7 x i8], [7 x i8]* @test_0.kernelName, i64 0, i64 0), i8* nonnull bitcast ({ i8*, i64 } (i8*, i1)* @test_0.thunk to i8*), i8* nonnull %[[VAL_4]], i64 24, i64 8) -// CHECK: %[[VAL_6:.*]] = getelementptr inbounds { i32, { 
i1*, i64 } }, { i32, { i1*, i64 } }* %[[VAL_3]], i64 0, i32 1, i32 0 -// CHECK: %[[VAL_7:.*]] = bitcast i1** %[[VAL_6]] to i8** -// CHECK: %[[VAL_8:.*]] = load i8*, i8** %[[VAL_7]], align 8 -// CHECK: %[[VAL_9:.*]] = getelementptr inbounds { i32, { i1*, i64 } }, { i32, { i1*, i64 } }* %[[VAL_3]], i64 0, i32 1, i32 1 -// CHECK: %[[VAL_10:.*]] = load i64, i64* %[[VAL_9]], align 8 -// CHECK: %[[VAL_11:.*]] = bitcast { i8*, i8*, i8* }* %[[VAL_0]] to i8* -// CHECK: call void @__nvqpp_initializer_list_to_vector_bool(i8* %[[VAL_11]], i8* %[[VAL_8]], i64 %[[VAL_10]]) +// CHECK: %[[VAL_6:.*]] = call { i8*, i64 } @altLaunchKernel(i8* nonnull getelementptr inbounds ([7 x i8], [7 x i8]* @test_0.kernelName, i64 0, i64 0), i8* nonnull bitcast ({ i8*, i64 } (i8*, i1)* @test_0.thunk to i8*), i8* nonnull %[[VAL_4]], i64 24, i64 8) +// CHECK: %[[VAL_7:.*]] = extractvalue { i8*, i64 } %[[VAL_6]], 0 +// CHECK: %[[VAL_8:.*]] = icmp eq i8* %[[VAL_7]], null +// CHECK: %[[VAL_9:.*]] = getelementptr i8, i8* %[[VAL_7]], i64 8 +// CHECK: %[[VAL_10:.*]] = bitcast i8* %[[VAL_9]] to { i1*, i64 }* +// CHECK: %[[VAL_11:.*]] = getelementptr inbounds { i32, { i1*, i64 } }, { i32, { i1*, i64 } }* %[[VAL_3]], i64 0, i32 1 +// CHECK: %[[VAL_12:.*]] = select i1 %[[VAL_8]], { i1*, i64 }* %[[VAL_11]], { i1*, i64 }* %[[VAL_10]] +// CHECK: %[[VAL_13:.*]] = bitcast { i1*, i64 }* %[[VAL_12]] to i8** +// CHECK: %[[VAL_14:.*]] = load i8*, i8** %[[VAL_13]], align 8 +// CHECK: %[[VAL_15:.*]] = getelementptr inbounds { i32, { i1*, i64 } }, { i32, { i1*, i64 } }* %[[VAL_3]], i64 0, i32 1, i32 1 +// CHECK: %[[VAL_16:.*]] = getelementptr i8, i8* %[[VAL_7]], i64 16 +// CHECK: %[[VAL_17:.*]] = bitcast i8* %[[VAL_16]] to i64* +// CHECK: %[[VAL_18:.*]] = select i1 %[[VAL_8]], i64* %[[VAL_15]], i64* %[[VAL_17]] +// CHECK: %[[VAL_19:.*]] = load i64, i64* %[[VAL_18]], align 4 +// CHECK: %[[VAL_20:.*]] = bitcast { i8*, i8*, i8* }* %[[VAL_0]] to i8* +// CHECK: call void @__nvqpp_initializer_list_to_vector_bool(i8* %[[VAL_20]], i8* %[[VAL_14]], i64 %[[VAL_19]]) +// CHECK: call void @free(i8* %[[VAL_7]]) // CHECK: ret void // CHECK: } @@ -169,7 +178,7 @@ func.func @test_1(%this: !cc.ptr) -> i16 { // CHECK-SAME: %[[VAL_0:.*]]) {{.*}}{ // CHECK-NEXT: %[[VAL_2:.*]] = alloca i16, align 8 // CHECK: %[[VAL_3:.*]] = bitcast i16* %[[VAL_2]] to i8* -// CHECK: call void @altLaunchKernel(i8* nonnull getelementptr inbounds ([7 x i8], [7 x i8]* @test_1.kernelName, i64 0, i64 0), i8* nonnull bitcast ({ i8*, i64 } (i8*, i1)* @test_1.thunk to i8*), i8* nonnull %[[VAL_3]], i64 2, i64 0) +// CHECK: call { i8*, i64 } @altLaunchKernel(i8* nonnull getelementptr inbounds ([7 x i8], [7 x i8]* @test_1.kernelName, i64 0, i64 0), i8* nonnull bitcast ({ i8*, i64 } (i8*, i1)* @test_1.thunk to i8*), i8* nonnull %[[VAL_3]], i64 2, i64 0) // CHECK: %[[VAL_4:.*]] = load i16, i16* %[[VAL_2]], align 8 // CHECK: ret i16 %[[VAL_4]] // CHECK: } @@ -200,7 +209,7 @@ func.func @test_2(%1: !cc.ptr> {llvm.sret = !cc // CHECK-SAME: %[[VAL_0:.*]], i8* nocapture readnone %[[VAL_1:.*]]) {{.*}}{ // CHECK: %[[VAL_2:.*]] = alloca { { i16, float, double, i64 } }, align 8 // CHECK: %[[VAL_3:.*]] = bitcast { { i16, float, double, i64 } }* %[[VAL_2]] to i8* -// CHECK: call void @altLaunchKernel(i8* nonnull getelementptr inbounds ([7 x i8], [7 x i8]* @test_2.kernelName, i64 0, i64 0), i8* nonnull bitcast ({ i8*, i64 } (i8*, i1)* @test_2.thunk to i8*), i8* nonnull %[[VAL_3]], i64 24, i64 0) +// CHECK: call { i8*, i64 } @altLaunchKernel(i8* nonnull getelementptr inbounds ([7 x i8], [7 x i8]* 
@test_2.kernelName, i64 0, i64 0), i8* nonnull bitcast ({ i8*, i64 } (i8*, i1)* @test_2.thunk to i8*), i8* nonnull %[[VAL_3]], i64 24, i64 0) // CHECK: %[[VAL_4:.*]] = bitcast { i16, float, double, i64 }* %[[VAL_0]] to i8* // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* noundef nonnull align 8 dereferenceable(24) %[[VAL_4]], i8* noundef nonnull align 8 dereferenceable(24) %[[VAL_3]], i64 24, i1 false) // CHECK: ret void @@ -234,7 +243,7 @@ func.func @test_3(%1: !cc.ptr> {llvm.sret = !cc.array> {llvm.sret = !cc.struct // CHECK-SAME: %[[VAL_0:.*]], i8* nocapture readnone %[[VAL_1:.*]]) {{.*}}{ // CHECK: %[[VAL_2:.*]] = alloca { i64, double }, align 8 // CHECK: %[[VAL_3:.*]] = bitcast { i64, double }* %[[VAL_2]] to i8* -// CHECK: call void @altLaunchKernel(i8* nonnull getelementptr inbounds ([7 x i8], [7 x i8]* @test_4.kernelName, i64 0, i64 0), i8* nonnull bitcast ({ i8*, i64 } (i8*, i1)* @test_4.thunk to i8*), i8* nonnull %[[VAL_3]], i64 16, i64 0) +// CHECK: call { i8*, i64 } @altLaunchKernel(i8* nonnull getelementptr inbounds ([7 x i8], [7 x i8]* @test_4.kernelName, i64 0, i64 0), i8* nonnull bitcast ({ i8*, i64 } (i8*, i1)* @test_4.thunk to i8*), i8* nonnull %[[VAL_3]], i64 16, i64 0) // CHECK: %[[VAL_4:.*]] = bitcast { i64, double }* %[[VAL_0]] to i8* // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* noundef nonnull align 8 dereferenceable(16) %[[VAL_4]], i8* noundef nonnull align 8 dereferenceable(16) %[[VAL_3]], i64 16, i1 false) // CHECK: ret void @@ -283,7 +292,7 @@ func.func @test_5(%sret: !cc.ptr> {llvm.sret = !cc.struct // CHECK-SAME: %[[VAL_0:.*]]) {{.*}}{ // CHECK: %[[VAL_1:.*]] = alloca { i64, double }, align 8 // CHECK: %[[VAL_2:.*]] = bitcast { i64, double }* %[[VAL_1]] to i8* -// CHECK: call void @altLaunchKernel(i8* nonnull getelementptr inbounds ([7 x i8], [7 x i8]* @test_5.kernelName, i64 0, i64 0), i8* nonnull bitcast ({ i8*, i64 } (i8*, i1)* @test_5.thunk to i8*), i8* nonnull %[[VAL_2]], i64 16, i64 0) +// CHECK: call { i8*, i64 } @altLaunchKernel(i8* nonnull getelementptr inbounds ([7 x i8], [7 x i8]* @test_5.kernelName, i64 0, i64 0), i8* nonnull bitcast ({ i8*, i64 } (i8*, i1)* @test_5.thunk to i8*), i8* nonnull %[[VAL_2]], i64 16, i64 0) // CHECK: %[[VAL_3:.*]] = bitcast { i64, double }* %[[VAL_0]] to i8* // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* noundef nonnull align 8 dereferenceable(16) %[[VAL_3]], i8* noundef nonnull align 8 dereferenceable(16) %[[VAL_2]], i64 16, i1 false) // CHECK: ret void diff --git a/test/Quake/kernel_exec-1.qke b/test/Quake/kernel_exec-1.qke index 751ba66a10..37ac7c7229 100644 --- a/test/Quake/kernel_exec-1.qke +++ b/test/Quake/kernel_exec-1.qke @@ -90,25 +90,38 @@ module attributes {quake.mangled_name_map = { // CHECK-DAG: %[[VAL_2:.*]] = cc.undef !cc.struct<{i32, f64}> // CHECK-DAG: %[[VAL_3:.*]] = arith.constant 0 : i64 // CHECK: %[[VAL_4:.*]] = cc.insert_value %[[VAL_1]], %[[VAL_2]][0] : (!cc.struct<{i32, f64}>, i32) -> !cc.struct<{i32, f64}> -// CHECK: %[[VAL_7:.*]] = cc.sizeof !cc.struct<{i32, f64}> : i64 -// CHECK: %[[VAL_8:.*]] = arith.addi %[[VAL_7]], %[[VAL_3]] : i64 -// CHECK: %[[VAL_9:.*]] = cc.alloca i8[%[[VAL_8]] : i64] -// CHECK: %[[VAL_10:.*]] = cc.cast %[[VAL_9]] : (!cc.ptr>) -> !cc.ptr> -// CHECK: cc.store %[[VAL_4]], %[[VAL_10]] : !cc.ptr> -// CHECK: %[[VAL_11:.*]] = cc.cast %[[VAL_9]] : (!cc.ptr>) -> !cc.ptr x ?>> -// CHECK: %[[VAL_13:.*]] = constant @ghz.thunk : (!cc.ptr, i1) -> !cc.struct<{!cc.ptr, i64}> -// CHECK: %[[VAL_15:.*]] = cc.func_ptr %[[VAL_13]] : ((!cc.ptr, i1) -> !cc.struct<{!cc.ptr, i64}>) 
-> !cc.ptr -// CHECK: %[[VAL_16:.*]] = cc.cast %[[VAL_11]] : (!cc.ptr x ?>>) -> !cc.ptr -// CHECK: %[[VAL_18:.*]] = cc.offsetof !cc.struct<{i32, f64}> [1] : i64 -// CHECK: %[[VAL_12:.*]] = llvm.mlir.addressof @ghz.kernelName : !llvm.ptr> -// CHECK: %[[VAL_14:.*]] = cc.cast %[[VAL_12]] : (!llvm.ptr>) -> !cc.ptr -// CHECK: call @altLaunchKernel(%[[VAL_14]], %[[VAL_15]], %[[VAL_16]], %[[VAL_8]], %[[VAL_18]]) : (!cc.ptr, !cc.ptr, !cc.ptr, i64, i64) -> () -// CHECK: %[[VAL_19:.*]] = cc.compute_ptr %[[VAL_11]][0, 1] : (!cc.ptr x ?>>) -> !cc.ptr -// CHECK: %[[VAL_20:.*]] = cc.load %[[VAL_19]] : !cc.ptr -// CHECK: return %[[VAL_20]] : f64 +// CHECK: %[[VAL_5:.*]] = cc.sizeof !cc.struct<{i32, f64}> : i64 +// CHECK: %[[VAL_6:.*]] = arith.addi %[[VAL_5]], %[[VAL_3]] : i64 +// CHECK: %[[VAL_7:.*]] = cc.alloca i8{{\[}}%[[VAL_6]] : i64] +// CHECK: %[[VAL_8:.*]] = cc.cast %[[VAL_7]] : (!cc.ptr>) -> !cc.ptr> +// CHECK: cc.store %[[VAL_4]], %[[VAL_8]] : !cc.ptr> +// CHECK: %[[VAL_9:.*]] = cc.cast %[[VAL_7]] : (!cc.ptr>) -> !cc.ptr x ?>> +// CHECK: %[[VAL_10:.*]] = constant @ghz.thunk : (!cc.ptr, i1) -> !cc.struct<{!cc.ptr, i64}> +// CHECK: %[[VAL_11:.*]] = cc.func_ptr %[[VAL_10]] : ((!cc.ptr, i1) -> !cc.struct<{!cc.ptr, i64}>) -> !cc.ptr +// CHECK: %[[VAL_12:.*]] = cc.cast %[[VAL_9]] : (!cc.ptr x ?>>) -> !cc.ptr +// CHECK: %[[VAL_13:.*]] = cc.offsetof !cc.struct<{i32, f64}> [1] : i64 +// CHECK: %[[VAL_14:.*]] = llvm.mlir.addressof @ghz.kernelName : !llvm.ptr> +// CHECK: %[[VAL_15:.*]] = cc.cast %[[VAL_14]] : (!llvm.ptr>) -> !cc.ptr +// CHECK: %[[VAL_16:.*]] = call @altLaunchKernel(%[[VAL_15]], %[[VAL_11]], %[[VAL_12]], %[[VAL_6]], %[[VAL_13]]) : (!cc.ptr, !cc.ptr, !cc.ptr, i64, i64) -> !cc.struct<{!cc.ptr, i64}> +// CHECK: %[[VAL_17:.*]] = cc.extract_value %[[VAL_16]][0] : (!cc.struct<{!cc.ptr, i64}>) -> !cc.ptr +// CHECK: %[[VAL_18:.*]] = cc.cast %[[VAL_17]] : (!cc.ptr) -> i64 +// CHECK: %[[VAL_19:.*]] = arith.constant 0 : i64 +// CHECK: %[[VAL_20:.*]] = arith.cmpi ne, %[[VAL_18]], %[[VAL_19]] : i64 +// CHECK: cf.cond_br %[[VAL_20]], ^bb1, ^bb2 +// CHECK: ^bb1: +// CHECK: %[[VAL_21:.*]] = cc.cast %[[VAL_17]] : (!cc.ptr) -> !cc.ptr> +// CHECK: %[[VAL_22:.*]] = cc.compute_ptr %[[VAL_21]][1] : (!cc.ptr>) -> !cc.ptr +// CHECK: cf.br ^bb3(%[[VAL_22]] : !cc.ptr) +// CHECK: ^bb2: +// CHECK: %[[VAL_23:.*]] = cc.compute_ptr %[[VAL_9]][0, 1] : (!cc.ptr x ?>>) -> !cc.ptr +// CHECK: cf.br ^bb3(%[[VAL_23]] : !cc.ptr) +// CHECK: ^bb3(%[[VAL_24:.*]]: !cc.ptr): +// CHECK: %[[VAL_25:.*]] = cc.compute_ptr %[[VAL_9]][0, 1] : (!cc.ptr x ?>>) -> !cc.ptr +// CHECK: %[[VAL_26:.*]] = cc.load %[[VAL_25]] : !cc.ptr +// CHECK: return %[[VAL_26]] : f64 // CHECK: } -// CHECK: func.func private @altLaunchKernel(!cc.ptr, !cc.ptr, !cc.ptr, i64, i64) +// CHECK: func.func private @altLaunchKernel(!cc.ptr, !cc.ptr, !cc.ptr, i64, i64) -> !cc.struct<{!cc.ptr, i64}> // CHECK: func.func private @cudaqRegisterKernelName(!cc.ptr) // CHECK: llvm.mlir.global external constant @ghz.kernelName("ghz\00") {addr_space = 0 : i32} @@ -192,39 +205,52 @@ module attributes {quake.mangled_name_map = { // HYBRID: %[[VAL_2:.*]] = cc.undef !cc.struct<{i32, f64}> // HYBRID: %[[VAL_3:.*]] = arith.constant 0 : i64 // HYBRID: %[[VAL_4:.*]] = cc.insert_value %[[VAL_1]], %[[VAL_2]][0] : (!cc.struct<{i32, f64}>, i32) -> !cc.struct<{i32, f64}> -// HYBRID: %[[VAL_6:.*]] = cc.sizeof !cc.struct<{i32, f64}> : i64 -// HYBRID: %[[VAL_7:.*]] = arith.addi %[[VAL_6]], %[[VAL_3]] : i64 -// HYBRID: %[[VAL_8:.*]] = cc.alloca i8{{\[}}%[[VAL_7]] : i64] -// HYBRID: %[[VAL_9:.*]] = 
cc.cast %[[VAL_8]] : (!cc.ptr>) -> !cc.ptr> -// HYBRID: cc.store %[[VAL_4]], %[[VAL_9]] : !cc.ptr> -// HYBRID: %[[VAL_10:.*]] = cc.cast %[[VAL_8]] : (!cc.ptr>) -> !cc.ptr x ?>> -// HYBRID: %[[VAL_11:.*]] = constant @ghz.thunk : (!cc.ptr, i1) -> !cc.struct<{!cc.ptr, i64}> -// HYBRID: %[[VAL_12:.*]] = cc.func_ptr %[[VAL_11]] : ((!cc.ptr, i1) -> !cc.struct<{!cc.ptr, i64}>) -> !cc.ptr -// HYBRID: %[[VAL_13:.*]] = cc.cast %[[VAL_10]] : (!cc.ptr x ?>>) -> !cc.ptr -// HYBRID: %[[VAL_15:.*]] = cc.offsetof !cc.struct<{i32, f64}> [1] : i64 -// HYBRID: %[[VAL_16:.*]] = cc.alloca !cc.struct<{!cc.ptr>, !cc.ptr>, !cc.ptr>}> -// HYBRID: %[[VAL_17:.*]] = cc.alloca !cc.array x 1> -// HYBRID: %[[VAL_18:.*]] = cc.sizeof !cc.array x 1> : i64 -// HYBRID: %[[VAL_19:.*]] = cc.cast %[[VAL_17]] : (!cc.ptr x 1>>) -> !cc.ptr> -// HYBRID: %[[VAL_20:.*]] = cc.cast %[[VAL_16]] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr>> -// HYBRID: cc.store %[[VAL_19]], %[[VAL_20]] : !cc.ptr>> -// HYBRID: %[[VAL_21:.*]] = cc.cast %[[VAL_17]] : (!cc.ptr x 1>>) -> i64 -// HYBRID: %[[VAL_22:.*]] = arith.addi %[[VAL_21]], %[[VAL_18]] : i64 -// HYBRID: %[[VAL_23:.*]] = cc.cast %[[VAL_22]] : (i64) -> !cc.ptr> -// HYBRID: %[[VAL_24:.*]] = cc.compute_ptr %[[VAL_16]][1] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr>> -// HYBRID: cc.store %[[VAL_23]], %[[VAL_24]] : !cc.ptr>> -// HYBRID: %[[VAL_25:.*]] = cc.compute_ptr %[[VAL_16]][2] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr>> -// HYBRID: cc.store %[[VAL_23]], %[[VAL_25]] : !cc.ptr>> -// HYBRID: %[[VAL_26:.*]] = cc.compute_ptr %[[VAL_17]][0] : (!cc.ptr x 1>>) -> !cc.ptr> -// HYBRID: %[[VAL_27:.*]] = cc.alloca i32 -// HYBRID: cc.store %[[VAL_1]], %[[VAL_27]] : !cc.ptr -// HYBRID: %[[VAL_28:.*]] = cc.cast %[[VAL_27]] : (!cc.ptr) -> !cc.ptr -// HYBRID: cc.store %[[VAL_28]], %[[VAL_26]] : !cc.ptr> -// HYBRID: %[[VAL_29:.*]] = cc.cast %[[VAL_16]] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr -// HYBRID: %[[VAL_30:.*]] = llvm.mlir.addressof @ghz.kernelName : !llvm.ptr> -// HYBRID: %[[VAL_31:.*]] = cc.cast %[[VAL_30]] : (!llvm.ptr>) -> !cc.ptr -// HYBRID: call @hybridLaunchKernel(%[[VAL_31]], %[[VAL_12]], %[[VAL_13]], %[[VAL_7]], %[[VAL_15]], %[[VAL_29]]) : (!cc.ptr, !cc.ptr, !cc.ptr, i64, i64, !cc.ptr) -> () -// HYBRID: %[[VAL_32:.*]] = cc.compute_ptr %[[VAL_10]][0, 1] : (!cc.ptr x ?>>) -> !cc.ptr -// HYBRID: %[[VAL_33:.*]] = cc.load %[[VAL_32]] : !cc.ptr -// HYBRID: return %[[VAL_33]] : f64 +// HYBRID: %[[VAL_5:.*]] = cc.sizeof !cc.struct<{i32, f64}> : i64 +// HYBRID: %[[VAL_6:.*]] = arith.addi %[[VAL_5]], %[[VAL_3]] : i64 +// HYBRID: %[[VAL_7:.*]] = cc.alloca i8{{\[}}%[[VAL_6]] : i64] +// HYBRID: %[[VAL_8:.*]] = cc.cast %[[VAL_7]] : (!cc.ptr>) -> !cc.ptr> +// HYBRID: cc.store %[[VAL_4]], %[[VAL_8]] : !cc.ptr> +// HYBRID: %[[VAL_9:.*]] = cc.cast %[[VAL_7]] : (!cc.ptr>) -> !cc.ptr x ?>> +// HYBRID: %[[VAL_10:.*]] = constant @ghz.thunk : (!cc.ptr, i1) -> !cc.struct<{!cc.ptr, i64}> +// HYBRID: %[[VAL_11:.*]] = cc.func_ptr %[[VAL_10]] : ((!cc.ptr, i1) -> !cc.struct<{!cc.ptr, i64}>) -> !cc.ptr +// HYBRID: %[[VAL_12:.*]] = cc.cast %[[VAL_9]] : (!cc.ptr x ?>>) -> !cc.ptr +// HYBRID: %[[VAL_13:.*]] = cc.offsetof !cc.struct<{i32, f64}> [1] : i64 +// HYBRID: %[[VAL_14:.*]] = cc.alloca !cc.struct<{!cc.ptr>, !cc.ptr>, !cc.ptr>}> +// HYBRID: %[[VAL_15:.*]] = cc.alloca !cc.array x 1> +// HYBRID: %[[VAL_16:.*]] = cc.sizeof !cc.array x 1> : i64 +// HYBRID: %[[VAL_17:.*]] = cc.cast %[[VAL_15]] : (!cc.ptr x 1>>) -> !cc.ptr> +// HYBRID: %[[VAL_18:.*]] = cc.cast %[[VAL_14]] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) 
-> !cc.ptr>> +// HYBRID: cc.store %[[VAL_17]], %[[VAL_18]] : !cc.ptr>> +// HYBRID: %[[VAL_19:.*]] = cc.cast %[[VAL_15]] : (!cc.ptr x 1>>) -> i64 +// HYBRID: %[[VAL_20:.*]] = arith.addi %[[VAL_19]], %[[VAL_16]] : i64 +// HYBRID: %[[VAL_21:.*]] = cc.cast %[[VAL_20]] : (i64) -> !cc.ptr> +// HYBRID: %[[VAL_22:.*]] = cc.compute_ptr %[[VAL_14]][1] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr>> +// HYBRID: cc.store %[[VAL_21]], %[[VAL_22]] : !cc.ptr>> +// HYBRID: %[[VAL_23:.*]] = cc.compute_ptr %[[VAL_14]][2] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr>> +// HYBRID: cc.store %[[VAL_21]], %[[VAL_23]] : !cc.ptr>> +// HYBRID: %[[VAL_24:.*]] = cc.compute_ptr %[[VAL_15]][0] : (!cc.ptr x 1>>) -> !cc.ptr> +// HYBRID: %[[VAL_25:.*]] = cc.alloca i32 +// HYBRID: cc.store %[[VAL_1]], %[[VAL_25]] : !cc.ptr +// HYBRID: %[[VAL_26:.*]] = cc.cast %[[VAL_25]] : (!cc.ptr) -> !cc.ptr +// HYBRID: cc.store %[[VAL_26]], %[[VAL_24]] : !cc.ptr> +// HYBRID: %[[VAL_27:.*]] = cc.cast %[[VAL_14]] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr +// HYBRID: %[[VAL_28:.*]] = llvm.mlir.addressof @ghz.kernelName : !llvm.ptr> +// HYBRID: %[[VAL_29:.*]] = cc.cast %[[VAL_28]] : (!llvm.ptr>) -> !cc.ptr +// HYBRID: %[[VAL_30:.*]] = call @hybridLaunchKernel(%[[VAL_29]], %[[VAL_11]], %[[VAL_12]], %[[VAL_6]], %[[VAL_13]], %[[VAL_27]]) : (!cc.ptr, !cc.ptr, !cc.ptr, i64, i64, !cc.ptr) -> !cc.struct<{!cc.ptr, i64}> +// HYBRID: %[[VAL_31:.*]] = cc.extract_value %[[VAL_30]][0] : (!cc.struct<{!cc.ptr, i64}>) -> !cc.ptr +// HYBRID: %[[VAL_32:.*]] = cc.cast %[[VAL_31]] : (!cc.ptr) -> i64 +// HYBRID: %[[VAL_33:.*]] = arith.constant 0 : i64 +// HYBRID: %[[VAL_34:.*]] = arith.cmpi ne, %[[VAL_32]], %[[VAL_33]] : i64 +// HYBRID: cf.cond_br %[[VAL_34]], ^bb1, ^bb2 +// HYBRID: ^bb1: +// HYBRID: %[[VAL_35:.*]] = cc.cast %[[VAL_31]] : (!cc.ptr) -> !cc.ptr> +// HYBRID: %[[VAL_36:.*]] = cc.compute_ptr %[[VAL_35]][1] : (!cc.ptr>) -> !cc.ptr +// HYBRID: cf.br ^bb3(%[[VAL_36]] : !cc.ptr) +// HYBRID: ^bb2: +// HYBRID: %[[VAL_37:.*]] = cc.compute_ptr %[[VAL_9]][0, 1] : (!cc.ptr x ?>>) -> !cc.ptr +// HYBRID: cf.br ^bb3(%[[VAL_37]] : !cc.ptr) +// HYBRID: ^bb3(%[[VAL_38:.*]]: !cc.ptr): +// HYBRID: %[[VAL_39:.*]] = cc.compute_ptr %[[VAL_9]][0, 1] : (!cc.ptr x ?>>) -> !cc.ptr +// HYBRID: %[[VAL_40:.*]] = cc.load %[[VAL_39]] : !cc.ptr +// HYBRID: return %[[VAL_40]] : f64 // HYBRID: } diff --git a/test/Quake/kernel_exec-2.qke b/test/Quake/kernel_exec-2.qke index 4e53513774..a9b04b8449 100644 --- a/test/Quake/kernel_exec-2.qke +++ b/test/Quake/kernel_exec-2.qke @@ -71,11 +71,11 @@ __nvqpp__mlirgen__function_cargo = "pants"}} { // CHECK: %[[VAL_33:.*]] = arith.constant 2147483647 : i64 // CHECK: %[[VAL_28:.*]] = llvm.mlir.addressof @function_hawaiian.kernelName : !llvm.ptr> // CHECK: %[[VAL_30:.*]] = cc.cast %[[VAL_28]] : (!llvm.ptr>) -> !cc.ptr -// CHECK: call @altLaunchKernel(%[[VAL_30]], %[[VAL_31]], %[[VAL_32]], %[[VAL_17]], %[[VAL_33]]) : (!cc.ptr, !cc.ptr, !cc.ptr, i64, i64) -> () +// CHECK: call @altLaunchKernel(%[[VAL_30]], %[[VAL_31]], %[[VAL_32]], %[[VAL_17]], %[[VAL_33]]) : (!cc.ptr, !cc.ptr, !cc.ptr, i64, i64) -> !cc.struct<{!cc.ptr, i64}> // CHECK: return // CHECK: } -// CHECK-DAG: func.func private @altLaunchKernel(!cc.ptr, !cc.ptr, !cc.ptr, i64, i64) +// CHECK-DAG: func.func private @altLaunchKernel(!cc.ptr, !cc.ptr, !cc.ptr, i64, i64) -> !cc.struct<{!cc.ptr, i64}> // CHECK-DAG: func.func private @cudaqRegisterKernelName(!cc.ptr) // CHECK-DAG: func.func private @cudaqRegisterArgsCreator(!cc.ptr, !cc.ptr) // CHECK-DAG: func.func private @malloc(i64) 
-> !cc.ptr diff --git a/test/Quake/return_vector.qke b/test/Quake/return_vector.qke index a13d0b6abe..90ccc90610 100644 --- a/test/Quake/return_vector.qke +++ b/test/Quake/return_vector.qke @@ -6,8 +6,8 @@ // the terms of the Apache License 2.0 which accompanies this distribution. // // ========================================================================== // -// RUN: cudaq-opt --add-dealloc --kernel-execution=codegen=1 --canonicalize %s | \ -// RUN: FileCheck %s +// RUN: cudaq-opt --add-dealloc --kernel-execution=codegen=1 --canonicalize %s \ +// RUN: | FileCheck %s // NB: the mangled name map is required for the kernel-execution pass. module attributes{ quake.mangled_name_map = { @@ -42,37 +42,48 @@ func.func @test_0(%0: !cc.ptr, !cc.ptr, !cc.ptr, %[[VAL_2:.*]]: i32) { // CHECK: %[[VAL_3:.*]] = arith.constant 4 : i64 // CHECK: %[[VAL_4:.*]] = constant @test_0.thunk : (!cc.ptr, i1) -> !cc.struct<{!cc.ptr, i64}> +// CHECK: %[[VAL_5:.*]] = arith.constant 0 : i64 // CHECK: %[[VAL_6:.*]] = cc.undef !cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}> // CHECK: %[[VAL_7:.*]] = cc.insert_value %[[VAL_2]], %[[VAL_6]][0] : (!cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}>, i32) -> !cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}> -// CHECK: %[[VAL_9:.*]] = cc.sizeof !cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}> : i64 -// CHECK: %[[VAL_10:.*]] = cc.alloca i8{{\[}}%[[VAL_9]] : i64] -// CHECK: %[[VAL_11:.*]] = cc.cast %[[VAL_10]] : (!cc.ptr>) -> !cc.ptr, i64}>}>> -// CHECK: cc.store %[[VAL_7]], %[[VAL_11]] : !cc.ptr, i64}>}>> -// CHECK: %[[VAL_14:.*]] = cc.func_ptr %[[VAL_4]] : ((!cc.ptr, i1) -> !cc.struct<{!cc.ptr, i64}>) -> !cc.ptr -// CHECK: %[[VAL_15:.*]] = cc.cast %[[VAL_10]] : (!cc.ptr>) -> !cc.ptr -// CHECK: %[[VAL_17:.*]] = cc.offsetof !cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}> [1] : i64 -// CHECK: %[[VAL_12:.*]] = llvm.mlir.addressof @test_0.kernelName : !llvm.ptr> -// CHECK: %[[VAL_13:.*]] = cc.cast %[[VAL_12]] : (!llvm.ptr>) -> !cc.ptr -// CHECK: call @altLaunchKernel(%[[VAL_13]], %[[VAL_14]], %[[VAL_15]], %[[VAL_9]], %[[VAL_17]]) : (!cc.ptr, !cc.ptr, !cc.ptr, i64, i64) -> () -// CHECK: %[[VAL_18:.*]] = cc.cast %[[VAL_10]] : (!cc.ptr>) -> !cc.ptr, i64}>}>> -// CHECK: %[[VAL_19:.*]] = cc.compute_ptr %[[VAL_18]][1, 0] : (!cc.ptr, i64}>}>>) -> !cc.ptr> -// CHECK: %[[VAL_20:.*]] = cc.load %[[VAL_19]] : !cc.ptr> -// CHECK: %[[VAL_21:.*]] = cc.cast %[[VAL_10]] : (!cc.ptr>) -> !cc.ptr, i64}>}>> -// CHECK: %[[VAL_22:.*]] = cc.compute_ptr %[[VAL_21]][1, 1] : (!cc.ptr, i64}>}>>) -> !cc.ptr -// CHECK: %[[VAL_23:.*]] = cc.load %[[VAL_22]] : !cc.ptr -// CHECK: %[[VAL_24:.*]] = cc.cast %[[VAL_0]] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr, !cc.ptr, !cc.ptr}>> -// CHECK: %[[VAL_25:.*]] = cc.cast %[[VAL_0]] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr> -// CHECK: %[[VAL_26:.*]] = cc.load %[[VAL_25]] : !cc.ptr> -// CHECK: call @free(%[[VAL_26]]) : (!cc.ptr) -> () -// CHECK: %[[VAL_27:.*]] = cc.cast %[[VAL_20]] : (!cc.ptr) -> !cc.ptr -// CHECK: cc.store %[[VAL_27]], %[[VAL_25]] : !cc.ptr> -// CHECK: %[[VAL_28:.*]] = cc.compute_ptr %[[VAL_24]][1] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr> -// CHECK: %[[VAL_29:.*]] = arith.muli %[[VAL_23]], %[[VAL_3]] : i64 -// CHECK: %[[VAL_30:.*]] = cc.cast %[[VAL_20]] : (!cc.ptr) -> !cc.ptr> -// CHECK: %[[VAL_31:.*]] = cc.compute_ptr %[[VAL_30]]{{\[}}%[[VAL_29]]] : (!cc.ptr>, i64) -> !cc.ptr -// CHECK: cc.store %[[VAL_31]], %[[VAL_28]] : !cc.ptr> -// CHECK: %[[VAL_32:.*]] = cc.compute_ptr %[[VAL_24]][2] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr> -// CHECK: cc.store 
%[[VAL_31]], %[[VAL_32]] : !cc.ptr> +// CHECK: %[[VAL_8:.*]] = cc.sizeof !cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}> : i64 +// CHECK: %[[VAL_9:.*]] = cc.alloca i8{{\[}}%[[VAL_8]] : i64] +// CHECK: %[[VAL_10:.*]] = cc.cast %[[VAL_9]] : (!cc.ptr>) -> !cc.ptr, i64}>}>> +// CHECK: cc.store %[[VAL_7]], %[[VAL_10]] : !cc.ptr, i64}>}>> +// CHECK: %[[VAL_11:.*]] = cc.func_ptr %[[VAL_4]] : ((!cc.ptr, i1) -> !cc.struct<{!cc.ptr, i64}>) -> !cc.ptr +// CHECK: %[[VAL_12:.*]] = cc.cast %[[VAL_9]] : (!cc.ptr>) -> !cc.ptr +// CHECK: %[[VAL_13:.*]] = cc.offsetof !cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}> [1] : i64 +// CHECK: %[[VAL_14:.*]] = llvm.mlir.addressof @test_0.kernelName : !llvm.ptr> +// CHECK: %[[VAL_15:.*]] = cc.cast %[[VAL_14]] : (!llvm.ptr>) -> !cc.ptr +// CHECK: %[[VAL_16:.*]] = call @altLaunchKernel(%[[VAL_15]], %[[VAL_11]], %[[VAL_12]], %[[VAL_8]], %[[VAL_13]]) : (!cc.ptr, !cc.ptr, !cc.ptr, i64, i64) -> !cc.struct<{!cc.ptr, i64}> +// CHECK: %[[VAL_17:.*]] = cc.extract_value %[[VAL_16]][0] : (!cc.struct<{!cc.ptr, i64}>) -> !cc.ptr +// CHECK: %[[VAL_18:.*]] = cc.cast %[[VAL_17]] : (!cc.ptr) -> i64 +// CHECK: %[[VAL_19:.*]] = arith.cmpi ne, %[[VAL_18]], %[[VAL_5]] : i64 +// CHECK: cf.cond_br %[[VAL_19]], ^bb1, ^bb2 +// CHECK: ^bb1: +// CHECK: %[[VAL_20:.*]] = cc.cast %[[VAL_17]] : (!cc.ptr) -> !cc.ptr, i64}>}>> +// CHECK: %[[VAL_21:.*]] = cc.compute_ptr %[[VAL_20]][1] : (!cc.ptr, i64}>}>>) -> !cc.ptr, i64}>> +// CHECK: cf.br ^bb3(%[[VAL_21]] : !cc.ptr, i64}>>) +// CHECK: ^bb2: +// CHECK: %[[VAL_22:.*]] = cc.cast %[[VAL_9]] : (!cc.ptr>) -> !cc.ptr, i64}>}>> +// CHECK: %[[VAL_23:.*]] = cc.compute_ptr %[[VAL_22]][1] : (!cc.ptr, i64}>}>>) -> !cc.ptr, i64}>> +// CHECK: cf.br ^bb3(%[[VAL_23]] : !cc.ptr, i64}>>) +// CHECK: ^bb3(%[[VAL_24:.*]]: !cc.ptr, i64}>>): +// CHECK: %[[VAL_25:.*]] = cc.cast %[[VAL_24]] : (!cc.ptr, i64}>>) -> !cc.ptr> +// CHECK: %[[VAL_26:.*]] = cc.load %[[VAL_25]] : !cc.ptr> +// CHECK: %[[VAL_27:.*]] = cc.compute_ptr %[[VAL_24]][1] : (!cc.ptr, i64}>>) -> !cc.ptr +// CHECK: %[[VAL_28:.*]] = cc.load %[[VAL_27]] : !cc.ptr +// CHECK: %[[VAL_29:.*]] = cc.cast %[[VAL_0]] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr, !cc.ptr, !cc.ptr}>> +// CHECK: %[[VAL_30:.*]] = cc.cast %[[VAL_0]] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr> +// CHECK: %[[VAL_31:.*]] = cc.cast %[[VAL_26]] : (!cc.ptr) -> !cc.ptr +// CHECK: cc.store %[[VAL_31]], %[[VAL_30]] : !cc.ptr> +// CHECK: %[[VAL_32:.*]] = cc.compute_ptr %[[VAL_29]][1] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr> +// CHECK: %[[VAL_33:.*]] = arith.muli %[[VAL_28]], %[[VAL_3]] : i64 +// CHECK: %[[VAL_34:.*]] = cc.cast %[[VAL_26]] : (!cc.ptr) -> !cc.ptr> +// CHECK: %[[VAL_35:.*]] = cc.compute_ptr %[[VAL_34]]{{\[}}%[[VAL_33]]] : (!cc.ptr>, i64) -> !cc.ptr +// CHECK: cc.store %[[VAL_35]], %[[VAL_32]] : !cc.ptr> +// CHECK: %[[VAL_36:.*]] = cc.compute_ptr %[[VAL_29]][2] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr> +// CHECK: cc.store %[[VAL_35]], %[[VAL_36]] : !cc.ptr> +// CHECK: call @free(%[[VAL_17]]) : (!cc.ptr) -> () // CHECK: return // CHECK: } @@ -102,37 +113,48 @@ func.func @test_1(%0: !cc.ptr, !cc.ptr, !cc.ptr, %[[VAL_2:.*]]: i32) { // CHECK: %[[VAL_3:.*]] = arith.constant 8 : i64 // CHECK: %[[VAL_4:.*]] = constant @test_1.thunk : (!cc.ptr, i1) -> !cc.struct<{!cc.ptr, i64}> +// CHECK: %[[VAL_5:.*]] = arith.constant 0 : i64 // CHECK: %[[VAL_6:.*]] = cc.undef !cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}> // CHECK: %[[VAL_7:.*]] = cc.insert_value %[[VAL_2]], %[[VAL_6]][0] : (!cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}>, i32) -> 
!cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}> -// CHECK: %[[VAL_9:.*]] = cc.sizeof !cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}> : i64 -// CHECK: %[[VAL_10:.*]] = cc.alloca i8{{\[}}%[[VAL_9]] : i64] -// CHECK: %[[VAL_11:.*]] = cc.cast %[[VAL_10]] : (!cc.ptr>) -> !cc.ptr, i64}>}>> -// CHECK: cc.store %[[VAL_7]], %[[VAL_11]] : !cc.ptr, i64}>}>> -// CHECK: %[[VAL_14:.*]] = cc.func_ptr %[[VAL_4]] : ((!cc.ptr, i1) -> !cc.struct<{!cc.ptr, i64}>) -> !cc.ptr -// CHECK: %[[VAL_15:.*]] = cc.cast %[[VAL_10]] : (!cc.ptr>) -> !cc.ptr -// CHECK: %[[VAL_17:.*]] = cc.offsetof !cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}> [1] : i64 -// CHECK: %[[VAL_12:.*]] = llvm.mlir.addressof @test_1.kernelName : !llvm.ptr> -// CHECK: %[[VAL_13:.*]] = cc.cast %[[VAL_12]] : (!llvm.ptr>) -> !cc.ptr -// CHECK: call @altLaunchKernel(%[[VAL_13]], %[[VAL_14]], %[[VAL_15]], %[[VAL_9]], %[[VAL_17]]) : (!cc.ptr, !cc.ptr, !cc.ptr, i64, i64) -> () -// CHECK: %[[VAL_18:.*]] = cc.cast %[[VAL_10]] : (!cc.ptr>) -> !cc.ptr, i64}>}>> -// CHECK: %[[VAL_19:.*]] = cc.compute_ptr %[[VAL_18]][1, 0] : (!cc.ptr, i64}>}>>) -> !cc.ptr> -// CHECK: %[[VAL_20:.*]] = cc.load %[[VAL_19]] : !cc.ptr> -// CHECK: %[[VAL_21:.*]] = cc.cast %[[VAL_10]] : (!cc.ptr>) -> !cc.ptr, i64}>}>> -// CHECK: %[[VAL_22:.*]] = cc.compute_ptr %[[VAL_21]][1, 1] : (!cc.ptr, i64}>}>>) -> !cc.ptr -// CHECK: %[[VAL_23:.*]] = cc.load %[[VAL_22]] : !cc.ptr -// CHECK: %[[VAL_24:.*]] = cc.cast %[[VAL_0]] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr, !cc.ptr, !cc.ptr}>> -// CHECK: %[[VAL_25:.*]] = cc.cast %[[VAL_0]] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr> -// CHECK: %[[VAL_26:.*]] = cc.load %[[VAL_25]] : !cc.ptr> -// CHECK: call @free(%[[VAL_26]]) : (!cc.ptr) -> () -// CHECK: %[[VAL_27:.*]] = cc.cast %[[VAL_20]] : (!cc.ptr) -> !cc.ptr -// CHECK: cc.store %[[VAL_27]], %[[VAL_25]] : !cc.ptr> -// CHECK: %[[VAL_28:.*]] = cc.compute_ptr %[[VAL_24]][1] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr> -// CHECK: %[[VAL_29:.*]] = arith.muli %[[VAL_23]], %[[VAL_3]] : i64 -// CHECK: %[[VAL_30:.*]] = cc.cast %[[VAL_20]] : (!cc.ptr) -> !cc.ptr> -// CHECK: %[[VAL_31:.*]] = cc.compute_ptr %[[VAL_30]]{{\[}}%[[VAL_29]]] : (!cc.ptr>, i64) -> !cc.ptr -// CHECK: cc.store %[[VAL_31]], %[[VAL_28]] : !cc.ptr> -// CHECK: %[[VAL_32:.*]] = cc.compute_ptr %[[VAL_24]][2] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr> -// CHECK: cc.store %[[VAL_31]], %[[VAL_32]] : !cc.ptr> +// CHECK: %[[VAL_8:.*]] = cc.sizeof !cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}> : i64 +// CHECK: %[[VAL_9:.*]] = cc.alloca i8{{\[}}%[[VAL_8]] : i64] +// CHECK: %[[VAL_10:.*]] = cc.cast %[[VAL_9]] : (!cc.ptr>) -> !cc.ptr, i64}>}>> +// CHECK: cc.store %[[VAL_7]], %[[VAL_10]] : !cc.ptr, i64}>}>> +// CHECK: %[[VAL_11:.*]] = cc.func_ptr %[[VAL_4]] : ((!cc.ptr, i1) -> !cc.struct<{!cc.ptr, i64}>) -> !cc.ptr +// CHECK: %[[VAL_12:.*]] = cc.cast %[[VAL_9]] : (!cc.ptr>) -> !cc.ptr +// CHECK: %[[VAL_13:.*]] = cc.offsetof !cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}> [1] : i64 +// CHECK: %[[VAL_14:.*]] = llvm.mlir.addressof @test_1.kernelName : !llvm.ptr> +// CHECK: %[[VAL_15:.*]] = cc.cast %[[VAL_14]] : (!llvm.ptr>) -> !cc.ptr +// CHECK: %[[VAL_16:.*]] = call @altLaunchKernel(%[[VAL_15]], %[[VAL_11]], %[[VAL_12]], %[[VAL_8]], %[[VAL_13]]) : (!cc.ptr, !cc.ptr, !cc.ptr, i64, i64) -> !cc.struct<{!cc.ptr, i64}> +// CHECK: %[[VAL_17:.*]] = cc.extract_value %[[VAL_16]][0] : (!cc.struct<{!cc.ptr, i64}>) -> !cc.ptr +// CHECK: %[[VAL_18:.*]] = cc.cast %[[VAL_17]] : (!cc.ptr) -> i64 +// CHECK: %[[VAL_19:.*]] = arith.cmpi ne, %[[VAL_18]], %[[VAL_5]] : i64 +// 
CHECK: cf.cond_br %[[VAL_19]], ^bb1, ^bb2 +// CHECK: ^bb1: +// CHECK: %[[VAL_20:.*]] = cc.cast %[[VAL_17]] : (!cc.ptr) -> !cc.ptr, i64}>}>> +// CHECK: %[[VAL_21:.*]] = cc.compute_ptr %[[VAL_20]][1] : (!cc.ptr, i64}>}>>) -> !cc.ptr, i64}>> +// CHECK: cf.br ^bb3(%[[VAL_21]] : !cc.ptr, i64}>>) +// CHECK: ^bb2: +// CHECK: %[[VAL_22:.*]] = cc.cast %[[VAL_9]] : (!cc.ptr>) -> !cc.ptr, i64}>}>> +// CHECK: %[[VAL_23:.*]] = cc.compute_ptr %[[VAL_22]][1] : (!cc.ptr, i64}>}>>) -> !cc.ptr, i64}>> +// CHECK: cf.br ^bb3(%[[VAL_23]] : !cc.ptr, i64}>>) +// CHECK: ^bb3(%[[VAL_24:.*]]: !cc.ptr, i64}>>): +// CHECK: %[[VAL_25:.*]] = cc.cast %[[VAL_24]] : (!cc.ptr, i64}>>) -> !cc.ptr> +// CHECK: %[[VAL_26:.*]] = cc.load %[[VAL_25]] : !cc.ptr> +// CHECK: %[[VAL_27:.*]] = cc.compute_ptr %[[VAL_24]][1] : (!cc.ptr, i64}>>) -> !cc.ptr +// CHECK: %[[VAL_28:.*]] = cc.load %[[VAL_27]] : !cc.ptr +// CHECK: %[[VAL_29:.*]] = cc.cast %[[VAL_0]] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr, !cc.ptr, !cc.ptr}>> +// CHECK: %[[VAL_30:.*]] = cc.cast %[[VAL_0]] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr> +// CHECK: %[[VAL_31:.*]] = cc.cast %[[VAL_26]] : (!cc.ptr) -> !cc.ptr +// CHECK: cc.store %[[VAL_31]], %[[VAL_30]] : !cc.ptr> +// CHECK: %[[VAL_32:.*]] = cc.compute_ptr %[[VAL_29]][1] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr> +// CHECK: %[[VAL_33:.*]] = arith.muli %[[VAL_28]], %[[VAL_3]] : i64 +// CHECK: %[[VAL_34:.*]] = cc.cast %[[VAL_26]] : (!cc.ptr) -> !cc.ptr> +// CHECK: %[[VAL_35:.*]] = cc.compute_ptr %[[VAL_34]]{{\[}}%[[VAL_33]]] : (!cc.ptr>, i64) -> !cc.ptr +// CHECK: cc.store %[[VAL_35]], %[[VAL_32]] : !cc.ptr> +// CHECK: %[[VAL_36:.*]] = cc.compute_ptr %[[VAL_29]][2] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr> +// CHECK: cc.store %[[VAL_35]], %[[VAL_36]] : !cc.ptr> +// CHECK: call @free(%[[VAL_17]]) : (!cc.ptr) -> () // CHECK: return // CHECK: } @@ -140,40 +162,42 @@ func.func @test_1(%0: !cc.ptr, !cc.ptr, !cc.ptr, %[[VAL_1:.*]]: i1) -> !cc.struct<{!cc.ptr, i64}> { -// CHECK: %[[VAL_3:.*]] = cc.cast %[[VAL_0]] : (!cc.ptr) -> !cc.ptr, i64}>}>> -// CHECK: %[[VAL_4:.*]] = cc.load %[[VAL_3]] : !cc.ptr, i64}>}>> -// CHECK: %[[VAL_7:.*]] = cc.sizeof !cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}> : i64 -// CHECK: %[[VAL_10:.*]] = cc.extract_value %[[VAL_4]][0] : (!cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}>) -> i32 -// CHECK: %[[VAL_15:.*]] = call @__nvqpp__mlirgen__test_0(%[[VAL_10]]) : (i32) -> !cc.stdvec +// CHECK: %[[VAL_2:.*]] = cc.cast %[[VAL_0]] : (!cc.ptr) -> !cc.ptr, i64}>}>> +// CHECK: %[[VAL_3:.*]] = cc.load %[[VAL_2]] : !cc.ptr, i64}>}>> +// CHECK: %[[VAL_4:.*]] = cc.sizeof !cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}> : i64 +// CHECK: %[[VAL_5:.*]] = cc.extract_value %[[VAL_3]][0] : (!cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}>) -> i32 +// CHECK: %[[VAL_6:.*]] = call @__nvqpp__mlirgen__test_0(%[[VAL_5]]) : (i32) -> !cc.stdvec +// CHECK: %[[VAL_7:.*]] = cc.compute_ptr %[[VAL_2]][1] : (!cc.ptr, i64}>}>>) -> !cc.ptr, i64}>> +// CHECK: %[[VAL_8:.*]] = cc.cast %[[VAL_7]] : (!cc.ptr, i64}>>) -> !cc.ptr> +// CHECK: cc.store %[[VAL_6]], %[[VAL_8]] : !cc.ptr> // CHECK: cf.cond_br %[[VAL_1]], ^bb1, ^bb2 // CHECK: ^bb1: -// CHECK: %[[VAL_11:.*]] = cc.compute_ptr %[[VAL_3]][1] : (!cc.ptr, i64}>}>>) -> !cc.ptr, i64}> -// CHECK: %[[VAL_16:.*]] = cc.cast %[[VAL_11]] : (!cc.ptr, i64}>>) -> !cc.ptr> -// CHECK: cc.store %[[VAL_15]], %[[VAL_16]] : !cc.ptr> -// CHECK: %[[VAL_12:.*]] = cc.cast %[[VAL_11]] : (!cc.ptr, i64}>>) -> !cc.ptr, i64}>> -// CHECK: %[[VAL_13:.*]] = call @__nvqpp_createDynamicResult(%[[VAL_0]], %[[VAL_7]], 
%[[VAL_12]]) : (!cc.ptr, i64, !cc.ptr, i64}>>) -> !cc.struct<{!cc.ptr, i64}> -// CHECK: return %[[VAL_13]] : !cc.struct<{!cc.ptr, i64}> +// CHECK: %[[VAL_9:.*]] = cc.cast %[[VAL_7]] : (!cc.ptr, i64}>>) -> !cc.ptr, i64}>> +// CHECK: %[[VAL_10:.*]] = cc.offsetof !cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}> [1] : i64 +// CHECK: %[[VAL_11:.*]] = call @__nvqpp_createDynamicResult(%[[VAL_0]], %[[VAL_4]], %[[VAL_9]], %[[VAL_10]]) : (!cc.ptr, i64, !cc.ptr, i64}>>, i64) -> !cc.struct<{!cc.ptr, i64}> +// CHECK: return %[[VAL_11]] : !cc.struct<{!cc.ptr, i64}> // CHECK: ^bb2: -// CHECK: %[[VAL_14:.*]] = call @__nvqpp_zeroDynamicResult() : () -> !cc.struct<{!cc.ptr, i64}> -// CHECK: return %[[VAL_14]] : !cc.struct<{!cc.ptr, i64}> +// CHECK: %[[VAL_12:.*]] = call @__nvqpp_zeroDynamicResult() : () -> !cc.struct<{!cc.ptr, i64}> +// CHECK: return %[[VAL_12]] : !cc.struct<{!cc.ptr, i64}> // CHECK: } // CHECK-LABEL: func.func @test_1.thunk( // CHECK-SAME: %[[VAL_0:.*]]: !cc.ptr, %[[VAL_1:.*]]: i1) -> !cc.struct<{!cc.ptr, i64}> { -// CHECK: %[[VAL_3:.*]] = cc.cast %[[VAL_0]] : (!cc.ptr) -> !cc.ptr, i64}>}>> -// CHECK: %[[VAL_4:.*]] = cc.load %[[VAL_3]] : !cc.ptr, i64}>}>> -// CHECK: %[[VAL_7:.*]] = cc.sizeof !cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}> : i64 -// CHECK: %[[VAL_10:.*]] = cc.extract_value %[[VAL_4]][0] : (!cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}>) -> i32 -// CHECK: %[[VAL_15:.*]] = call @__nvqpp__mlirgen__test_1(%[[VAL_10]]) : (i32) -> !cc.stdvec +// CHECK: %[[VAL_2:.*]] = cc.cast %[[VAL_0]] : (!cc.ptr) -> !cc.ptr, i64}>}>> +// CHECK: %[[VAL_3:.*]] = cc.load %[[VAL_2]] : !cc.ptr, i64}>}>> +// CHECK: %[[VAL_4:.*]] = cc.sizeof !cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}> : i64 +// CHECK: %[[VAL_5:.*]] = cc.extract_value %[[VAL_3]][0] : (!cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}>) -> i32 +// CHECK: %[[VAL_6:.*]] = call @__nvqpp__mlirgen__test_1(%[[VAL_5]]) : (i32) -> !cc.stdvec +// CHECK: %[[VAL_7:.*]] = cc.compute_ptr %[[VAL_2]][1] : (!cc.ptr, i64}>}>>) -> !cc.ptr, i64}>> +// CHECK: %[[VAL_8:.*]] = cc.cast %[[VAL_7]] : (!cc.ptr, i64}>>) -> !cc.ptr> +// CHECK: cc.store %[[VAL_6]], %[[VAL_8]] : !cc.ptr> // CHECK: cf.cond_br %[[VAL_1]], ^bb1, ^bb2 // CHECK: ^bb1: -// CHECK: %[[VAL_11:.*]] = cc.compute_ptr %[[VAL_3]][1] : (!cc.ptr, i64}>}>>) -> !cc.ptr, i64}>> -// CHECK: %[[VAL_16:.*]] = cc.cast %[[VAL_11]] : (!cc.ptr, i64}>>) -> !cc.ptr> -// CHECK: cc.store %[[VAL_15]], %[[VAL_16]] : !cc.ptr> -// CHECK: %[[VAL_12:.*]] = cc.cast %[[VAL_11]] : (!cc.ptr, i64}>>) -> !cc.ptr, i64}>> -// CHECK: %[[VAL_13:.*]] = call @__nvqpp_createDynamicResult(%[[VAL_0]], %[[VAL_7]], %[[VAL_12]]) : (!cc.ptr, i64, !cc.ptr, i64}>>) -> !cc.struct<{!cc.ptr, i64}> -// CHECK: return %[[VAL_13]] : !cc.struct<{!cc.ptr, i64}> +// CHECK: %[[VAL_9:.*]] = cc.cast %[[VAL_7]] : (!cc.ptr, i64}>>) -> !cc.ptr, i64}>> +// CHECK: %[[VAL_10:.*]] = cc.offsetof !cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}> [1] : i64 +// CHECK: %[[VAL_11:.*]] = call @__nvqpp_createDynamicResult(%[[VAL_0]], %[[VAL_4]], %[[VAL_9]], %[[VAL_10]]) : (!cc.ptr, i64, !cc.ptr, i64}>>, i64) -> !cc.struct<{!cc.ptr, i64}> +// CHECK: return %[[VAL_11]] : !cc.struct<{!cc.ptr, i64}> // CHECK: ^bb2: -// CHECK: %[[VAL_14:.*]] = call @__nvqpp_zeroDynamicResult() : () -> !cc.struct<{!cc.ptr, i64}> -// CHECK: return %[[VAL_14]] : !cc.struct<{!cc.ptr, i64}> +// CHECK: %[[VAL_12:.*]] = call @__nvqpp_zeroDynamicResult() : () -> !cc.struct<{!cc.ptr, i64}> +// CHECK: return %[[VAL_12]] : !cc.struct<{!cc.ptr, i64}> // CHECK: }
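
The updated CHECK and HYBRID lines above all share one shape: the `altLaunchKernel` / `hybridLaunchKernel` call now returns a `{ptr, i64}` span, the caller extracts the pointer, compares it against zero (`cf.cond_br`), reads the `{ptr, i64}` result slot out of either the returned buffer (^bb1) or its own argument buffer (^bb2), and finally passes the returned pointer to `free`; on the thunk side, `__nvqpp_createDynamicResult` now takes a fourth argument, the `cc.offsetof` of the result slot, so the appended data can be patched into the copied buffer. The sketch below is a minimal C++ model of that control flow for a kernel shaped like `test_0` (an `i32` in, a vector out), intended only as a reading aid for the CHECK lines; `KernelResultSpan`, `ArgsBuffer`, `launchModel`, and `kernelBody` are hypothetical names, not the runtime's actual API.

#include <cstdint>
#include <cstdlib>
#include <cstring>
#include <iostream>
#include <vector>

// Model of the span type the launch calls above now return.
struct KernelResultSpan {
  void *data = nullptr;   // null: the result span is in the caller's buffer
  std::uint64_t size = 0; // total byte size of `data` when non-null
};

// Hypothetical argument buffer for a kernel such as test_0: one i32 argument
// followed by a {ptr, i64} result slot (the slot whose cc.offsetof the stubs
// pass as the trailing i64 launch argument).
struct ArgsBuffer {
  int arg;
  struct {
    void *ptr;
    std::uint64_t len;
  } result;
};

// Stand-in kernel body: returns n copies of n.
static std::vector<int> kernelBody(int n) { return std::vector<int>(n, n); }

// Model launcher. `sameProcess` selects between writing a heap span into the
// caller's buffer (and returning a zero span) or appending the raw data to a
// copy of the buffer, patching the result slot, and returning the new span --
// the latter is the job __nvqpp_createDynamicResult performs above with its
// new offset argument.
KernelResultSpan launchModel(ArgsBuffer *args, bool sameProcess) {
  std::vector<int> out = kernelBody(args->arg);
  std::uint64_t bytes = out.size() * sizeof(int);
  if (sameProcess) {
    void *p = std::malloc(bytes);
    std::memcpy(p, out.data(), bytes);
    args->result = {p, bytes}; // caller reads (and frees) this span
    return {};                 // zero span: "look in the original buffer"
  }
  std::uint64_t total = sizeof(ArgsBuffer) + bytes;
  char *buf = static_cast<char *>(std::malloc(total));
  std::memcpy(buf, args, sizeof(ArgsBuffer));               // copy arguments
  std::memcpy(buf + sizeof(ArgsBuffer), out.data(), bytes); // append data
  auto *copy = reinterpret_cast<ArgsBuffer *>(buf);
  copy->result = {buf + sizeof(ArgsBuffer), bytes}; // patch the result slot
  return {buf, total};
}

int main() {
  for (bool same : {true, false}) {
    ArgsBuffer args{5, {nullptr, 0}};
    KernelResultSpan span = launchModel(&args, same);
    // Caller-side selection, mirroring the cf.cond_br on ptr != 0 above.
    ArgsBuffer *src = span.data ? static_cast<ArgsBuffer *>(span.data) : &args;
    int *begin = static_cast<int *>(src->result.ptr);
    std::vector<int> result(begin, begin + src->result.len / sizeof(int));
    std::free(span.data ? span.data : args.result.ptr);
    std::cout << (same ? "in-process: " : "cross-process: ") << result.size()
              << " elements\n";
  }
  return 0;
}

Running this model yields the same five-element vector through both paths; the only difference the caller observes is which buffer the {ptr, len} slot lives in, which is exactly what the ^bb1/^bb2/^bb3 diamonds in the CHECK and HYBRID sequences encode before the sret std::vector pointers are filled in.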