diff --git a/docs/sphinx/api/languages/cpp_api.rst b/docs/sphinx/api/languages/cpp_api.rst index 1a26fb2d41..7f4728199e 100644 --- a/docs/sphinx/api/languages/cpp_api.rst +++ b/docs/sphinx/api/languages/cpp_api.rst @@ -190,6 +190,10 @@ Platform .. doxygentypedef:: cudaq::KernelExecutionTask +.. doxygenstruct:: cudaq::KernelThunkResultType + +.. doxygentypedef:: cudaq::KernelThunkType + Utilities ========= diff --git a/lib/Optimizer/Builder/Intrinsics.cpp b/lib/Optimizer/Builder/Intrinsics.cpp index 12e430dc03..a6cc0ae477 100644 --- a/lib/Optimizer/Builder/Intrinsics.cpp +++ b/lib/Optimizer/Builder/Intrinsics.cpp @@ -232,9 +232,14 @@ static constexpr IntrinsicCode intrinsicTable[] = { })#"}, {"__nvqpp_createDynamicResult", + /* arguments: + arg0: original buffer ptr + arg1: original buffer size + arg2: ptr to span of the return data: {ptr, bytes} + arg3: offset to result slot in buffer */ {cudaq::llvmMemCopyIntrinsic, "malloc"}, R"#( - func.func private @__nvqpp_createDynamicResult(%arg0: !cc.ptr, %arg1: i64, %arg2: !cc.ptr, i64}>>) -> !cc.struct<{!cc.ptr, i64}> { + func.func private @__nvqpp_createDynamicResult(%arg0: !cc.ptr, %arg1: i64, %arg2: !cc.ptr, i64}>>, %arg3: i64) -> !cc.struct<{!cc.ptr, i64}> { %0 = cc.compute_ptr %arg2[1] : (!cc.ptr, i64}>>) -> !cc.ptr %1 = cc.load %0 : !cc.ptr %2 = arith.addi %arg1, %1 : i64 @@ -249,6 +254,9 @@ static constexpr IntrinsicCode intrinsicTable[] = { %7 = cc.undef !cc.struct<{!cc.ptr, i64}> %8 = cc.insert_value %3, %7[0] : (!cc.struct<{!cc.ptr, i64}>, !cc.ptr) -> !cc.struct<{!cc.ptr, i64}> %9 = cc.insert_value %2, %8[1] : (!cc.struct<{!cc.ptr, i64}>, i64) -> !cc.struct<{!cc.ptr, i64}> + %11 = cc.compute_ptr %10[%arg3] : (!cc.ptr>, i64) -> !cc.ptr + %12 = cc.cast %11 : (!cc.ptr) -> !cc.ptr> + cc.store %6, %12 : !cc.ptr> return %9 : !cc.struct<{!cc.ptr, i64}> })#"}, @@ -319,7 +327,7 @@ static constexpr IntrinsicCode intrinsicTable[] = { {cudaq::runtime::launchKernelFuncName, {}, R"#( - func.func private @altLaunchKernel(!cc.ptr, !cc.ptr, !cc.ptr, i64, i64) -> ())#"}, + func.func private @altLaunchKernel(!cc.ptr, !cc.ptr, !cc.ptr, i64, i64) -> !cc.struct<{!cc.ptr, i64}>)#"}, {cudaq::runtime::CudaqRegisterArgsCreator, {}, @@ -346,7 +354,7 @@ static constexpr IntrinsicCode intrinsicTable[] = { {cudaq::runtime::launchKernelHybridFuncName, {}, R"#( - func.func private @hybridLaunchKernel(!cc.ptr, !cc.ptr, !cc.ptr, i64, i64, !cc.ptr) -> ())#"}, + func.func private @hybridLaunchKernel(!cc.ptr, !cc.ptr, !cc.ptr, i64, i64, !cc.ptr) -> !cc.struct<{!cc.ptr, i64}>)#"}, {cudaq::llvmMemCopyIntrinsic, // llvm.memcpy.p0i8.p0i8.i64 {}, diff --git a/lib/Optimizer/Transforms/GenKernelExecution.cpp b/lib/Optimizer/Transforms/GenKernelExecution.cpp index a4667ce7b5..2e45c8df96 100644 --- a/lib/Optimizer/Transforms/GenKernelExecution.cpp +++ b/lib/Optimizer/Transforms/GenKernelExecution.cpp @@ -48,6 +48,12 @@ static bool isCodegenArgumentGather(std::size_t kind) { return kind == 0 || kind == 2; } +/// This pass adds a `.thunk` function and a rewritten C++ host +/// side (mangled) stub to the code for every entry-point kernel in the module. +/// It may also generate a `.argsCreator` function. Finally, it +/// creates registration hooks for the CUDA-Q runtime to be able to find the +/// kernel by name and, as appropriate, the `.argsCreator` +/// function. namespace { class GenerateKernelExecution : public cudaq::opt::impl::GenerateKernelExecutionBase< @@ -57,6 +63,19 @@ class GenerateKernelExecution /// Creates the function signature for a thunk function. 
The signature is /// always the same for all thunk functions. + /// + /// Every thunk function has an identical signature, making it callable from a + /// generic "kernel launcher" in the CUDA-Q runtime. + /// + /// This signature is defined as: `(ptr, bool) -> {ptr, i64}`. + /// + /// The first argument is a pointer to a data buffer that encodes all the + /// arguments (and static return) values to (and from) the kernel in the + /// pointer-free encoding. The second argument indicates if this call is to a + /// remote process (if true). The result is a pointer and size (span) if the + /// kernel returns a dynamically sized result, otherwise it will be + /// `{nullptr, 0}`. It is the responsibility of calling code to free any + /// dynamic result buffer(s) and convert those to `std::vector` objects. FunctionType getThunkType(MLIRContext *ctx) { auto ptrTy = cudaq::cc::PointerType::get(IntegerType::get(ctx, 8)); return FunctionType::get(ctx, {ptrTy, IntegerType::get(ctx, 1)}, @@ -769,31 +788,32 @@ class GenerateKernelExecution auto *thenBlock = builder.createBlock(reg); auto *elseBlock = builder.createBlock(reg); builder.setInsertionPointToEnd(currentBlock); + auto eleTy = structTy.getMember(offset); + auto memTy = cudaq::cc::PointerType::get(eleTy); + auto mem = builder.create( + loc, memTy, castOp, SmallVector{offset}); + auto resPtrTy = cudaq::cc::PointerType::get(call.getResult(0).getType()); + auto castMem = builder.create(loc, resPtrTy, mem); + builder.create(loc, call.getResult(0), castMem); builder.create(loc, isClientServer, thenBlock, elseBlock); builder.setInsertionPointToEnd(thenBlock); - auto gepRes = builder.create( - loc, cudaq::cc::PointerType::get(structTy.getMember(offset)), castOp, - ArrayRef{offset}); - auto resAsVec = builder.create( - loc, cudaq::cc::PointerType::get(funcTy.getResult(0)), gepRes); - builder.create(loc, call.getResult(0), resAsVec); auto resAsArg = builder.create( - loc, cudaq::cc::PointerType::get(thunkTy.getResults()[0]), gepRes); - // createDynamicResult packs the input values and the dynamic results - // into a single buffer to pass back as a message. + loc, cudaq::cc::PointerType::get(thunkTy.getResults()[0]), mem); + auto retOffset = genComputeReturnOffset(loc, builder, funcTy, structTy); + // createDynamicResult allocates a new buffer and packs the input values + // and the dynamic results into this single new buffer to pass back as a + // message. + // NB: This code only handles one dimensional vectors of static types. It + // will have to be changed if there is a need to return recursively + // dynamic structures, i.e., vectors of vectors. auto res = builder.create( loc, thunkTy.getResults()[0], "__nvqpp_createDynamicResult", - ValueRange{thunkEntry->getArgument(0), structSize, resAsArg}); + ValueRange{thunkEntry->getArgument(0), structSize, resAsArg, + retOffset}); builder.create(loc, res.getResult(0)); builder.setInsertionPointToEnd(elseBlock); - auto eleTy = structTy.getMember(offset); - auto memTy = cudaq::cc::PointerType::get(eleTy); - auto mem = builder.create( - loc, memTy, castOp, SmallVector{offset}); - auto resPtrTy = cudaq::cc::PointerType::get(call.getResult(0).getType()); - auto castMem = builder.create(loc, resPtrTy, mem); - builder.create(loc, call.getResult(0), castMem); + // For the else case, the span was already copied to the block. } else { // FIXME: Should check for recursive vector case. 
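To make the repacking step concrete: a behavioral C++ sketch of what `__nvqpp_createDynamicResult` now does with its new fourth argument, reconstructed from the argument comments in Intrinsics.cpp above. The `Span` struct and the function signature are illustrative stand-ins for the `{ptr, i64}` ABI type, not the runtime's actual declarations.

```cpp
#include <cstdint>
#include <cstdlib>
#include <cstring>

// Illustrative stand-in for the {ptr, i64} span type used by the ABI.
struct Span {
  char *ptr;
  std::uint64_t bytes;
};

// Behavioral sketch of __nvqpp_createDynamicResult: build a single
// self-contained message consisting of the original argument/result
// buffer followed by the dynamically sized payload.
Span createDynamicResult(char *origBuffer, std::uint64_t origSize,
                         Span *dynRes, std::uint64_t resultOffset) {
  std::uint64_t total = origSize + dynRes->bytes;
  char *buf = static_cast<char *>(std::malloc(total));
  std::memcpy(buf, origBuffer, origSize);        // copy original buffer
  char *tail = buf + origSize;
  std::memcpy(tail, dynRes->ptr, dynRes->bytes); // append dynamic data
  // New in this patch: use the offset argument (%arg3) to patch the
  // result slot inside the copied buffer so that it references the
  // appended payload rather than pointing into memory that will be
  // invalid on the receiving side.
  auto *slot = reinterpret_cast<Span *>(buf + resultOffset);
  slot->ptr = tail;
  return {buf, total};
}
```

Per the comment on `getThunkType`, the code that invokes the thunk owns the returned buffer and must `free` it after converting the span into a `std::vector`.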
// If the kernel returns non-dynamic results (no spans), then take those @@ -854,8 +874,6 @@ class GenerateKernelExecution auto ptrPtrTy = cudaq::cc::PointerType::get(ptrTy); auto sret0 = builder.create( loc, ptrPtrTy, castSret, SmallVector{0}); - Value vecPtr = builder.create(loc, ptrTy, sret0); - builder.create(loc, std::nullopt, "free", ValueRange{vecPtr}); auto arrI8Ty = cudaq::cc::ArrayType::get(i8Ty); auto ptrArrTy = cudaq::cc::PointerType::get(arrI8Ty); auto buffPtr0 = builder.create(loc, ptrTy, data); @@ -1338,21 +1356,72 @@ class GenerateKernelExecution auto castLoadKernName = builder.create(loc, ptrI8Ty, loadKernName); + auto hostFuncTy = hostFunc.getFunctionType(); + assert((hostFuncTy.getResults().empty() || + (hostFuncTy.getNumResults() == 1)) && + "C++ function expected to have 0 or 1 return value"); + const bool resultVal = !hostFuncTy.getResults().empty(); + const bool kernelReturnsValue = + resultVal || cudaq::opt::factory::hasSRet(hostFunc); + Value launchResult; + Value launchResultToFree; + auto decodeLaunchResults = [&](Value spanReturned) { + if (!kernelReturnsValue) + return; + Type res0Ty = structTy.getMember(offset); + auto ptrResTy = cudaq::cc::PointerType::get(res0Ty); + auto rptr = builder.create(loc, ptrI8Ty, + spanReturned, 0); + launchResultToFree = rptr; + auto rIntPtr = builder.create(loc, i64Ty, rptr); + auto zero = builder.create(loc, 0, 64); + auto cmp = builder.create(loc, arith::CmpIPredicate::ne, + rIntPtr, zero); + auto *currentBlock = builder.getBlock(); + auto *reg = currentBlock->getParent(); + auto *thenBlock = builder.createBlock(reg); + auto *elseBlock = builder.createBlock(reg); + auto *endifBlock = builder.createBlock( + reg, reg->end(), TypeRange{ptrResTy}, SmallVector(1, loc)); + builder.setInsertionPointToEnd(currentBlock); + builder.create(loc, cmp, thenBlock, elseBlock); + builder.setInsertionPointToEnd(thenBlock); + // dynamic result was returned. + // We need to free() this buffer before the end of this function. + auto rStructPtr = + builder.create(loc, structPtrTy, rptr); + Value lRes = builder.create( + loc, ptrResTy, rStructPtr, + ArrayRef{offset}); + builder.create(loc, endifBlock, ArrayRef{lRes}); + builder.setInsertionPointToEnd(elseBlock); + // span was returned in the original buffer. + Value mRes = builder.create( + loc, ptrResTy, temp, ArrayRef{0, offset}); + builder.create(loc, endifBlock, ArrayRef{mRes}); + builder.setInsertionPointToEnd(endifBlock); + launchResult = endifBlock->getArgument(0); + }; + // Generate the call to `launchKernel`. 
switch (codegenKind) { case 0: { assert(vecArgPtrs && castLoadThunk); - builder.create( - loc, std::nullopt, cudaq::runtime::launchKernelHybridFuncName, + auto launch = builder.create( + loc, cudaq::opt::factory::getDynamicBufferType(ctx), + cudaq::runtime::launchKernelHybridFuncName, ArrayRef{castLoadKernName, castLoadThunk, castTemp, extendedStructSize, resultOffset, vecArgPtrs}); + decodeLaunchResults(launch.getResult(0)); } break; case 1: { assert(!vecArgPtrs && castLoadThunk); - builder.create( - loc, std::nullopt, cudaq::runtime::launchKernelFuncName, + auto launch = builder.create( + loc, cudaq::opt::factory::getDynamicBufferType(ctx), + cudaq::runtime::launchKernelFuncName, ArrayRef{castLoadKernName, castLoadThunk, castTemp, extendedStructSize, resultOffset}); + decodeLaunchResults(launch.getResult(0)); } break; case 2: { assert(vecArgPtrs && !castLoadThunk); @@ -1377,17 +1446,13 @@ class GenerateKernelExecution // result value(s) from the struct returned by `launchKernel` and return // them to our caller. SmallVector results; - auto hostFuncTy = hostFunc.getFunctionType(); - assert((hostFuncTy.getResults().empty() || - (hostFuncTy.getNumResults() == 1)) && - "C++ function expected to have 0 or 1 return value"); - const bool resultVal = !hostFuncTy.getResults().empty(); - if (resultVal || cudaq::opt::factory::hasSRet(hostFunc)) { + if (kernelReturnsValue) { + Type res0Ty = structTy.getMember(offset); + auto ptrResTy = cudaq::cc::PointerType::get(res0Ty); // Host function returns a value. Either returning by value or via an sret // reference. if (resultVal) { - Type res0Ty = structTy.getMember(offset); - auto ptrResTy = cudaq::cc::PointerType::get(res0Ty); + // Static values. std::vector are necessarily sret, see below. auto resPtr = builder.create( loc, ptrResTy, temp, ArrayRef{0, offset}); Type castToTy = cudaq::cc::PointerType::get(hostFuncTy.getResult(0)); @@ -1398,22 +1463,22 @@ class GenerateKernelExecution }(); results.push_back(builder.create(loc, castResPtr)); } else { - // Check if device is returning a span. If it is, then we will need to - // convert it to a std::vector here. The vector is constructed in-place - // on the sret memory block. + // This is an sret return. Check if device is returning a span. If it + // is, then we will need to convert it to a std::vector here. The vector + // is constructed in-place on the sret memory block. Value arg0 = hostFuncEntryBlock->getArguments().front(); if (auto spanTy = dyn_cast(devFuncTy.getResult(0))) { auto eleTy = spanTy.getElementType(); auto ptrTy = cudaq::cc::PointerType::get(eleTy); auto gep0 = builder.create( - loc, cudaq::cc::PointerType::get(ptrTy), temp, - SmallVector{0, offset, 0}); + loc, cudaq::cc::PointerType::get(ptrTy), launchResult, + SmallVector{0}); auto dataPtr = builder.create(loc, gep0); auto lenPtrTy = cudaq::cc::PointerType::get(i64Ty); auto gep1 = builder.create( - loc, lenPtrTy, temp, - SmallVector{0, offset, 1}); + loc, lenPtrTy, launchResult, + SmallVector{1}); auto vecLen = builder.create(loc, gep1); if (spanTy.getElementType() == builder.getI1Type()) { genStdvecBoolFromInitList(loc, builder, arg0, dataPtr, vecLen); @@ -1422,13 +1487,14 @@ class GenerateKernelExecution builder.create(loc, i64Ty, eleTy); genStdvecTFromInitList(loc, builder, arg0, dataPtr, tSize, vecLen); } + // free(nullptr) is defined to be a nop in the standard. + builder.create(loc, std::nullopt, "free", + ArrayRef{launchResultToFree}); } else { // Otherwise, we can just copy the aggregate into the sret memory // block. 
Uses the size of the host function's sret pointer element // type for the memcpy, so the device should return an (aggregate) // value of suitable size. - Type res0Ty = structTy.getMember(offset); - auto ptrResTy = cudaq::cc::PointerType::get(res0Ty); auto resPtr = builder.create( loc, ptrResTy, temp, ArrayRef{0, offset}); diff --git a/lib/Optimizer/Transforms/LiftArrayAlloc.cpp b/lib/Optimizer/Transforms/LiftArrayAlloc.cpp index 9328b78896..689be49998 100644 --- a/lib/Optimizer/Transforms/LiftArrayAlloc.cpp +++ b/lib/Optimizer/Transforms/LiftArrayAlloc.cpp @@ -72,19 +72,17 @@ LogicalResult genVectorOfConstantsFromAttributes(cudaq::IRBuilder irBuilder, return success(); } } - } else if (auto floatTy = dyn_cast(eleTy)) { - if (floatTy == irBuilder.getF64Type()) { - auto vals = readConstantValues(values, floatTy); - if (vals.size() == values.size()) { - irBuilder.genVectorOfConstants(loc, module, name, vals); - return success(); - } - } else if (floatTy == irBuilder.getF32Type()) { - auto vals = readConstantValues(values, floatTy); - if (vals.size() == values.size()) { - irBuilder.genVectorOfConstants(loc, module, name, vals); - return success(); - } + } else if (eleTy == irBuilder.getF64Type()) { + auto vals = readConstantValues(values, eleTy); + if (vals.size() == values.size()) { + irBuilder.genVectorOfConstants(loc, module, name, vals); + return success(); + } + } else if (eleTy == irBuilder.getF32Type()) { + auto vals = readConstantValues(values, eleTy); + if (vals.size() == values.size()) { + irBuilder.genVectorOfConstants(loc, module, name, vals); + return success(); } } return failure(); @@ -147,7 +145,9 @@ class AllocaPattern : public OpRewritePattern { rewriter.create(loc, arrTy, valuesAttr); } - SmallVector toErase; + assert(conArr && "must have created the constant array"); + LLVM_DEBUG(llvm::dbgs() << "constant array is:\n" << conArr << '\n'); + bool cannotEraseAlloc = false; // Rewalk all the uses of alloc, u, which must be cc.cast or cc.compute_ptr. // For each,u, remove a store and replace a load with a cc.extract_value. @@ -176,8 +176,9 @@ class AllocaPattern : public OpRewritePattern { continue; } if (isa(useuser)) - toErase.push_back(useuser); - isLive = true; + rewriter.eraseOp(useuser); + LLVM_DEBUG(llvm::dbgs() << "alloc is live\n"); + cannotEraseAlloc = isLive = true; } if (auto ist = dyn_cast(user)) { rewriter.setInsertionPointAfter(user); @@ -188,20 +189,20 @@ class AllocaPattern : public OpRewritePattern { continue; } if (!isLive) - toErase.push_back(user); - } - if (toGlobal) { - if (conGlobal) { - rewriter.setInsertionPointAfter(alloc); - rewriter.replaceOp(alloc, conGlobal); - } - } else { - toErase.push_back(alloc); + rewriter.eraseOp(user); } - for (auto *op : toErase) - rewriter.eraseOp(op); - + if (toGlobal && conGlobal) { + rewriter.setInsertionPointAfter(alloc); + rewriter.replaceOp(alloc, conGlobal); + return success(); + } + if (cannotEraseAlloc) { + rewriter.setInsertionPointAfter(alloc); + rewriter.create(loc, conArr, alloc); + return success(); + } + rewriter.eraseOp(alloc); return success(); } @@ -305,12 +306,16 @@ class AllocaPattern : public OpRewritePattern { } // Process casts that are used in quake.init_state. 
if (cast.getType() == ptrUnsizedArrTy) { - if (getWriteOp(cast, 0)) - LLVM_DEBUG( - llvm::dbgs() - << "unexpected use of array size removing cast in a store" - << *op << '\n'); - continue; + if (cast->hasOneUse()) { + auto &use = *cast->getUses().begin(); + Operation *u = use.getOwner(); + if (isa_and_present(u)) { + toGlobalUses.push_back(op); + toGlobal = true; + continue; + } + } + return false; } LLVM_DEBUG(llvm::dbgs() << "unexpected cast: " << *op << '\n'); toGlobalUses.push_back(op); diff --git a/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp b/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp index b91627de9f..5a197f97a6 100644 --- a/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp +++ b/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp @@ -254,7 +254,7 @@ pyAltLaunchKernelBase(const std::string &name, MlirModule module, if (!thunkPtr) throw std::runtime_error("cudaq::builder failed to get thunk function"); - auto thunk = reinterpret_cast(*thunkPtr); + auto thunk = reinterpret_cast(*thunkPtr); std::string properName = name; @@ -327,15 +327,21 @@ pyAltLaunchKernelBase(const std::string &name, MlirModule module, if (launch) { auto &platform = cudaq::get_platform(); + auto uReturnOffset = static_cast(returnOffset); if (platform.is_remote() || platform.is_emulated()) { auto *wrapper = new cudaq::ArgWrapper{mod, names, rawArgs}; - cudaq::altLaunchKernel(name.c_str(), thunk, - reinterpret_cast(wrapper), size, - (uint64_t)returnOffset); + auto dynamicResult = cudaq::altLaunchKernel( + name.c_str(), thunk, reinterpret_cast(wrapper), size, + uReturnOffset); + if (dynamicResult.data_buffer || dynamicResult.size) + throw std::runtime_error("not implemented: support dynamic results"); delete wrapper; - } else - cudaq::altLaunchKernel(name.c_str(), thunk, rawArgs, size, - (uint64_t)returnOffset); + } else { + auto dynamicResult = cudaq::altLaunchKernel(name.c_str(), thunk, rawArgs, + size, uReturnOffset); + if (dynamicResult.data_buffer || dynamicResult.size) + throw std::runtime_error("not implemented: support dynamic results"); + } } return std::make_tuple(rawArgs, size, returnOffset); diff --git a/python/runtime/utils/PyRemoteSimulatorQPU.cpp b/python/runtime/utils/PyRemoteSimulatorQPU.cpp index 4cc998c363..f767bb652a 100644 --- a/python/runtime/utils/PyRemoteSimulatorQPU.cpp +++ b/python/runtime/utils/PyRemoteSimulatorQPU.cpp @@ -130,16 +130,19 @@ class PyRemoteSimulatorQPU : public cudaq::BaseRemoteSimulatorQPU { kernelArgs, gradient, H, optimizer, n_params, shots); } - void launchKernel(const std::string &name, void (*kernelFunc)(void *), - void *args, std::uint64_t voidStarSize, - std::uint64_t resultOffset, - const std::vector &rawArgs) override { + cudaq::KernelThunkResultType + launchKernel(const std::string &name, cudaq::KernelThunkType kernelFunc, + void *args, std::uint64_t voidStarSize, + std::uint64_t resultOffset, + const std::vector &rawArgs) override { cudaq::info("PyRemoteSimulatorQPU: Launch kernel named '{}' remote QPU {} " "(simulator = {})", name, qpu_id, m_simName); ::launchKernelImpl(getExecutionContextForMyThread(), m_client, m_simName, - name, kernelFunc, args, voidStarSize, resultOffset, - rawArgs); + name, make_degenerate_kernel_type(kernelFunc), args, + voidStarSize, resultOffset, rawArgs); + // TODO: Python should probably support return values too. 
+ return {}; } void launchKernel(const std::string &name, @@ -178,16 +181,19 @@ class PyNvcfSimulatorQPU : public cudaq::BaseNvcfSimulatorQPU { kernelArgs, gradient, H, optimizer, n_params, shots); } - void launchKernel(const std::string &name, void (*kernelFunc)(void *), - void *args, std::uint64_t voidStarSize, - std::uint64_t resultOffset, - const std::vector &rawArgs) override { + cudaq::KernelThunkResultType + launchKernel(const std::string &name, cudaq::KernelThunkType kernelFunc, + void *args, std::uint64_t voidStarSize, + std::uint64_t resultOffset, + const std::vector &rawArgs) override { cudaq::info("PyNvcfSimulatorQPU: Launch kernel named '{}' remote QPU {} " "(simulator = {})", name, qpu_id, m_simName); ::launchKernelImpl(getExecutionContextForMyThread(), m_client, m_simName, - name, kernelFunc, args, voidStarSize, resultOffset, - rawArgs); + name, make_degenerate_kernel_type(kernelFunc), args, + voidStarSize, resultOffset, rawArgs); + // TODO: Python should probably support return values too. + return {}; } void launchKernel(const std::string &name, diff --git a/runtime/common/BaseRemoteRESTQPU.h b/runtime/common/BaseRemoteRESTQPU.h index 61c26dc791..c65e94c3b6 100644 --- a/runtime/common/BaseRemoteRESTQPU.h +++ b/runtime/common/BaseRemoteRESTQPU.h @@ -578,10 +578,11 @@ class BaseRemoteRESTQPU : public cudaq::QPU { /// the representation required by the targeted backend. Handle all pertinent /// modifications for the execution context as well as asynchronous or /// synchronous invocation. - void launchKernel(const std::string &kernelName, void (*kernelFunc)(void *), - void *args, std::uint64_t voidStarSize, - std::uint64_t resultOffset, - const std::vector &rawArgs) override { + KernelThunkResultType + launchKernel(const std::string &kernelName, KernelThunkType kernelFunc, + void *args, std::uint64_t voidStarSize, + std::uint64_t resultOffset, + const std::vector &rawArgs) override { cudaq::info("launching remote rest kernel ({})", kernelName); // TODO future iterations of this should support non-void return types. @@ -597,6 +598,9 @@ class BaseRemoteRESTQPU : public cudaq::QPU { auto codes = rawArgs.empty() ? lowerQuakeCode(kernelName, args) : lowerQuakeCode(kernelName, rawArgs); completeLaunchKernel(kernelName, std::move(codes)); + + // NB: Kernel should/will never return dynamic results. + return {}; } void completeLaunchKernel(const std::string &kernelName, diff --git a/runtime/common/BaseRemoteSimulatorQPU.h b/runtime/common/BaseRemoteSimulatorQPU.h index 667fba5941..6260f334c1 100644 --- a/runtime/common/BaseRemoteSimulatorQPU.h +++ b/runtime/common/BaseRemoteSimulatorQPU.h @@ -107,22 +107,24 @@ class BaseRemoteSimulatorQPU : public cudaq::QPU { void launchKernel(const std::string &name, const std::vector &rawArgs) override { - launchKernelImpl(name, nullptr, nullptr, 0, 0, &rawArgs); + [[maybe_unused]] auto dynamicResult = + launchKernelImpl(name, nullptr, nullptr, 0, 0, &rawArgs); } - void launchKernel(const std::string &name, void (*kernelFunc)(void *), - void *args, std::uint64_t voidStarSize, - std::uint64_t resultOffset, - const std::vector &rawArgs) override { + KernelThunkResultType + launchKernel(const std::string &name, KernelThunkType kernelFunc, void *args, + std::uint64_t voidStarSize, std::uint64_t resultOffset, + const std::vector &rawArgs) override { // Remote simulation cannot deal with rawArgs. Drop them on the floor. 
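These remote paths keep handing a plain `void (*)(void *)` to the existing client API by shedding the new thunk shape. A usage sketch of `make_degenerate_kernel_type` (defined in the new `ThunkInterface.h` later in this diff), with `legacyLaunch` as a hypothetical stand-in for such a legacy slot:

```cpp
#include "common/ThunkInterface.h"

// Hypothetical legacy entry point that only understands the
// degenerate void(*)(void*) kernel shape.
extern void legacyLaunch(void (*kernel)(void *), void *args);

// Sound only under the caveats spelled out in ThunkInterface.h: the
// callee must ignore the extra bool argument, and any span the kernel
// returns is dropped on the floor anyway.
void launchViaLegacySlot(cudaq::KernelThunkType thunk, void *args) {
  cudaq::KernelDegenerateType degenerate =
      cudaq::make_degenerate_kernel_type(thunk);
  legacyLaunch(degenerate, args);
}
```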
- launchKernelImpl(name, kernelFunc, args, voidStarSize, resultOffset, - nullptr); + return launchKernelImpl(name, kernelFunc, args, voidStarSize, resultOffset, + nullptr); } - void launchKernelImpl(const std::string &name, void (*kernelFunc)(void *), - void *args, std::uint64_t voidStarSize, - std::uint64_t resultOffset, - const std::vector *rawArgs) { + [[nodiscard]] KernelThunkResultType + launchKernelImpl(const std::string &name, KernelThunkType kernelFunc, + void *args, std::uint64_t voidStarSize, + std::uint64_t resultOffset, + const std::vector *rawArgs) { cudaq::info( "BaseRemoteSimulatorQPU: Launch kernel named '{}' remote QPU {} " "(simulator = {})", @@ -132,7 +134,7 @@ class BaseRemoteSimulatorQPU : public cudaq::QPU { getExecutionContextForMyThread(); if (executionContextPtr && executionContextPtr->name == "tracer") { - return; + return {}; } // Default context for a 'fire-and-ignore' kernel launch; i.e., no context @@ -155,7 +157,8 @@ class BaseRemoteSimulatorQPU : public cudaq::QPU { const bool requestOkay = m_client->sendRequest( *m_mlirContext, executionContext, /*serializedCodeContext=*/nullptr, /*vqe_gradient=*/nullptr, /*vqe_optimizer=*/nullptr, /*vqe_n_params=*/0, - m_simName, name, kernelFunc, args, voidStarSize, &errorMsg, rawArgs); + m_simName, name, make_degenerate_kernel_type(kernelFunc), args, + voidStarSize, &errorMsg, rawArgs); if (!requestOkay) throw std::runtime_error("Failed to launch kernel. Error: " + errorMsg); if (isDirectInvocation && @@ -182,6 +185,9 @@ class BaseRemoteSimulatorQPU : public cudaq::QPU { executionContext.invocationResultBuffer.size()); executionContext.invocationResultBuffer.clear(); } + + // Assumes kernel has no dynamic results. (Static result handled above.) + return {}; } void diff --git a/runtime/common/KernelWrapper.h b/runtime/common/KernelWrapper.h index 6c30efa58d..2e82522d91 100644 --- a/runtime/common/KernelWrapper.h +++ b/runtime/common/KernelWrapper.h @@ -537,17 +537,19 @@ std::invoke_result_t invokeKernel(QuantumKernel &&fn, // For raw function pointers, i.e., kernels described as free functions, we // send on the function pointer to the platform to retrieve the symbol name // since the typeid of a function only contains signature info. - if constexpr (std::is_class_v>) + if constexpr (std::is_class_v>) { // FIXME: this shouldn't use the serialization code any longer. It should // build a vector of void* and pass that instead. cudaq::get_platform().launchKernel(cudaq::getKernelName(fn), nullptr, (void *)serializedArgsBuffer.data(), serializedArgsBuffer.size(), 0, {}); - else + } else { cudaq::get_platform().launchKernel( - cudaq::getKernelName(fn), reinterpret_cast(&fn), + cudaq::getKernelName(fn), + reinterpret_cast(&fn), (void *)serializedArgsBuffer.data(), serializedArgsBuffer.size(), 0, {}); + } } #else return fn(std::forward(args)...); diff --git a/runtime/common/ThunkInterface.h b/runtime/common/ThunkInterface.h new file mode 100644 index 0000000000..05aeec37a3 --- /dev/null +++ b/runtime/common/ThunkInterface.h @@ -0,0 +1,44 @@ +/****************************************************************-*- C++ -*-**** + * Copyright (c) 2022 - 2024 NVIDIA Corporation & Affiliates. * + * All rights reserved. * + * * + * This source code and the accompanying materials are made available under * + * the terms of the Apache License 2.0 which accompanies this distribution. 
* + ******************************************************************************/ + +#pragma once + +#include + +namespace cudaq { + +/// A kernel may return results dynamically if the size of the result is not a +/// constant at compile-time. +struct KernelThunkResultType { + void *data_buffer; ///< Pointer to the first element of an array. + std::uint64_t size; ///< The size of the buffer in bytes. +}; + +/// The universal signature of a kernel thunk. +using KernelThunkType = KernelThunkResultType (*)(void *, bool); + +/// The degenerate form of a kernel call. In some launch cases, it may be +/// predetermined that the kernel can be called without a thunk. +using KernelDegenerateType = void (*)(void *); + +/// In some cases, the launcher will bypass the thunk function and call a +/// degenerate stub. That means that the extra `bool` argument will be ignored +/// by the called kernel and the kernel will not return a dynamic result. +/// +/// This is a terrible idea, generally speaking. However, if the launcher +/// neither looks for nor attempts to use the second `bool` argument at all, and +/// the launcher will drop any results returned from the kernel (regardless of +/// type) on the floor anyway, then one may be able to get away with using a +/// degenerate kernel type. +inline KernelDegenerateType +make_degenerate_kernel_type(KernelThunkType func_type) { + return reinterpret_cast( + reinterpret_cast(func_type)); +} + +} // namespace cudaq diff --git a/runtime/cudaq/platform/default/DefaultQuantumPlatform.cpp b/runtime/cudaq/platform/default/DefaultQuantumPlatform.cpp index 1b8d0b1141..df8a89e6f4 100644 --- a/runtime/cudaq/platform/default/DefaultQuantumPlatform.cpp +++ b/runtime/cudaq/platform/default/DefaultQuantumPlatform.cpp @@ -33,11 +33,12 @@ class DefaultQPU : public cudaq::QPU { execution_queue->enqueue(task); } - void launchKernel(const std::string &name, void (*kernelFunc)(void *), - void *args, std::uint64_t, std::uint64_t, - const std::vector &rawArgs) override { + cudaq::KernelThunkResultType + launchKernel(const std::string &name, cudaq::KernelThunkType kernelFunc, + void *args, std::uint64_t argsSize, std::uint64_t resultOffset, + const std::vector &rawArgs) override { ScopedTraceWithContext(cudaq::TIMING_LAUNCH, "QPU::launchKernel"); - kernelFunc(args); + return kernelFunc(args, /*isRemote=*/false); } /// Overrides setExecutionContext to forward it to the ExecutionManager diff --git a/runtime/cudaq/platform/mqpu/custatevec/GPUEmulatedQPU.cpp b/runtime/cudaq/platform/mqpu/custatevec/GPUEmulatedQPU.cpp index 1243e9f480..38b26f2a98 100644 --- a/runtime/cudaq/platform/mqpu/custatevec/GPUEmulatedQPU.cpp +++ b/runtime/cudaq/platform/mqpu/custatevec/GPUEmulatedQPU.cpp @@ -37,12 +37,13 @@ class GPUEmulatedQPU : public cudaq::QPU { execution_queue->enqueue(task); } - void launchKernel(const std::string &name, void (*kernelFunc)(void *), - void *args, std::uint64_t, std::uint64_t, - const std::vector &rawArgs) override { + cudaq::KernelThunkResultType + launchKernel(const std::string &name, cudaq::KernelThunkType kernelFunc, + void *args, std::uint64_t, std::uint64_t, + const std::vector &rawArgs) override { cudaq::info("QPU::launchKernel GPU {}", qpu_id); cudaSetDevice(qpu_id); - kernelFunc(args); + return kernelFunc(args, /*differentMemorySpace=*/false); } /// Overrides setExecutionContext to forward it to the ExecutionManager diff --git a/runtime/cudaq/platform/orca/OrcaQPU.cpp b/runtime/cudaq/platform/orca/OrcaQPU.cpp index 63883a7af3..fdb053bf87 100644 --- 
a/runtime/cudaq/platform/orca/OrcaQPU.cpp
+++ b/runtime/cudaq/platform/orca/OrcaQPU.cpp
@@ -23,8 +23,8 @@ cudaq::sample_result runSampling(TBIParameters &parameters,
   platform.set_exec_ctx(ctx.get(), qpu_id);
   platform.set_current_qpu(qpu_id);
 
-  cudaq::altLaunchKernel("orca_launch", nullptr, &parameters,
-                         sizeof(TBIParameters), 0);
+  [[maybe_unused]] auto dynamicResult = cudaq::altLaunchKernel(
+      "orca_launch", nullptr, &parameters, sizeof(TBIParameters), 0);
 
   platform.reset_exec_ctx(qpu_id);
   return ctx->result;
@@ -43,8 +43,8 @@ async_sample_result runAsyncSampling(TBIParameters &parameters,
   platform.set_exec_ctx(ctx.get(), qpu_id);
   platform.set_current_qpu(qpu_id);
 
-  cudaq::altLaunchKernel("orca_launch", nullptr, &parameters,
-                         sizeof(TBIParameters), 0);
+  [[maybe_unused]] auto dynamicResult = cudaq::altLaunchKernel(
+      "orca_launch", nullptr, &parameters, sizeof(TBIParameters), 0);
 
   // If we have a non-null future, set it
   futureResult = ctx->futureResult;
diff --git a/runtime/cudaq/platform/orca/OrcaRemoteRESTQPU.cpp b/runtime/cudaq/platform/orca/OrcaRemoteRESTQPU.cpp
index f834136fc4..1c63c92c2b 100644
--- a/runtime/cudaq/platform/orca/OrcaRemoteRESTQPU.cpp
+++ b/runtime/cudaq/platform/orca/OrcaRemoteRESTQPU.cpp
@@ -59,11 +59,10 @@ void OrcaRemoteRESTQPU::setTargetBackend(const std::string &backend) {
 }
 
 /// @brief Launch the experiment.
-void OrcaRemoteRESTQPU::launchKernel(const std::string &kernelName,
-                                     void (*kernelFunc)(void *), void *args,
-                                     std::uint64_t voidStarSize,
-                                     std::uint64_t resultOffset,
-                                     const std::vector<void *> &rawArgs) {
+KernelThunkResultType OrcaRemoteRESTQPU::launchKernel(
+    const std::string &kernelName, KernelThunkType kernelFunc, void *args,
+    std::uint64_t voidStarSize, std::uint64_t resultOffset,
+    const std::vector<void *> &rawArgs) {
   cudaq::info("OrcaRemoteRESTQPU: Launch kernel named '{}' remote QPU {}",
               kernelName, qpu_id);
 
@@ -88,12 +87,15 @@ void OrcaRemoteRESTQPU::launchKernel(const std::string &kernelName,
   // Keep this asynchronous if requested
   if (ctx->asyncExec) {
     ctx->futureResult = future;
-    return;
+    return {};
   }
 
   // Otherwise make this synchronous
   ctx->result = future.get();
+
+  // TODO: support dynamic result types.
+  return {};
 }
 } // namespace cudaq
 
-CUDAQ_REGISTER_TYPE(cudaq::QPU, cudaq::OrcaRemoteRESTQPU, orca)
\ No newline at end of file
+CUDAQ_REGISTER_TYPE(cudaq::QPU, cudaq::OrcaRemoteRESTQPU, orca)
diff --git a/runtime/cudaq/platform/orca/OrcaRemoteRESTQPU.h b/runtime/cudaq/platform/orca/OrcaRemoteRESTQPU.h
index 80d2df5726..d14a5f4e3c 100644
--- a/runtime/cudaq/platform/orca/OrcaRemoteRESTQPU.h
+++ b/runtime/cudaq/platform/orca/OrcaRemoteRESTQPU.h
@@ -120,10 +120,11 @@ class OrcaRemoteRESTQPU : public cudaq::QPU {
   /// @brief Launch the kernel. Handle all pertinent modifications for the
   /// execution context.

- void launchKernel(const std::string &kernelName, void (*kernelFunc)(void *), - void *args, std::uint64_t voidStarSize, - std::uint64_t resultOffset, - const std::vector &rawArgs) override; + KernelThunkResultType + launchKernel(const std::string &kernelName, KernelThunkType kernelFunc, + void *args, std::uint64_t voidStarSize, + std::uint64_t resultOffset, + const std::vector &rawArgs) override; void launchKernel(const std::string &kernelName, const std::vector &rawArgs) override { throw std::runtime_error("launch kernel on raw args not implemented"); diff --git a/runtime/cudaq/platform/qpu.h b/runtime/cudaq/platform/qpu.h index 13a6d7da25..d104094a1e 100644 --- a/runtime/cudaq/platform/qpu.h +++ b/runtime/cudaq/platform/qpu.h @@ -11,12 +11,12 @@ #include "QuantumExecutionQueue.h" #include "common/Logger.h" #include "common/Registry.h" +#include "common/ThunkInterface.h" #include "common/Timing.h" #include "cudaq/qis/execution_manager.h" #include "cudaq/qis/qubit_qis.h" #include "cudaq/remote_capabilities.h" #include "cudaq/utils/cudaq_utils.h" - #include namespace cudaq { @@ -172,9 +172,10 @@ class QPU : public registry::RegisteredType { /// Launch the kernel with given name (to extract its Quake representation). /// The raw function pointer is also provided, as are the runtime arguments, /// as a struct-packed void pointer and its corresponding size. - virtual void launchKernel(const std::string &name, void (*kernelFunc)(void *), - void *args, std::uint64_t, std::uint64_t, - const std::vector &rawArgs) = 0; + [[nodiscard]] virtual KernelThunkResultType + launchKernel(const std::string &name, KernelThunkType kernelFunc, void *args, + std::uint64_t, std::uint64_t, + const std::vector &rawArgs) = 0; /// Launch the kernel with given name and argument arrays. // This is intended for remote QPUs whereby we need to JIT-compile the kernel diff --git a/runtime/cudaq/platform/quantum_platform.cpp b/runtime/cudaq/platform/quantum_platform.cpp index 00e259c389..46f248c690 100644 --- a/runtime/cudaq/platform/quantum_platform.cpp +++ b/runtime/cudaq/platform/quantum_platform.cpp @@ -30,11 +30,11 @@ namespace cudaq { std::string get_quake(const std::string &); static quantum_platform *platform; -inline static constexpr std::string_view GetQuantumPlatformSymbol = +static constexpr std::string_view GetQuantumPlatformSymbol = "getQuantumPlatform"; void setQuantumPlatformInternal(quantum_platform *p) { - cudaq::info("external caller setting the platform."); + info("external caller setting the platform."); platform = p; } @@ -43,8 +43,8 @@ void setQuantumPlatformInternal(quantum_platform *p) { quantum_platform *getQuantumPlatformInternal() { if (platform) return platform; - platform = cudaq::getUniquePluginInstance( - GetQuantumPlatformSymbol); + platform = + getUniquePluginInstance(GetQuantumPlatformSymbol); return platform; } @@ -94,8 +94,7 @@ std::size_t quantum_platform::get_current_qpu() { return platformCurrentQPU; } // Specify the execution context for this platform. 
// This delegates to the targeted QPU -void quantum_platform::set_exec_ctx(cudaq::ExecutionContext *ctx, - std::size_t qid) { +void quantum_platform::set_exec_ctx(ExecutionContext *ctx, std::size_t qid) { executionContext = ctx; auto &platformQPU = platformQPUs[qid]; platformQPU->setExecutionContext(ctx); @@ -130,9 +129,8 @@ bool quantum_platform::supports_conditional_feedback( } void quantum_platform::launchVQE(const std::string kernelName, - const void *kernelArgs, - cudaq::gradient *gradient, cudaq::spin_op H, - cudaq::optimizer &optimizer, + const void *kernelArgs, gradient *gradient, + spin_op H, optimizer &optimizer, const int n_params, const std::size_t shots) { std::size_t qpu_id = 0; @@ -151,11 +149,10 @@ quantum_platform::get_remote_capabilities(const std::size_t qpu_id) const { return platformQPUs[qpu_id]->getRemoteCapabilities(); } -void quantum_platform::launchKernel(std::string kernelName, - void (*kernelFunc)(void *), void *args, - std::uint64_t voidStarSize, - std::uint64_t resultOffset, - const std::vector &rawArgs) { +KernelThunkResultType quantum_platform::launchKernel( + std::string kernelName, KernelThunkType kernelFunc, void *args, + std::uint64_t voidStarSize, std::uint64_t resultOffset, + const std::vector &rawArgs) { std::size_t qpu_id = 0; auto tid = std::hash{}(std::this_thread::get_id()); @@ -164,8 +161,8 @@ void quantum_platform::launchKernel(std::string kernelName, qpu_id = iter->second; auto &qpu = platformQPUs[qpu_id]; - qpu->launchKernel(kernelName, kernelFunc, args, voidStarSize, resultOffset, - rawArgs); + return qpu->launchKernel(kernelName, kernelFunc, args, voidStarSize, + resultOffset, rawArgs); } void quantum_platform::launchKernel(std::string kernelName, @@ -183,7 +180,7 @@ void quantum_platform::launchKernel(std::string kernelName, void quantum_platform::launchSerializedCodeExecution( const std::string &name, - cudaq::SerializedCodeExecutionContext &serializeCodeExecutionObject) { + SerializedCodeExecutionContext &serializeCodeExecutionObject) { std::size_t qpu_id = 0; auto tid = std::hash{}(std::this_thread::get_id()); @@ -208,37 +205,46 @@ std::ostream *quantum_platform::getLogStream() { return platformLogStream; } void quantum_platform::setLogStream(std::ostream &logStream) { platformLogStream = &logStream; } -} // namespace cudaq -void cudaq::altLaunchKernel(const char *kernelName, void (*kernelFunc)(void *), - void *kernelArgs, std::uint64_t argsSize, - std::uint64_t resultOffset) { +KernelThunkResultType altLaunchKernel(const char *kernelName, + KernelThunkType kernelFunc, + void *kernelArgs, std::uint64_t argsSize, + std::uint64_t resultOffset) { ScopedTraceWithContext("altLaunchKernel", kernelName, argsSize); - auto &platform = *cudaq::getQuantumPlatformInternal(); + auto &platform = *getQuantumPlatformInternal(); std::string kernName = kernelName; - platform.launchKernel(kernName, kernelFunc, kernelArgs, argsSize, - resultOffset, {}); + return platform.launchKernel(kernName, kernelFunc, kernelArgs, argsSize, + resultOffset, {}); } -void cudaq::streamlinedLaunchKernel(const char *kernelName, - const std::vector &rawArgs) { +KernelThunkResultType +streamlinedLaunchKernel(const char *kernelName, + const std::vector &rawArgs) { std::size_t argsSize = rawArgs.size(); ScopedTraceWithContext("streamlinedLaunchKernel", kernelName, argsSize); - auto &platform = *cudaq::getQuantumPlatformInternal(); + auto &platform = *getQuantumPlatformInternal(); std::string kernName = kernelName; platform.launchKernel(kernName, rawArgs); + // NB: The streamlined 
launch will never return results. Use alt or hybrid if + // the kernel returns results. + return {}; } -void cudaq::hybridLaunchKernel(const char *kernelName, void (*kernel)(void *), - void *args, std::uint64_t argsSize, - std::uint64_t resultOffset, - const std::vector &rawArgs) { +KernelThunkResultType hybridLaunchKernel(const char *kernelName, + KernelThunkType kernel, void *args, + std::uint64_t argsSize, + std::uint64_t resultOffset, + const std::vector &rawArgs) { ScopedTraceWithContext("hybridLaunchKernel", kernelName); - auto &platform = *cudaq::getQuantumPlatformInternal(); + auto &platform = *getQuantumPlatformInternal(); const std::string kernName = kernelName; - if (platform.is_remote(platform.get_current_qpu())) + if (platform.is_remote(platform.get_current_qpu())) { + // This path should never call a kernel that returns results. platform.launchKernel(kernName, rawArgs); - else - platform.launchKernel(kernName, kernel, args, argsSize, resultOffset, - rawArgs); + return {}; + } + return platform.launchKernel(kernName, kernel, args, argsSize, resultOffset, + rawArgs); } + +} // namespace cudaq diff --git a/runtime/cudaq/platform/quantum_platform.h b/runtime/cudaq/platform/quantum_platform.h index e9598bf051..e16071890a 100644 --- a/runtime/cudaq/platform/quantum_platform.h +++ b/runtime/cudaq/platform/quantum_platform.h @@ -11,6 +11,7 @@ #include "common/ExecutionContext.h" #include "common/NoiseModel.h" #include "common/ObserveResult.h" +#include "common/ThunkInterface.h" #include "cudaq/remote_capabilities.h" #include "cudaq/utils/cudaq_utils.h" #include @@ -142,10 +143,10 @@ class quantum_platform { // This method is the hook for the kernel rewrites to invoke // quantum kernels. - void launchKernel(std::string kernelName, void (*kernelFunc)(void *), - void *args, std::uint64_t voidStarSize, - std::uint64_t resultOffset, - const std::vector &rawArgs); + [[nodiscard]] KernelThunkResultType + launchKernel(std::string kernelName, KernelThunkType kernelFunc, void *args, + std::uint64_t voidStarSize, std::uint64_t resultOffset, + const std::vector &rawArgs); void launchKernel(std::string kernelName, const std::vector &); // This method is the hook for executing SerializedCodeExecutionContext @@ -215,19 +216,21 @@ class quantum_platform { /// provide that information. extern "C" { // Client-server (legacy) interface. -void altLaunchKernel(const char *kernelName, void (*kernel)(void *), void *args, - std::uint64_t argsSize, std::uint64_t resultOffset); +[[nodiscard]] KernelThunkResultType +altLaunchKernel(const char *kernelName, KernelThunkType kernel, void *args, + std::uint64_t argsSize, std::uint64_t resultOffset); // Streamlined interface for launching kernels. Argument synthesis and JIT // compilation *must* happen on the local machine. -void streamlinedLaunchKernel(const char *kernelName, - const std::vector &rawArgs); +[[nodiscard]] KernelThunkResultType +streamlinedLaunchKernel(const char *kernelName, + const std::vector &rawArgs); // Hybrid of the client-server and streamlined approaches. Letting JIT // compilation happen either early or late and can handle return values from // each kernel launch. 
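For callers that do consume these spans, the host stubs emitted by GenKernelExecution select between the two possible result locations. A hand-written C++ analogue of that decode step (hypothetical helper, modeled on the `decodeLaunchResults` lambda earlier in this diff):

```cpp
#include <cstdint>
#include <cstdlib>

// Illustrative mirror of cudaq::KernelThunkResultType.
struct LaunchSpan {
  void *data_buffer;
  std::uint64_t size;
};

// The kernel's result slot lives either in the malloc'ed dynamic
// buffer returned by the launch, or in the original argument buffer
// when no dynamic buffer came back.
char *selectResultSlot(LaunchSpan span, char *argsBuffer,
                       std::uint64_t resultOffset) {
  if (auto *rptr = static_cast<char *>(span.data_buffer))
    return rptr + resultOffset;     // dynamic: caller must free rptr later
  return argsBuffer + resultOffset; // static: written in place
}
```

Once the `std::vector` has been materialized from the slot, the stub frees the dynamic buffer; since `free(nullptr)` is a no-op, the static path needs no special casing.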
-void hybridLaunchKernel(const char *kernelName, void (*kernel)(void *), - void *args, std::uint64_t argsSize, - std::uint64_t resultOffset, - const std::vector &rawArgs); +[[nodiscard]] KernelThunkResultType +hybridLaunchKernel(const char *kernelName, KernelThunkType kernel, void *args, + std::uint64_t argsSize, std::uint64_t resultOffset, + const std::vector &rawArgs); } } // namespace cudaq diff --git a/runtime/cudaq/qis/remote_state.cpp b/runtime/cudaq/qis/remote_state.cpp index 713a462e46..97f907027a 100644 --- a/runtime/cudaq/qis/remote_state.cpp +++ b/runtime/cudaq/qis/remote_state.cpp @@ -184,7 +184,8 @@ RemoteSimulationState::overlap(const cudaq::SimulationState &other) { std::make_pair(static_cast(this), static_cast(&otherState)); platform.set_exec_ctx(&context); - platform.launchKernel(kernelName, nullptr, nullptr, 0, 0, {}); + [[maybe_unused]] auto dynamicResult = + platform.launchKernel(kernelName, nullptr, nullptr, 0, 0, {}); platform.reset_exec_ctx(); assert(context.overlapResult.has_value()); return context.overlapResult.value(); diff --git a/targettests/execution/vector_result.cpp b/targettests/execution/vector_result.cpp new file mode 100644 index 0000000000..ecc09fce5c --- /dev/null +++ b/targettests/execution/vector_result.cpp @@ -0,0 +1,52 @@ +/******************************************************************************* + * Copyright (c) 2022 - 2024 NVIDIA Corporation & Affiliates. * + * All rights reserved. * + * * + * This source code and the accompanying materials are made available under * + * the terms of the Apache License 2.0 which accompanies this distribution. * + ******************************************************************************/ + +// RUN: nvq++ %cpp_std --enable-mlir %s -o %t && %t + +#include "cudaq.h" +#include + +struct VectorBoolResult { + std::vector operator()() __qpu__ { + std::vector result(3); + result[0] = true; + result[1] = false; + result[2] = true; + return result; + } +}; + +struct VectorIntResult { + std::vector operator()() __qpu__ { + std::vector result(2); + result[0] = 42; + result[1] = -23479; + return result; + } +}; + +struct VectorDoubleResult { + std::vector operator()() __qpu__ { + std::vector result(2); + result[0] = 543.0; + result[1] = -234234.0; + return result; + } +}; + +int main() { + auto retb{VectorBoolResult{}()}; + printf("%d %d %d\n", static_cast(retb[0]), static_cast(retb[1]), + static_cast(retb[2])); + auto ret = VectorIntResult{}(); + printf("%d %d\n", ret[0], ret[1]); + std::vector retd{VectorDoubleResult{}()}; + printf("%f %f\n", retd[0], retd[1]); + return !(retb[0] && !retb[1] && retb[2] && ret[0] == 42 && ret[1] == -23479 && + retd[0] == 543.0 && retd[1] == -234234.0); +} diff --git a/test/Quake-QIR/argument.qke b/test/Quake-QIR/argument.qke index 61d737d5ce..6a3532805a 100644 --- a/test/Quake-QIR/argument.qke +++ b/test/Quake-QIR/argument.qke @@ -55,7 +55,7 @@ func.func @test_0(%0: !cc.ptr, %1: !cc.ptr, %1: !cc.ptr, %1: !cc.ptr, %1: !cc.ptr, !cc.ptr, !cc.ptr} // CHECK: %[[VAL_4:.*]] = bitcast { i32, { i1*, i64 } }* %[[VAL_3]] to i8* // CHECK: %[[VAL_5:.*]] = getelementptr inbounds { i32, { i1*, i64 } }, { i32, { i1*, i64 } }* %[[VAL_3]], i64 0, i32 0 // CHECK: store i32 %[[VAL_2]], i32* %[[VAL_5]], align 8 -// CHECK: call void @altLaunchKernel(i8* nonnull getelementptr inbounds ([7 x i8], [7 x i8]* @test_0.kernelName, i64 0, i64 0), i8* nonnull bitcast ({ i8*, i64 } (i8*, i1)* @test_0.thunk to i8*), i8* nonnull %[[VAL_4]], i64 24, i64 8) -// CHECK: %[[VAL_6:.*]] = getelementptr inbounds { i32, { 
i1*, i64 } }, { i32, { i1*, i64 } }* %[[VAL_3]], i64 0, i32 1, i32 0 -// CHECK: %[[VAL_7:.*]] = bitcast i1** %[[VAL_6]] to i8** -// CHECK: %[[VAL_8:.*]] = load i8*, i8** %[[VAL_7]], align 8 -// CHECK: %[[VAL_9:.*]] = getelementptr inbounds { i32, { i1*, i64 } }, { i32, { i1*, i64 } }* %[[VAL_3]], i64 0, i32 1, i32 1 -// CHECK: %[[VAL_10:.*]] = load i64, i64* %[[VAL_9]], align 8 -// CHECK: %[[VAL_11:.*]] = bitcast { i8*, i8*, i8* }* %[[VAL_0]] to i8* -// CHECK: call void @__nvqpp_initializer_list_to_vector_bool(i8* %[[VAL_11]], i8* %[[VAL_8]], i64 %[[VAL_10]]) +// CHECK: %[[VAL_6:.*]] = call { i8*, i64 } @altLaunchKernel(i8* nonnull getelementptr inbounds ([7 x i8], [7 x i8]* @test_0.kernelName, i64 0, i64 0), i8* nonnull bitcast ({ i8*, i64 } (i8*, i1)* @test_0.thunk to i8*), i8* nonnull %[[VAL_4]], i64 24, i64 8) +// CHECK: %[[VAL_7:.*]] = extractvalue { i8*, i64 } %[[VAL_6]], 0 +// CHECK: %[[VAL_8:.*]] = icmp eq i8* %[[VAL_7]], null +// CHECK: %[[VAL_9:.*]] = getelementptr i8, i8* %[[VAL_7]], i64 8 +// CHECK: %[[VAL_10:.*]] = bitcast i8* %[[VAL_9]] to { i1*, i64 }* +// CHECK: %[[VAL_11:.*]] = getelementptr inbounds { i32, { i1*, i64 } }, { i32, { i1*, i64 } }* %[[VAL_3]], i64 0, i32 1 +// CHECK: %[[VAL_12:.*]] = select i1 %[[VAL_8]], { i1*, i64 }* %[[VAL_11]], { i1*, i64 }* %[[VAL_10]] +// CHECK: %[[VAL_13:.*]] = bitcast { i1*, i64 }* %[[VAL_12]] to i8** +// CHECK: %[[VAL_14:.*]] = load i8*, i8** %[[VAL_13]], align 8 +// CHECK: %[[VAL_15:.*]] = getelementptr inbounds { i32, { i1*, i64 } }, { i32, { i1*, i64 } }* %[[VAL_3]], i64 0, i32 1, i32 1 +// CHECK: %[[VAL_16:.*]] = getelementptr i8, i8* %[[VAL_7]], i64 16 +// CHECK: %[[VAL_17:.*]] = bitcast i8* %[[VAL_16]] to i64* +// CHECK: %[[VAL_18:.*]] = select i1 %[[VAL_8]], i64* %[[VAL_15]], i64* %[[VAL_17]] +// CHECK: %[[VAL_19:.*]] = load i64, i64* %[[VAL_18]], align 4 +// CHECK: %[[VAL_20:.*]] = bitcast { i8*, i8*, i8* }* %[[VAL_0]] to i8* +// CHECK: call void @__nvqpp_initializer_list_to_vector_bool(i8* %[[VAL_20]], i8* %[[VAL_14]], i64 %[[VAL_19]]) +// CHECK: call void @free(i8* %[[VAL_7]]) // CHECK: ret void // CHECK: } @@ -169,7 +178,7 @@ func.func @test_1(%this: !cc.ptr) -> i16 { // CHECK-SAME: %[[VAL_0:.*]]) {{.*}}{ // CHECK-NEXT: %[[VAL_2:.*]] = alloca i16, align 8 // CHECK: %[[VAL_3:.*]] = bitcast i16* %[[VAL_2]] to i8* -// CHECK: call void @altLaunchKernel(i8* nonnull getelementptr inbounds ([7 x i8], [7 x i8]* @test_1.kernelName, i64 0, i64 0), i8* nonnull bitcast ({ i8*, i64 } (i8*, i1)* @test_1.thunk to i8*), i8* nonnull %[[VAL_3]], i64 2, i64 0) +// CHECK: call { i8*, i64 } @altLaunchKernel(i8* nonnull getelementptr inbounds ([7 x i8], [7 x i8]* @test_1.kernelName, i64 0, i64 0), i8* nonnull bitcast ({ i8*, i64 } (i8*, i1)* @test_1.thunk to i8*), i8* nonnull %[[VAL_3]], i64 2, i64 0) // CHECK: %[[VAL_4:.*]] = load i16, i16* %[[VAL_2]], align 8 // CHECK: ret i16 %[[VAL_4]] // CHECK: } @@ -200,7 +209,7 @@ func.func @test_2(%1: !cc.ptr> {llvm.sret = !cc // CHECK-SAME: %[[VAL_0:.*]], i8* nocapture readnone %[[VAL_1:.*]]) {{.*}}{ // CHECK: %[[VAL_2:.*]] = alloca { { i16, float, double, i64 } }, align 8 // CHECK: %[[VAL_3:.*]] = bitcast { { i16, float, double, i64 } }* %[[VAL_2]] to i8* -// CHECK: call void @altLaunchKernel(i8* nonnull getelementptr inbounds ([7 x i8], [7 x i8]* @test_2.kernelName, i64 0, i64 0), i8* nonnull bitcast ({ i8*, i64 } (i8*, i1)* @test_2.thunk to i8*), i8* nonnull %[[VAL_3]], i64 24, i64 0) +// CHECK: call { i8*, i64 } @altLaunchKernel(i8* nonnull getelementptr inbounds ([7 x i8], [7 x i8]* 
@test_2.kernelName, i64 0, i64 0), i8* nonnull bitcast ({ i8*, i64 } (i8*, i1)* @test_2.thunk to i8*), i8* nonnull %[[VAL_3]], i64 24, i64 0) // CHECK: %[[VAL_4:.*]] = bitcast { i16, float, double, i64 }* %[[VAL_0]] to i8* // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* noundef nonnull align 8 dereferenceable(24) %[[VAL_4]], i8* noundef nonnull align 8 dereferenceable(24) %[[VAL_3]], i64 24, i1 false) // CHECK: ret void @@ -234,7 +243,7 @@ func.func @test_3(%1: !cc.ptr> {llvm.sret = !cc.array> {llvm.sret = !cc.struct // CHECK-SAME: %[[VAL_0:.*]], i8* nocapture readnone %[[VAL_1:.*]]) {{.*}}{ // CHECK: %[[VAL_2:.*]] = alloca { i64, double }, align 8 // CHECK: %[[VAL_3:.*]] = bitcast { i64, double }* %[[VAL_2]] to i8* -// CHECK: call void @altLaunchKernel(i8* nonnull getelementptr inbounds ([7 x i8], [7 x i8]* @test_4.kernelName, i64 0, i64 0), i8* nonnull bitcast ({ i8*, i64 } (i8*, i1)* @test_4.thunk to i8*), i8* nonnull %[[VAL_3]], i64 16, i64 0) +// CHECK: call { i8*, i64 } @altLaunchKernel(i8* nonnull getelementptr inbounds ([7 x i8], [7 x i8]* @test_4.kernelName, i64 0, i64 0), i8* nonnull bitcast ({ i8*, i64 } (i8*, i1)* @test_4.thunk to i8*), i8* nonnull %[[VAL_3]], i64 16, i64 0) // CHECK: %[[VAL_4:.*]] = bitcast { i64, double }* %[[VAL_0]] to i8* // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* noundef nonnull align 8 dereferenceable(16) %[[VAL_4]], i8* noundef nonnull align 8 dereferenceable(16) %[[VAL_3]], i64 16, i1 false) // CHECK: ret void @@ -283,7 +292,7 @@ func.func @test_5(%sret: !cc.ptr> {llvm.sret = !cc.struct // CHECK-SAME: %[[VAL_0:.*]]) {{.*}}{ // CHECK: %[[VAL_1:.*]] = alloca { i64, double }, align 8 // CHECK: %[[VAL_2:.*]] = bitcast { i64, double }* %[[VAL_1]] to i8* -// CHECK: call void @altLaunchKernel(i8* nonnull getelementptr inbounds ([7 x i8], [7 x i8]* @test_5.kernelName, i64 0, i64 0), i8* nonnull bitcast ({ i8*, i64 } (i8*, i1)* @test_5.thunk to i8*), i8* nonnull %[[VAL_2]], i64 16, i64 0) +// CHECK: call { i8*, i64 } @altLaunchKernel(i8* nonnull getelementptr inbounds ([7 x i8], [7 x i8]* @test_5.kernelName, i64 0, i64 0), i8* nonnull bitcast ({ i8*, i64 } (i8*, i1)* @test_5.thunk to i8*), i8* nonnull %[[VAL_2]], i64 16, i64 0) // CHECK: %[[VAL_3:.*]] = bitcast { i64, double }* %[[VAL_0]] to i8* // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* noundef nonnull align 8 dereferenceable(16) %[[VAL_3]], i8* noundef nonnull align 8 dereferenceable(16) %[[VAL_2]], i64 16, i1 false) // CHECK: ret void diff --git a/test/Quake/kernel_exec-1.qke b/test/Quake/kernel_exec-1.qke index 751ba66a10..37ac7c7229 100644 --- a/test/Quake/kernel_exec-1.qke +++ b/test/Quake/kernel_exec-1.qke @@ -90,25 +90,38 @@ module attributes {quake.mangled_name_map = { // CHECK-DAG: %[[VAL_2:.*]] = cc.undef !cc.struct<{i32, f64}> // CHECK-DAG: %[[VAL_3:.*]] = arith.constant 0 : i64 // CHECK: %[[VAL_4:.*]] = cc.insert_value %[[VAL_1]], %[[VAL_2]][0] : (!cc.struct<{i32, f64}>, i32) -> !cc.struct<{i32, f64}> -// CHECK: %[[VAL_7:.*]] = cc.sizeof !cc.struct<{i32, f64}> : i64 -// CHECK: %[[VAL_8:.*]] = arith.addi %[[VAL_7]], %[[VAL_3]] : i64 -// CHECK: %[[VAL_9:.*]] = cc.alloca i8[%[[VAL_8]] : i64] -// CHECK: %[[VAL_10:.*]] = cc.cast %[[VAL_9]] : (!cc.ptr>) -> !cc.ptr> -// CHECK: cc.store %[[VAL_4]], %[[VAL_10]] : !cc.ptr> -// CHECK: %[[VAL_11:.*]] = cc.cast %[[VAL_9]] : (!cc.ptr>) -> !cc.ptr x ?>> -// CHECK: %[[VAL_13:.*]] = constant @ghz.thunk : (!cc.ptr, i1) -> !cc.struct<{!cc.ptr, i64}> -// CHECK: %[[VAL_15:.*]] = cc.func_ptr %[[VAL_13]] : ((!cc.ptr, i1) -> !cc.struct<{!cc.ptr, i64}>) 
-> !cc.ptr -// CHECK: %[[VAL_16:.*]] = cc.cast %[[VAL_11]] : (!cc.ptr x ?>>) -> !cc.ptr -// CHECK: %[[VAL_18:.*]] = cc.offsetof !cc.struct<{i32, f64}> [1] : i64 -// CHECK: %[[VAL_12:.*]] = llvm.mlir.addressof @ghz.kernelName : !llvm.ptr> -// CHECK: %[[VAL_14:.*]] = cc.cast %[[VAL_12]] : (!llvm.ptr>) -> !cc.ptr -// CHECK: call @altLaunchKernel(%[[VAL_14]], %[[VAL_15]], %[[VAL_16]], %[[VAL_8]], %[[VAL_18]]) : (!cc.ptr, !cc.ptr, !cc.ptr, i64, i64) -> () -// CHECK: %[[VAL_19:.*]] = cc.compute_ptr %[[VAL_11]][0, 1] : (!cc.ptr x ?>>) -> !cc.ptr -// CHECK: %[[VAL_20:.*]] = cc.load %[[VAL_19]] : !cc.ptr -// CHECK: return %[[VAL_20]] : f64 +// CHECK: %[[VAL_5:.*]] = cc.sizeof !cc.struct<{i32, f64}> : i64 +// CHECK: %[[VAL_6:.*]] = arith.addi %[[VAL_5]], %[[VAL_3]] : i64 +// CHECK: %[[VAL_7:.*]] = cc.alloca i8{{\[}}%[[VAL_6]] : i64] +// CHECK: %[[VAL_8:.*]] = cc.cast %[[VAL_7]] : (!cc.ptr>) -> !cc.ptr> +// CHECK: cc.store %[[VAL_4]], %[[VAL_8]] : !cc.ptr> +// CHECK: %[[VAL_9:.*]] = cc.cast %[[VAL_7]] : (!cc.ptr>) -> !cc.ptr x ?>> +// CHECK: %[[VAL_10:.*]] = constant @ghz.thunk : (!cc.ptr, i1) -> !cc.struct<{!cc.ptr, i64}> +// CHECK: %[[VAL_11:.*]] = cc.func_ptr %[[VAL_10]] : ((!cc.ptr, i1) -> !cc.struct<{!cc.ptr, i64}>) -> !cc.ptr +// CHECK: %[[VAL_12:.*]] = cc.cast %[[VAL_9]] : (!cc.ptr x ?>>) -> !cc.ptr +// CHECK: %[[VAL_13:.*]] = cc.offsetof !cc.struct<{i32, f64}> [1] : i64 +// CHECK: %[[VAL_14:.*]] = llvm.mlir.addressof @ghz.kernelName : !llvm.ptr> +// CHECK: %[[VAL_15:.*]] = cc.cast %[[VAL_14]] : (!llvm.ptr>) -> !cc.ptr +// CHECK: %[[VAL_16:.*]] = call @altLaunchKernel(%[[VAL_15]], %[[VAL_11]], %[[VAL_12]], %[[VAL_6]], %[[VAL_13]]) : (!cc.ptr, !cc.ptr, !cc.ptr, i64, i64) -> !cc.struct<{!cc.ptr, i64}> +// CHECK: %[[VAL_17:.*]] = cc.extract_value %[[VAL_16]][0] : (!cc.struct<{!cc.ptr, i64}>) -> !cc.ptr +// CHECK: %[[VAL_18:.*]] = cc.cast %[[VAL_17]] : (!cc.ptr) -> i64 +// CHECK: %[[VAL_19:.*]] = arith.constant 0 : i64 +// CHECK: %[[VAL_20:.*]] = arith.cmpi ne, %[[VAL_18]], %[[VAL_19]] : i64 +// CHECK: cf.cond_br %[[VAL_20]], ^bb1, ^bb2 +// CHECK: ^bb1: +// CHECK: %[[VAL_21:.*]] = cc.cast %[[VAL_17]] : (!cc.ptr) -> !cc.ptr> +// CHECK: %[[VAL_22:.*]] = cc.compute_ptr %[[VAL_21]][1] : (!cc.ptr>) -> !cc.ptr +// CHECK: cf.br ^bb3(%[[VAL_22]] : !cc.ptr) +// CHECK: ^bb2: +// CHECK: %[[VAL_23:.*]] = cc.compute_ptr %[[VAL_9]][0, 1] : (!cc.ptr x ?>>) -> !cc.ptr +// CHECK: cf.br ^bb3(%[[VAL_23]] : !cc.ptr) +// CHECK: ^bb3(%[[VAL_24:.*]]: !cc.ptr): +// CHECK: %[[VAL_25:.*]] = cc.compute_ptr %[[VAL_9]][0, 1] : (!cc.ptr x ?>>) -> !cc.ptr +// CHECK: %[[VAL_26:.*]] = cc.load %[[VAL_25]] : !cc.ptr +// CHECK: return %[[VAL_26]] : f64 // CHECK: } -// CHECK: func.func private @altLaunchKernel(!cc.ptr, !cc.ptr, !cc.ptr, i64, i64) +// CHECK: func.func private @altLaunchKernel(!cc.ptr, !cc.ptr, !cc.ptr, i64, i64) -> !cc.struct<{!cc.ptr, i64}> // CHECK: func.func private @cudaqRegisterKernelName(!cc.ptr) // CHECK: llvm.mlir.global external constant @ghz.kernelName("ghz\00") {addr_space = 0 : i32} @@ -192,39 +205,52 @@ module attributes {quake.mangled_name_map = { // HYBRID: %[[VAL_2:.*]] = cc.undef !cc.struct<{i32, f64}> // HYBRID: %[[VAL_3:.*]] = arith.constant 0 : i64 // HYBRID: %[[VAL_4:.*]] = cc.insert_value %[[VAL_1]], %[[VAL_2]][0] : (!cc.struct<{i32, f64}>, i32) -> !cc.struct<{i32, f64}> -// HYBRID: %[[VAL_6:.*]] = cc.sizeof !cc.struct<{i32, f64}> : i64 -// HYBRID: %[[VAL_7:.*]] = arith.addi %[[VAL_6]], %[[VAL_3]] : i64 -// HYBRID: %[[VAL_8:.*]] = cc.alloca i8{{\[}}%[[VAL_7]] : i64] -// HYBRID: %[[VAL_9:.*]] = 
cc.cast %[[VAL_8]] : (!cc.ptr>) -> !cc.ptr>
-// HYBRID: cc.store %[[VAL_4]], %[[VAL_9]] : !cc.ptr>
-// HYBRID: %[[VAL_10:.*]] = cc.cast %[[VAL_8]] : (!cc.ptr>) -> !cc.ptr x ?>>
-// HYBRID: %[[VAL_11:.*]] = constant @ghz.thunk : (!cc.ptr, i1) -> !cc.struct<{!cc.ptr, i64}>
-// HYBRID: %[[VAL_12:.*]] = cc.func_ptr %[[VAL_11]] : ((!cc.ptr, i1) -> !cc.struct<{!cc.ptr, i64}>) -> !cc.ptr
-// HYBRID: %[[VAL_13:.*]] = cc.cast %[[VAL_10]] : (!cc.ptr x ?>>) -> !cc.ptr
-// HYBRID: %[[VAL_15:.*]] = cc.offsetof !cc.struct<{i32, f64}> [1] : i64
-// HYBRID: %[[VAL_16:.*]] = cc.alloca !cc.struct<{!cc.ptr>, !cc.ptr>, !cc.ptr>}>
-// HYBRID: %[[VAL_17:.*]] = cc.alloca !cc.array x 1>
-// HYBRID: %[[VAL_18:.*]] = cc.sizeof !cc.array x 1> : i64
-// HYBRID: %[[VAL_19:.*]] = cc.cast %[[VAL_17]] : (!cc.ptr x 1>>) -> !cc.ptr>
-// HYBRID: %[[VAL_20:.*]] = cc.cast %[[VAL_16]] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr>>
-// HYBRID: cc.store %[[VAL_19]], %[[VAL_20]] : !cc.ptr>>
-// HYBRID: %[[VAL_21:.*]] = cc.cast %[[VAL_17]] : (!cc.ptr x 1>>) -> i64
-// HYBRID: %[[VAL_22:.*]] = arith.addi %[[VAL_21]], %[[VAL_18]] : i64
-// HYBRID: %[[VAL_23:.*]] = cc.cast %[[VAL_22]] : (i64) -> !cc.ptr>
-// HYBRID: %[[VAL_24:.*]] = cc.compute_ptr %[[VAL_16]][1] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr>>
-// HYBRID: cc.store %[[VAL_23]], %[[VAL_24]] : !cc.ptr>>
-// HYBRID: %[[VAL_25:.*]] = cc.compute_ptr %[[VAL_16]][2] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr>>
-// HYBRID: cc.store %[[VAL_23]], %[[VAL_25]] : !cc.ptr>>
-// HYBRID: %[[VAL_26:.*]] = cc.compute_ptr %[[VAL_17]][0] : (!cc.ptr x 1>>) -> !cc.ptr>
-// HYBRID: %[[VAL_27:.*]] = cc.alloca i32
-// HYBRID: cc.store %[[VAL_1]], %[[VAL_27]] : !cc.ptr
-// HYBRID: %[[VAL_28:.*]] = cc.cast %[[VAL_27]] : (!cc.ptr) -> !cc.ptr
-// HYBRID: cc.store %[[VAL_28]], %[[VAL_26]] : !cc.ptr>
-// HYBRID: %[[VAL_29:.*]] = cc.cast %[[VAL_16]] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr
-// HYBRID: %[[VAL_30:.*]] = llvm.mlir.addressof @ghz.kernelName : !llvm.ptr>
-// HYBRID: %[[VAL_31:.*]] = cc.cast %[[VAL_30]] : (!llvm.ptr>) -> !cc.ptr
-// HYBRID: call @hybridLaunchKernel(%[[VAL_31]], %[[VAL_12]], %[[VAL_13]], %[[VAL_7]], %[[VAL_15]], %[[VAL_29]]) : (!cc.ptr, !cc.ptr, !cc.ptr, i64, i64, !cc.ptr) -> ()
-// HYBRID: %[[VAL_32:.*]] = cc.compute_ptr %[[VAL_10]][0, 1] : (!cc.ptr x ?>>) -> !cc.ptr
-// HYBRID: %[[VAL_33:.*]] = cc.load %[[VAL_32]] : !cc.ptr
-// HYBRID: return %[[VAL_33]] : f64
+// HYBRID: %[[VAL_5:.*]] = cc.sizeof !cc.struct<{i32, f64}> : i64
+// HYBRID: %[[VAL_6:.*]] = arith.addi %[[VAL_5]], %[[VAL_3]] : i64
+// HYBRID: %[[VAL_7:.*]] = cc.alloca i8{{\[}}%[[VAL_6]] : i64]
+// HYBRID: %[[VAL_8:.*]] = cc.cast %[[VAL_7]] : (!cc.ptr>) -> !cc.ptr>
+// HYBRID: cc.store %[[VAL_4]], %[[VAL_8]] : !cc.ptr>
+// HYBRID: %[[VAL_9:.*]] = cc.cast %[[VAL_7]] : (!cc.ptr>) -> !cc.ptr x ?>>
+// HYBRID: %[[VAL_10:.*]] = constant @ghz.thunk : (!cc.ptr, i1) -> !cc.struct<{!cc.ptr, i64}>
+// HYBRID: %[[VAL_11:.*]] = cc.func_ptr %[[VAL_10]] : ((!cc.ptr, i1) -> !cc.struct<{!cc.ptr, i64}>) -> !cc.ptr
+// HYBRID: %[[VAL_12:.*]] = cc.cast %[[VAL_9]] : (!cc.ptr x ?>>) -> !cc.ptr
+// HYBRID: %[[VAL_13:.*]] = cc.offsetof !cc.struct<{i32, f64}> [1] : i64
+// HYBRID: %[[VAL_14:.*]] = cc.alloca !cc.struct<{!cc.ptr>, !cc.ptr>, !cc.ptr>}>
+// HYBRID: %[[VAL_15:.*]] = cc.alloca !cc.array x 1>
+// HYBRID: %[[VAL_16:.*]] = cc.sizeof !cc.array x 1> : i64
+// HYBRID: %[[VAL_17:.*]] = cc.cast %[[VAL_15]] : (!cc.ptr x 1>>) -> !cc.ptr>
+// HYBRID: %[[VAL_18:.*]] = cc.cast %[[VAL_14]] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr>>
+// HYBRID: cc.store %[[VAL_17]], %[[VAL_18]] : !cc.ptr>>
+// HYBRID: %[[VAL_19:.*]] = cc.cast %[[VAL_15]] : (!cc.ptr x 1>>) -> i64
+// HYBRID: %[[VAL_20:.*]] = arith.addi %[[VAL_19]], %[[VAL_16]] : i64
+// HYBRID: %[[VAL_21:.*]] = cc.cast %[[VAL_20]] : (i64) -> !cc.ptr>
+// HYBRID: %[[VAL_22:.*]] = cc.compute_ptr %[[VAL_14]][1] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr>>
+// HYBRID: cc.store %[[VAL_21]], %[[VAL_22]] : !cc.ptr>>
+// HYBRID: %[[VAL_23:.*]] = cc.compute_ptr %[[VAL_14]][2] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr>>
+// HYBRID: cc.store %[[VAL_21]], %[[VAL_23]] : !cc.ptr>>
+// HYBRID: %[[VAL_24:.*]] = cc.compute_ptr %[[VAL_15]][0] : (!cc.ptr x 1>>) -> !cc.ptr>
+// HYBRID: %[[VAL_25:.*]] = cc.alloca i32
+// HYBRID: cc.store %[[VAL_1]], %[[VAL_25]] : !cc.ptr
+// HYBRID: %[[VAL_26:.*]] = cc.cast %[[VAL_25]] : (!cc.ptr) -> !cc.ptr
+// HYBRID: cc.store %[[VAL_26]], %[[VAL_24]] : !cc.ptr>
+// HYBRID: %[[VAL_27:.*]] = cc.cast %[[VAL_14]] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr
+// HYBRID: %[[VAL_28:.*]] = llvm.mlir.addressof @ghz.kernelName : !llvm.ptr>
+// HYBRID: %[[VAL_29:.*]] = cc.cast %[[VAL_28]] : (!llvm.ptr>) -> !cc.ptr
+// HYBRID: %[[VAL_30:.*]] = call @hybridLaunchKernel(%[[VAL_29]], %[[VAL_11]], %[[VAL_12]], %[[VAL_6]], %[[VAL_13]], %[[VAL_27]]) : (!cc.ptr, !cc.ptr, !cc.ptr, i64, i64, !cc.ptr) -> !cc.struct<{!cc.ptr, i64}>
+// HYBRID: %[[VAL_31:.*]] = cc.extract_value %[[VAL_30]][0] : (!cc.struct<{!cc.ptr, i64}>) -> !cc.ptr
+// HYBRID: %[[VAL_32:.*]] = cc.cast %[[VAL_31]] : (!cc.ptr) -> i64
+// HYBRID: %[[VAL_33:.*]] = arith.constant 0 : i64
+// HYBRID: %[[VAL_34:.*]] = arith.cmpi ne, %[[VAL_32]], %[[VAL_33]] : i64
+// HYBRID: cf.cond_br %[[VAL_34]], ^bb1, ^bb2
+// HYBRID: ^bb1:
+// HYBRID: %[[VAL_35:.*]] = cc.cast %[[VAL_31]] : (!cc.ptr) -> !cc.ptr>
+// HYBRID: %[[VAL_36:.*]] = cc.compute_ptr %[[VAL_35]][1] : (!cc.ptr>) -> !cc.ptr
+// HYBRID: cf.br ^bb3(%[[VAL_36]] : !cc.ptr)
+// HYBRID: ^bb2:
+// HYBRID: %[[VAL_37:.*]] = cc.compute_ptr %[[VAL_9]][0, 1] : (!cc.ptr x ?>>) -> !cc.ptr
+// HYBRID: cf.br ^bb3(%[[VAL_37]] : !cc.ptr)
+// HYBRID: ^bb3(%[[VAL_38:.*]]: !cc.ptr):
+// HYBRID: %[[VAL_39:.*]] = cc.compute_ptr %[[VAL_9]][0, 1] : (!cc.ptr x ?>>) -> !cc.ptr
+// HYBRID: %[[VAL_40:.*]] = cc.load %[[VAL_39]] : !cc.ptr
+// HYBRID: return %[[VAL_40]] : f64
// HYBRID: }
diff --git a/test/Quake/kernel_exec-2.qke b/test/Quake/kernel_exec-2.qke
index 4e53513774..a9b04b8449 100644
--- a/test/Quake/kernel_exec-2.qke
+++ b/test/Quake/kernel_exec-2.qke
@@ -71,11 +71,11 @@ __nvqpp__mlirgen__function_cargo = "pants"}} {
// CHECK: %[[VAL_33:.*]] = arith.constant 2147483647 : i64
// CHECK: %[[VAL_28:.*]] = llvm.mlir.addressof @function_hawaiian.kernelName : !llvm.ptr>
// CHECK: %[[VAL_30:.*]] = cc.cast %[[VAL_28]] : (!llvm.ptr>) -> !cc.ptr
-// CHECK: call @altLaunchKernel(%[[VAL_30]], %[[VAL_31]], %[[VAL_32]], %[[VAL_17]], %[[VAL_33]]) : (!cc.ptr, !cc.ptr, !cc.ptr, i64, i64) -> ()
+// CHECK: call @altLaunchKernel(%[[VAL_30]], %[[VAL_31]], %[[VAL_32]], %[[VAL_17]], %[[VAL_33]]) : (!cc.ptr, !cc.ptr, !cc.ptr, i64, i64) -> !cc.struct<{!cc.ptr, i64}>
// CHECK: return
// CHECK: }
-// CHECK-DAG: func.func private @altLaunchKernel(!cc.ptr, !cc.ptr, !cc.ptr, i64, i64)
+// CHECK-DAG: func.func private @altLaunchKernel(!cc.ptr, !cc.ptr, !cc.ptr, i64, i64) -> !cc.struct<{!cc.ptr, i64}>
// CHECK-DAG: func.func private @cudaqRegisterKernelName(!cc.ptr)
// CHECK-DAG: func.func private @cudaqRegisterArgsCreator(!cc.ptr, !cc.ptr)
// CHECK-DAG: func.func private @malloc(i64) -> !cc.ptr
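For context before the next test file: the kernel_exec-2.qke expectations above track the launcher signature change, with `altLaunchKernel` now returning a `{ptr, i64}` span instead of `()`. Below is a minimal C++ sketch of how calling code might consume such a span; `dynamic_result` and `my_alt_launch` are hypothetical stand-ins for illustration only, not the CUDA-Q runtime API.

    // A hedged sketch, not the CUDA-Q runtime API: `dynamic_result` and
    // `my_alt_launch` are hypothetical stand-ins that mirror the MLIR
    // signature checked above:
    //   (!cc.ptr, !cc.ptr, !cc.ptr, i64, i64) -> !cc.struct<{!cc.ptr, i64}>
    #include <cstddef>
    #include <cstdlib>

    struct dynamic_result {
      void *data;       // non-null only when a dynamic result buffer exists
      std::size_t size; // size of that buffer in bytes
    };

    dynamic_result my_alt_launch(const char * /*kernelName*/,
                                 void (* /*thunk*/)(), void * /*args*/,
                                 std::size_t /*argsSize*/,
                                 std::size_t /*resultOffset*/) {
      return {nullptr, 0}; // stub: pretend no dynamic result was produced
    }

    void launch_and_release(const char *name, void (*thunk)(), void *args,
                            std::size_t size, std::size_t offset) {
      dynamic_result res = my_alt_launch(name, thunk, args, size, offset);
      if (res.data) {
        // ...copy the payload out of res.data before releasing it...
        std::free(res.data); // calling code owns any dynamic buffer
      }
      // Otherwise any (static) result still lives in the original buffer.
    }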
diff --git a/test/Quake/return_vector.qke b/test/Quake/return_vector.qke
index a13d0b6abe..90ccc90610 100644
--- a/test/Quake/return_vector.qke
+++ b/test/Quake/return_vector.qke
@@ -6,8 +6,8 @@
// the terms of the Apache License 2.0 which accompanies this distribution. //
// ========================================================================== //
-// RUN: cudaq-opt --add-dealloc --kernel-execution=codegen=1 --canonicalize %s | \
-// RUN: FileCheck %s
+// RUN: cudaq-opt --add-dealloc --kernel-execution=codegen=1 --canonicalize %s \
+// RUN: | FileCheck %s
// NB: the mangled name map is required for the kernel-execution pass.
module attributes{ quake.mangled_name_map = {
@@ -42,37 +42,48 @@ func.func @test_0(%0: !cc.ptr, !cc.ptr, !cc.ptr, %[[VAL_2:.*]]: i32) {
// CHECK: %[[VAL_3:.*]] = arith.constant 4 : i64
// CHECK: %[[VAL_4:.*]] = constant @test_0.thunk : (!cc.ptr, i1) -> !cc.struct<{!cc.ptr, i64}>
+// CHECK: %[[VAL_5:.*]] = arith.constant 0 : i64
// CHECK: %[[VAL_6:.*]] = cc.undef !cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}>
// CHECK: %[[VAL_7:.*]] = cc.insert_value %[[VAL_2]], %[[VAL_6]][0] : (!cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}>, i32) -> !cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}>
-// CHECK: %[[VAL_9:.*]] = cc.sizeof !cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}> : i64
-// CHECK: %[[VAL_10:.*]] = cc.alloca i8{{\[}}%[[VAL_9]] : i64]
-// CHECK: %[[VAL_11:.*]] = cc.cast %[[VAL_10]] : (!cc.ptr>) -> !cc.ptr, i64}>}>>
-// CHECK: cc.store %[[VAL_7]], %[[VAL_11]] : !cc.ptr, i64}>}>>
-// CHECK: %[[VAL_14:.*]] = cc.func_ptr %[[VAL_4]] : ((!cc.ptr, i1) -> !cc.struct<{!cc.ptr, i64}>) -> !cc.ptr
-// CHECK: %[[VAL_15:.*]] = cc.cast %[[VAL_10]] : (!cc.ptr>) -> !cc.ptr
-// CHECK: %[[VAL_17:.*]] = cc.offsetof !cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}> [1] : i64
-// CHECK: %[[VAL_12:.*]] = llvm.mlir.addressof @test_0.kernelName : !llvm.ptr>
-// CHECK: %[[VAL_13:.*]] = cc.cast %[[VAL_12]] : (!llvm.ptr>) -> !cc.ptr
-// CHECK: call @altLaunchKernel(%[[VAL_13]], %[[VAL_14]], %[[VAL_15]], %[[VAL_9]], %[[VAL_17]]) : (!cc.ptr, !cc.ptr, !cc.ptr, i64, i64) -> ()
-// CHECK: %[[VAL_18:.*]] = cc.cast %[[VAL_10]] : (!cc.ptr>) -> !cc.ptr, i64}>}>>
-// CHECK: %[[VAL_19:.*]] = cc.compute_ptr %[[VAL_18]][1, 0] : (!cc.ptr, i64}>}>>) -> !cc.ptr>
-// CHECK: %[[VAL_20:.*]] = cc.load %[[VAL_19]] : !cc.ptr>
-// CHECK: %[[VAL_21:.*]] = cc.cast %[[VAL_10]] : (!cc.ptr>) -> !cc.ptr, i64}>}>>
-// CHECK: %[[VAL_22:.*]] = cc.compute_ptr %[[VAL_21]][1, 1] : (!cc.ptr, i64}>}>>) -> !cc.ptr
-// CHECK: %[[VAL_23:.*]] = cc.load %[[VAL_22]] : !cc.ptr
-// CHECK: %[[VAL_24:.*]] = cc.cast %[[VAL_0]] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr, !cc.ptr, !cc.ptr}>>
-// CHECK: %[[VAL_25:.*]] = cc.cast %[[VAL_0]] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr>
-// CHECK: %[[VAL_26:.*]] = cc.load %[[VAL_25]] : !cc.ptr>
-// CHECK: call @free(%[[VAL_26]]) : (!cc.ptr) -> ()
-// CHECK: %[[VAL_27:.*]] = cc.cast %[[VAL_20]] : (!cc.ptr) -> !cc.ptr
-// CHECK: cc.store %[[VAL_27]], %[[VAL_25]] : !cc.ptr>
-// CHECK: %[[VAL_28:.*]] = cc.compute_ptr %[[VAL_24]][1] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr>
-// CHECK: %[[VAL_29:.*]] = arith.muli %[[VAL_23]], %[[VAL_3]] : i64
-// CHECK: %[[VAL_30:.*]] = cc.cast %[[VAL_20]] : (!cc.ptr) -> !cc.ptr>
-// CHECK: %[[VAL_31:.*]] = cc.compute_ptr %[[VAL_30]]{{\[}}%[[VAL_29]]] : (!cc.ptr>, i64) -> !cc.ptr
-// CHECK: cc.store %[[VAL_31]], %[[VAL_28]] : !cc.ptr>
-// CHECK: %[[VAL_32:.*]] = cc.compute_ptr %[[VAL_24]][2] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr>
-// CHECK: cc.store %[[VAL_31]], %[[VAL_32]] : !cc.ptr>
+// CHECK: %[[VAL_8:.*]] = cc.sizeof !cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}> : i64
+// CHECK: %[[VAL_9:.*]] = cc.alloca i8{{\[}}%[[VAL_8]] : i64]
+// CHECK: %[[VAL_10:.*]] = cc.cast %[[VAL_9]] : (!cc.ptr>) -> !cc.ptr, i64}>}>>
+// CHECK: cc.store %[[VAL_7]], %[[VAL_10]] : !cc.ptr, i64}>}>>
+// CHECK: %[[VAL_11:.*]] = cc.func_ptr %[[VAL_4]] : ((!cc.ptr, i1) -> !cc.struct<{!cc.ptr, i64}>) -> !cc.ptr
+// CHECK: %[[VAL_12:.*]] = cc.cast %[[VAL_9]] : (!cc.ptr>) -> !cc.ptr
+// CHECK: %[[VAL_13:.*]] = cc.offsetof !cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}> [1] : i64
+// CHECK: %[[VAL_14:.*]] = llvm.mlir.addressof @test_0.kernelName : !llvm.ptr>
+// CHECK: %[[VAL_15:.*]] = cc.cast %[[VAL_14]] : (!llvm.ptr>) -> !cc.ptr
+// CHECK: %[[VAL_16:.*]] = call @altLaunchKernel(%[[VAL_15]], %[[VAL_11]], %[[VAL_12]], %[[VAL_8]], %[[VAL_13]]) : (!cc.ptr, !cc.ptr, !cc.ptr, i64, i64) -> !cc.struct<{!cc.ptr, i64}>
+// CHECK: %[[VAL_17:.*]] = cc.extract_value %[[VAL_16]][0] : (!cc.struct<{!cc.ptr, i64}>) -> !cc.ptr
+// CHECK: %[[VAL_18:.*]] = cc.cast %[[VAL_17]] : (!cc.ptr) -> i64
+// CHECK: %[[VAL_19:.*]] = arith.cmpi ne, %[[VAL_18]], %[[VAL_5]] : i64
+// CHECK: cf.cond_br %[[VAL_19]], ^bb1, ^bb2
+// CHECK: ^bb1:
+// CHECK: %[[VAL_20:.*]] = cc.cast %[[VAL_17]] : (!cc.ptr) -> !cc.ptr, i64}>}>>
+// CHECK: %[[VAL_21:.*]] = cc.compute_ptr %[[VAL_20]][1] : (!cc.ptr, i64}>}>>) -> !cc.ptr, i64}>>
+// CHECK: cf.br ^bb3(%[[VAL_21]] : !cc.ptr, i64}>>)
+// CHECK: ^bb2:
+// CHECK: %[[VAL_22:.*]] = cc.cast %[[VAL_9]] : (!cc.ptr>) -> !cc.ptr, i64}>}>>
+// CHECK: %[[VAL_23:.*]] = cc.compute_ptr %[[VAL_22]][1] : (!cc.ptr, i64}>}>>) -> !cc.ptr, i64}>>
+// CHECK: cf.br ^bb3(%[[VAL_23]] : !cc.ptr, i64}>>)
+// CHECK: ^bb3(%[[VAL_24:.*]]: !cc.ptr, i64}>>):
+// CHECK: %[[VAL_25:.*]] = cc.cast %[[VAL_24]] : (!cc.ptr, i64}>>) -> !cc.ptr>
+// CHECK: %[[VAL_26:.*]] = cc.load %[[VAL_25]] : !cc.ptr>
+// CHECK: %[[VAL_27:.*]] = cc.compute_ptr %[[VAL_24]][1] : (!cc.ptr, i64}>>) -> !cc.ptr
+// CHECK: %[[VAL_28:.*]] = cc.load %[[VAL_27]] : !cc.ptr
+// CHECK: %[[VAL_29:.*]] = cc.cast %[[VAL_0]] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr, !cc.ptr, !cc.ptr}>>
+// CHECK: %[[VAL_30:.*]] = cc.cast %[[VAL_0]] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr>
+// CHECK: %[[VAL_31:.*]] = cc.cast %[[VAL_26]] : (!cc.ptr) -> !cc.ptr
+// CHECK: cc.store %[[VAL_31]], %[[VAL_30]] : !cc.ptr>
+// CHECK: %[[VAL_32:.*]] = cc.compute_ptr %[[VAL_29]][1] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr>
+// CHECK: %[[VAL_33:.*]] = arith.muli %[[VAL_28]], %[[VAL_3]] : i64
+// CHECK: %[[VAL_34:.*]] = cc.cast %[[VAL_26]] : (!cc.ptr) -> !cc.ptr>
+// CHECK: %[[VAL_35:.*]] = cc.compute_ptr %[[VAL_34]]{{\[}}%[[VAL_33]]] : (!cc.ptr>, i64) -> !cc.ptr
+// CHECK: cc.store %[[VAL_35]], %[[VAL_32]] : !cc.ptr>
+// CHECK: %[[VAL_36:.*]] = cc.compute_ptr %[[VAL_29]][2] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr>
+// CHECK: cc.store %[[VAL_35]], %[[VAL_36]] : !cc.ptr>
+// CHECK: call @free(%[[VAL_17]]) : (!cc.ptr) -> ()
// CHECK: return
// CHECK: }
@@ -102,37 +113,48 @@ func.func @test_1(%0: !cc.ptr, !cc.ptr, !cc.ptr, %[[VAL_2:.*]]: i32) {
// CHECK: %[[VAL_3:.*]] = arith.constant 8 : i64
// CHECK: %[[VAL_4:.*]] = constant @test_1.thunk : (!cc.ptr, i1) -> !cc.struct<{!cc.ptr, i64}>
+// CHECK: %[[VAL_5:.*]] = arith.constant 0 : i64
// CHECK: %[[VAL_6:.*]] = cc.undef !cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}>
// CHECK: %[[VAL_7:.*]] = cc.insert_value %[[VAL_2]], %[[VAL_6]][0] : (!cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}>, i32) -> !cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}>
-// CHECK: %[[VAL_9:.*]] = cc.sizeof !cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}> : i64
-// CHECK: %[[VAL_10:.*]] = cc.alloca i8{{\[}}%[[VAL_9]] : i64]
-// CHECK: %[[VAL_11:.*]] = cc.cast %[[VAL_10]] : (!cc.ptr>) -> !cc.ptr, i64}>}>>
-// CHECK: cc.store %[[VAL_7]], %[[VAL_11]] : !cc.ptr, i64}>}>>
-// CHECK: %[[VAL_14:.*]] = cc.func_ptr %[[VAL_4]] : ((!cc.ptr, i1) -> !cc.struct<{!cc.ptr, i64}>) -> !cc.ptr
-// CHECK: %[[VAL_15:.*]] = cc.cast %[[VAL_10]] : (!cc.ptr>) -> !cc.ptr
-// CHECK: %[[VAL_17:.*]] = cc.offsetof !cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}> [1] : i64
-// CHECK: %[[VAL_12:.*]] = llvm.mlir.addressof @test_1.kernelName : !llvm.ptr>
-// CHECK: %[[VAL_13:.*]] = cc.cast %[[VAL_12]] : (!llvm.ptr>) -> !cc.ptr
-// CHECK: call @altLaunchKernel(%[[VAL_13]], %[[VAL_14]], %[[VAL_15]], %[[VAL_9]], %[[VAL_17]]) : (!cc.ptr, !cc.ptr, !cc.ptr, i64, i64) -> ()
-// CHECK: %[[VAL_18:.*]] = cc.cast %[[VAL_10]] : (!cc.ptr>) -> !cc.ptr, i64}>}>>
-// CHECK: %[[VAL_19:.*]] = cc.compute_ptr %[[VAL_18]][1, 0] : (!cc.ptr, i64}>}>>) -> !cc.ptr>
-// CHECK: %[[VAL_20:.*]] = cc.load %[[VAL_19]] : !cc.ptr>
-// CHECK: %[[VAL_21:.*]] = cc.cast %[[VAL_10]] : (!cc.ptr>) -> !cc.ptr, i64}>}>>
-// CHECK: %[[VAL_22:.*]] = cc.compute_ptr %[[VAL_21]][1, 1] : (!cc.ptr, i64}>}>>) -> !cc.ptr
-// CHECK: %[[VAL_23:.*]] = cc.load %[[VAL_22]] : !cc.ptr
-// CHECK: %[[VAL_24:.*]] = cc.cast %[[VAL_0]] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr, !cc.ptr, !cc.ptr}>>
-// CHECK: %[[VAL_25:.*]] = cc.cast %[[VAL_0]] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr>
-// CHECK: %[[VAL_26:.*]] = cc.load %[[VAL_25]] : !cc.ptr>
-// CHECK: call @free(%[[VAL_26]]) : (!cc.ptr) -> ()
-// CHECK: %[[VAL_27:.*]] = cc.cast %[[VAL_20]] : (!cc.ptr) -> !cc.ptr
-// CHECK: cc.store %[[VAL_27]], %[[VAL_25]] : !cc.ptr>
-// CHECK: %[[VAL_28:.*]] = cc.compute_ptr %[[VAL_24]][1] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr>
-// CHECK: %[[VAL_29:.*]] = arith.muli %[[VAL_23]], %[[VAL_3]] : i64
-// CHECK: %[[VAL_30:.*]] = cc.cast %[[VAL_20]] : (!cc.ptr) -> !cc.ptr>
-// CHECK: %[[VAL_31:.*]] = cc.compute_ptr %[[VAL_30]]{{\[}}%[[VAL_29]]] : (!cc.ptr>, i64) -> !cc.ptr
-// CHECK: cc.store %[[VAL_31]], %[[VAL_28]] : !cc.ptr>
-// CHECK: %[[VAL_32:.*]] = cc.compute_ptr %[[VAL_24]][2] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr>
-// CHECK: cc.store %[[VAL_31]], %[[VAL_32]] : !cc.ptr>
+// CHECK: %[[VAL_8:.*]] = cc.sizeof !cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}> : i64
+// CHECK: %[[VAL_9:.*]] = cc.alloca i8{{\[}}%[[VAL_8]] : i64]
+// CHECK: %[[VAL_10:.*]] = cc.cast %[[VAL_9]] : (!cc.ptr>) -> !cc.ptr, i64}>}>>
+// CHECK: cc.store %[[VAL_7]], %[[VAL_10]] : !cc.ptr, i64}>}>>
+// CHECK: %[[VAL_11:.*]] = cc.func_ptr %[[VAL_4]] : ((!cc.ptr, i1) -> !cc.struct<{!cc.ptr, i64}>) -> !cc.ptr
+// CHECK: %[[VAL_12:.*]] = cc.cast %[[VAL_9]] : (!cc.ptr>) -> !cc.ptr
+// CHECK: %[[VAL_13:.*]] = cc.offsetof !cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}> [1] : i64
+// CHECK: %[[VAL_14:.*]] = llvm.mlir.addressof @test_1.kernelName : !llvm.ptr>
+// CHECK: %[[VAL_15:.*]] = cc.cast %[[VAL_14]] : (!llvm.ptr>) -> !cc.ptr
+// CHECK: %[[VAL_16:.*]] = call @altLaunchKernel(%[[VAL_15]], %[[VAL_11]], %[[VAL_12]], %[[VAL_8]], %[[VAL_13]]) : (!cc.ptr, !cc.ptr, !cc.ptr, i64, i64) -> !cc.struct<{!cc.ptr, i64}>
+// CHECK: %[[VAL_17:.*]] = cc.extract_value %[[VAL_16]][0] : (!cc.struct<{!cc.ptr, i64}>) -> !cc.ptr
+// CHECK: %[[VAL_18:.*]] = cc.cast %[[VAL_17]] : (!cc.ptr) -> i64
+// CHECK: %[[VAL_19:.*]] = arith.cmpi ne, %[[VAL_18]], %[[VAL_5]] : i64
+// CHECK: cf.cond_br %[[VAL_19]], ^bb1, ^bb2
+// CHECK: ^bb1:
+// CHECK: %[[VAL_20:.*]] = cc.cast %[[VAL_17]] : (!cc.ptr) -> !cc.ptr, i64}>}>>
+// CHECK: %[[VAL_21:.*]] = cc.compute_ptr %[[VAL_20]][1] : (!cc.ptr, i64}>}>>) -> !cc.ptr, i64}>>
+// CHECK: cf.br ^bb3(%[[VAL_21]] : !cc.ptr, i64}>>)
+// CHECK: ^bb2:
+// CHECK: %[[VAL_22:.*]] = cc.cast %[[VAL_9]] : (!cc.ptr>) -> !cc.ptr, i64}>}>>
+// CHECK: %[[VAL_23:.*]] = cc.compute_ptr %[[VAL_22]][1] : (!cc.ptr, i64}>}>>) -> !cc.ptr, i64}>>
+// CHECK: cf.br ^bb3(%[[VAL_23]] : !cc.ptr, i64}>>)
+// CHECK: ^bb3(%[[VAL_24:.*]]: !cc.ptr, i64}>>):
+// CHECK: %[[VAL_25:.*]] = cc.cast %[[VAL_24]] : (!cc.ptr, i64}>>) -> !cc.ptr>
+// CHECK: %[[VAL_26:.*]] = cc.load %[[VAL_25]] : !cc.ptr>
+// CHECK: %[[VAL_27:.*]] = cc.compute_ptr %[[VAL_24]][1] : (!cc.ptr, i64}>>) -> !cc.ptr
+// CHECK: %[[VAL_28:.*]] = cc.load %[[VAL_27]] : !cc.ptr
+// CHECK: %[[VAL_29:.*]] = cc.cast %[[VAL_0]] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr, !cc.ptr, !cc.ptr}>>
+// CHECK: %[[VAL_30:.*]] = cc.cast %[[VAL_0]] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr>
+// CHECK: %[[VAL_31:.*]] = cc.cast %[[VAL_26]] : (!cc.ptr) -> !cc.ptr
+// CHECK: cc.store %[[VAL_31]], %[[VAL_30]] : !cc.ptr>
+// CHECK: %[[VAL_32:.*]] = cc.compute_ptr %[[VAL_29]][1] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr>
+// CHECK: %[[VAL_33:.*]] = arith.muli %[[VAL_28]], %[[VAL_3]] : i64
+// CHECK: %[[VAL_34:.*]] = cc.cast %[[VAL_26]] : (!cc.ptr) -> !cc.ptr>
+// CHECK: %[[VAL_35:.*]] = cc.compute_ptr %[[VAL_34]]{{\[}}%[[VAL_33]]] : (!cc.ptr>, i64) -> !cc.ptr
+// CHECK: cc.store %[[VAL_35]], %[[VAL_32]] : !cc.ptr>
+// CHECK: %[[VAL_36:.*]] = cc.compute_ptr %[[VAL_29]][2] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr>
+// CHECK: cc.store %[[VAL_35]], %[[VAL_36]] : !cc.ptr>
+// CHECK: call @free(%[[VAL_17]]) : (!cc.ptr) -> ()
// CHECK: return
// CHECK: }
@@ -140,40 +162,42 @@ func.func @test_1(%0: !cc.ptr, !cc.ptr, !cc.ptr, %[[VAL_1:.*]]: i1) -> !cc.struct<{!cc.ptr, i64}> {
-// CHECK: %[[VAL_3:.*]] = cc.cast %[[VAL_0]] : (!cc.ptr) -> !cc.ptr, i64}>}>>
-// CHECK: %[[VAL_4:.*]] = cc.load %[[VAL_3]] : !cc.ptr, i64}>}>>
-// CHECK: %[[VAL_7:.*]] = cc.sizeof !cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}> : i64
-// CHECK: %[[VAL_10:.*]] = cc.extract_value %[[VAL_4]][0] : (!cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}>) -> i32
-// CHECK: %[[VAL_15:.*]] = call @__nvqpp__mlirgen__test_0(%[[VAL_10]]) : (i32) -> !cc.stdvec
+// CHECK: %[[VAL_2:.*]] = cc.cast %[[VAL_0]] : (!cc.ptr) -> !cc.ptr, i64}>}>>
+// CHECK: %[[VAL_3:.*]] = cc.load %[[VAL_2]] : !cc.ptr, i64}>}>>
+// CHECK: %[[VAL_4:.*]] = cc.sizeof !cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}> : i64
+// CHECK: %[[VAL_5:.*]] = cc.extract_value %[[VAL_3]][0] : (!cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}>) -> i32
+// CHECK: %[[VAL_6:.*]] = call @__nvqpp__mlirgen__test_0(%[[VAL_5]]) : (i32) -> !cc.stdvec
+// CHECK: %[[VAL_7:.*]] = cc.compute_ptr %[[VAL_2]][1] : (!cc.ptr, i64}>}>>) -> !cc.ptr, i64}>>
+// CHECK: %[[VAL_8:.*]] = cc.cast %[[VAL_7]] : (!cc.ptr, i64}>>) -> !cc.ptr>
+// CHECK: cc.store %[[VAL_6]], %[[VAL_8]] : !cc.ptr>
// CHECK: cf.cond_br %[[VAL_1]], ^bb1, ^bb2
// CHECK: ^bb1:
-// CHECK: %[[VAL_11:.*]] = cc.compute_ptr %[[VAL_3]][1] : (!cc.ptr, i64}>}>>) -> !cc.ptr, i64}>
-// CHECK: %[[VAL_16:.*]] = cc.cast %[[VAL_11]] : (!cc.ptr, i64}>>) -> !cc.ptr>
-// CHECK: cc.store %[[VAL_15]], %[[VAL_16]] : !cc.ptr>
-// CHECK: %[[VAL_12:.*]] = cc.cast %[[VAL_11]] : (!cc.ptr, i64}>>) -> !cc.ptr, i64}>>
-// CHECK: %[[VAL_13:.*]] = call @__nvqpp_createDynamicResult(%[[VAL_0]], %[[VAL_7]], %[[VAL_12]]) : (!cc.ptr, i64, !cc.ptr, i64}>>) -> !cc.struct<{!cc.ptr, i64}>
-// CHECK: return %[[VAL_13]] : !cc.struct<{!cc.ptr, i64}>
+// CHECK: %[[VAL_9:.*]] = cc.cast %[[VAL_7]] : (!cc.ptr, i64}>>) -> !cc.ptr, i64}>>
+// CHECK: %[[VAL_10:.*]] = cc.offsetof !cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}> [1] : i64
+// CHECK: %[[VAL_11:.*]] = call @__nvqpp_createDynamicResult(%[[VAL_0]], %[[VAL_4]], %[[VAL_9]], %[[VAL_10]]) : (!cc.ptr, i64, !cc.ptr, i64}>>, i64) -> !cc.struct<{!cc.ptr, i64}>
+// CHECK: return %[[VAL_11]] : !cc.struct<{!cc.ptr, i64}>
// CHECK: ^bb2:
-// CHECK: %[[VAL_14:.*]] = call @__nvqpp_zeroDynamicResult() : () -> !cc.struct<{!cc.ptr, i64}>
-// CHECK: return %[[VAL_14]] : !cc.struct<{!cc.ptr, i64}>
+// CHECK: %[[VAL_12:.*]] = call @__nvqpp_zeroDynamicResult() : () -> !cc.struct<{!cc.ptr, i64}>
+// CHECK: return %[[VAL_12]] : !cc.struct<{!cc.ptr, i64}>
// CHECK: }
// CHECK-LABEL: func.func @test_1.thunk(
// CHECK-SAME: %[[VAL_0:.*]]: !cc.ptr, %[[VAL_1:.*]]: i1) -> !cc.struct<{!cc.ptr, i64}> {
-// CHECK: %[[VAL_3:.*]] = cc.cast %[[VAL_0]] : (!cc.ptr) -> !cc.ptr, i64}>}>>
-// CHECK: %[[VAL_4:.*]] = cc.load %[[VAL_3]] : !cc.ptr, i64}>}>>
-// CHECK: %[[VAL_7:.*]] = cc.sizeof !cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}> : i64
-// CHECK: %[[VAL_10:.*]] = cc.extract_value %[[VAL_4]][0] : (!cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}>) -> i32
-// CHECK: %[[VAL_15:.*]] = call @__nvqpp__mlirgen__test_1(%[[VAL_10]]) : (i32) -> !cc.stdvec
+// CHECK: %[[VAL_2:.*]] = cc.cast %[[VAL_0]] : (!cc.ptr) -> !cc.ptr, i64}>}>>
+// CHECK: %[[VAL_3:.*]] = cc.load %[[VAL_2]] : !cc.ptr, i64}>}>>
+// CHECK: %[[VAL_4:.*]] = cc.sizeof !cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}> : i64
+// CHECK: %[[VAL_5:.*]] = cc.extract_value %[[VAL_3]][0] : (!cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}>) -> i32
+// CHECK: %[[VAL_6:.*]] = call @__nvqpp__mlirgen__test_1(%[[VAL_5]]) : (i32) -> !cc.stdvec
+// CHECK: %[[VAL_7:.*]] = cc.compute_ptr %[[VAL_2]][1] : (!cc.ptr, i64}>}>>) -> !cc.ptr, i64}>>
+// CHECK: %[[VAL_8:.*]] = cc.cast %[[VAL_7]] : (!cc.ptr, i64}>>) -> !cc.ptr>
+// CHECK: cc.store %[[VAL_6]], %[[VAL_8]] : !cc.ptr>
// CHECK: cf.cond_br %[[VAL_1]], ^bb1, ^bb2
// CHECK: ^bb1:
-// CHECK: %[[VAL_11:.*]] = cc.compute_ptr %[[VAL_3]][1] : (!cc.ptr, i64}>}>>) -> !cc.ptr, i64}>>
-// CHECK: %[[VAL_16:.*]] = cc.cast %[[VAL_11]] : (!cc.ptr, i64}>>) -> !cc.ptr>
-// CHECK: cc.store %[[VAL_15]], %[[VAL_16]] : !cc.ptr>
-// CHECK: %[[VAL_12:.*]] = cc.cast %[[VAL_11]] : (!cc.ptr, i64}>>) -> !cc.ptr, i64}>>
-// CHECK: %[[VAL_13:.*]] = call @__nvqpp_createDynamicResult(%[[VAL_0]], %[[VAL_7]], %[[VAL_12]]) : (!cc.ptr, i64, !cc.ptr, i64}>>) -> !cc.struct<{!cc.ptr, i64}>
-// CHECK: return %[[VAL_13]] : !cc.struct<{!cc.ptr, i64}>
+// CHECK: %[[VAL_9:.*]] = cc.cast %[[VAL_7]] : (!cc.ptr, i64}>>) -> !cc.ptr, i64}>>
+// CHECK: %[[VAL_10:.*]] = cc.offsetof !cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}> [1] : i64
+// CHECK: %[[VAL_11:.*]] = call @__nvqpp_createDynamicResult(%[[VAL_0]], %[[VAL_4]], %[[VAL_9]], %[[VAL_10]]) : (!cc.ptr, i64, !cc.ptr, i64}>>, i64) -> !cc.struct<{!cc.ptr, i64}>
+// CHECK: return %[[VAL_11]] : !cc.struct<{!cc.ptr, i64}>
// CHECK: ^bb2:
-// CHECK: %[[VAL_14:.*]] = call @__nvqpp_zeroDynamicResult() : () -> !cc.struct<{!cc.ptr, i64}>
-// CHECK: return %[[VAL_14]] : !cc.struct<{!cc.ptr, i64}>
+// CHECK: %[[VAL_12:.*]] = call @__nvqpp_zeroDynamicResult() : () -> !cc.struct<{!cc.ptr, i64}>
+// CHECK: return %[[VAL_12]] : !cc.struct<{!cc.ptr, i64}>
// CHECK: }
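Taken together, the new test_0/test_1 host-side expectations spell out an unpacking recipe: read the {ptr, i64} pair from the selected result slot, rewrite the sret vector's three pointers, and free whatever buffer the launcher handed back. A hedged C++ rendering of that recipe follows; the struct and function names are illustrative only, not runtime types.

    // Hedged sketch of the host-side unpacking the CHECK lines encode.
    #include <cstddef>
    #include <cstdlib>

    struct vec_span {     // the {ptr, i64} pair held in the result slot
      char *data;
      std::size_t count;  // element count; bytes = count * element size
    };

    struct vector_triple { // a std::vector<T> viewed as {begin, end, cap}
      char *begin;
      char *end;
      char *cap;
    };

    // `slot` is the selected result slot (the ^bb3 block argument above):
    // it points into the launcher's buffer when one was returned, else into
    // the original message buffer. `eleSize` is 4 (i32) for test_0 and
    // 8 (f64) for test_1.
    void unpack(vector_triple *sret, const vec_span *slot, std::size_t eleSize,
                void *launcherBuffer) {
      sret->begin = slot->data;
      sret->end = slot->data + slot->count * eleSize; // the arith.muli step
      sret->cap = sret->end;                          // end == capacity here
      // Unconditional release, mirroring the trailing `call @free(...)`;
      // on the purely local path the pointer is null and free is a no-op.
      std::free(launcherBuffer);
    }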