diff --git a/docker/build/assets.Dockerfile b/docker/build/assets.Dockerfile index f7d34e9268..73e20627ad 100644 --- a/docker/build/assets.Dockerfile +++ b/docker/build/assets.Dockerfile @@ -286,6 +286,7 @@ RUN cd /cuda-quantum && source scripts/configure_build.sh && \ # The tests is marked correctly as requiring nvcc, but since nvcc # is available during the build we need to filter it manually. filtered=" --filter-out MixedLanguage/cuda-1"; \ + filtered+="|AST-Quake/calling_convention"; \ fi && \ "$LLVM_INSTALL_PREFIX/bin/llvm-lit" -v build/test \ --param nvqpp_site_config=build/test/lit.site.cfg.py ${filtered} && \ diff --git a/include/cudaq/Optimizer/Builder/Factory.h b/include/cudaq/Optimizer/Builder/Factory.h index 868cf4c861..24e933117a 100644 --- a/include/cudaq/Optimizer/Builder/Factory.h +++ b/include/cudaq/Optimizer/Builder/Factory.h @@ -236,6 +236,10 @@ createMonotonicLoop(mlir::OpBuilder &builder, mlir::Location loc, bool hasHiddenSRet(mlir::FunctionType funcTy); +/// Check a function to see if argument 0 has the `sret` attribute. Typically, +/// one may find this on a host-side entry point function. +bool hasSRet(mlir::func::FuncOp funcOp); + /// Convert the function type \p funcTy to a signature compatible with the code /// on the host side. This will add hidden arguments, such as the `this` /// pointer, convert some results to `sret` pointers, etc. @@ -251,7 +255,8 @@ bool isX86_64(mlir::ModuleOp); bool isAArch64(mlir::ModuleOp); /// A small structure may be passed as two arguments on the host side. (e.g., on -/// the X86-64 ABI.) If \p ty is not a `struct`, this returns `false`. +/// the X86-64 ABI.) If \p ty is not a `struct`, this returns `false`. Note +/// also, some small structs may be packed into a single register. bool structUsesTwoArguments(mlir::Type ty); std::optional getIntIfConstant(mlir::Value value); diff --git a/lib/Optimizer/Builder/Factory.cpp b/lib/Optimizer/Builder/Factory.cpp index 1c14ec349a..73b66cdac3 100644 --- a/lib/Optimizer/Builder/Factory.cpp +++ b/lib/Optimizer/Builder/Factory.cpp @@ -18,6 +18,9 @@ using namespace mlir; namespace cudaq::opt { +// The common small struct limit for architectures cudaq is supporting. +static constexpr unsigned CommonSmallStructSize = 128; + bool factory::isX86_64(ModuleOp module) { std::string triple; if (auto ta = module->getAttr(targetTripleAttrName)) @@ -302,33 +305,6 @@ cc::LoopOp factory::createMonotonicLoop( return loop; } -// FIXME: some ABIs may return a small struct in registers rather than via an -// sret pointer. -// -// On x86_64, -// pair of: argument return value packed from msb to lsb -// i32 : i64 i64 (second, first) -// i64 : i64, i64 { i64, i64 } -// f32 : <2 x float> <2 x float> -// f64 : double, double { double, double } -// -// On aarch64, -// pair of: argument return value packed from msb to lsb -// i32 : i64 i64 (second, first) -// i64 : [2 x i64] [2 x i64] -// f32 : [2 x float] { float, float } -// f64 : [2 x double] { double, double } -bool factory::hasHiddenSRet(FunctionType funcTy) { - // If a function has more than 1 result, the results are promoted to a - // structured return argument. Otherwise, if there is 1 result and it is an - // aggregate type, then it is promoted to a structured return argument. 
- auto numResults = funcTy.getNumResults(); - return numResults > 1 || - (numResults == 1 && funcTy.getResult(0) - .isa()); -} - cc::StructType factory::stlStringType(MLIRContext *ctx) { auto i8Ty = IntegerType::get(ctx, 8); auto ptrI8Ty = cc::PointerType::get(i8Ty); @@ -361,8 +337,8 @@ Type factory::getSRetElementType(FunctionType funcTy) { auto *ctx = funcTy.getContext(); if (funcTy.getNumResults() > 1) return cc::StructType::get(ctx, funcTy.getResults()); - if (isa(funcTy.getResult(0))) - return getDynamicBufferType(ctx); + if (auto spanTy = dyn_cast(funcTy.getResult(0))) + return stlVectorType(spanTy.getElementType()); return funcTy.getResult(0); } @@ -403,33 +379,49 @@ static Type convertToHostSideType(Type ty) { // function tries to simulate GCC argument passing conventions. classify() also // has a number of FIXME comments, where it diverges from the referenced ABI. // Empirical evidence show that on x86_64, integers and floats are packed in -// integers of size 32 or 64 together, unless the float member fits by itself. +// integers of size 8, 16, 24, 32 or 64 together, unless the float member fits +// by itself. static bool shouldExpand(SmallVectorImpl &packedTys, cc::StructType structTy) { if (structTy.isEmpty()) return false; auto *ctx = structTy.getContext(); unsigned bits = 0; + auto scaleBits = [&](unsigned size) { + if (size < 32) + size = (size + 7) & ~7u; + if (size > 32 && size <= 64) + size = 64; + return size; + }; // First split the members into a "lo" set and a "hi" set. SmallVector set1; SmallVector set2; for (auto ty : structTy.getMembers()) { if (auto intTy = dyn_cast(ty)) { - bits += intTy.getWidth(); - if (bits <= 64) + auto addBits = scaleBits(intTy.getWidth()); + if (bits + addBits <= 64) { + bits += addBits; set1.push_back(ty); - else + } else { + bits = std::max(bits, 64u) + addBits; set2.push_back(ty); + } } else if (auto fltTy = dyn_cast(ty)) { - bits += fltTy.getWidth(); - if (bits <= 64) + auto addBits = fltTy.getWidth(); + if (bits + addBits <= 64) { + bits += addBits; set1.push_back(ty); - else + } else { + bits = std::max(bits, 64u) + addBits; set2.push_back(ty); + } } else { return false; } + if (bits > CommonSmallStructSize) + return false; } // Process the sets. If the set has anything integral, use integer. If the set @@ -441,12 +433,23 @@ static bool shouldExpand(SmallVectorImpl &packedTys, return true; return false; }; + auto intSetSize = [&](auto theSet) { + unsigned size = 0; + for (auto ty : theSet) + size += scaleBits(ty.getIntOrFloatBitWidth()); + return size; + }; auto processMembers = [&](auto theSet, unsigned packIdx) { if (useInt(theSet)) { - packedTys[packIdx] = IntegerType::get(ctx, bits > 32 ? 64 : 32); + auto size = intSetSize(theSet); + if (size <= 32) + packedTys[packIdx] = IntegerType::get(ctx, size); + else + packedTys[packIdx] = IntegerType::get(ctx, 64); } else if (theSet.size() == 1) { packedTys[packIdx] = theSet[0]; } else { + assert(theSet[0] == FloatType::getF32(ctx) && "must be float"); packedTys[packIdx] = VectorType::get(ArrayRef{2}, theSet[0]); } @@ -454,15 +457,59 @@ static bool shouldExpand(SmallVectorImpl &packedTys, assert(!set1.empty() && "struct must have members"); packedTys.resize(set2.empty() ? 
1 : 2); processMembers(set1, 0); - if (!set2.empty()) - processMembers(set2, 1); + if (set2.empty()) + return false; + processMembers(set2, 1); return true; } +bool factory::hasSRet(func::FuncOp funcOp) { + if (funcOp.getNumArguments() > 0) + if (auto dict = funcOp.getArgAttrDict(0)) + return dict.contains(LLVM::LLVMDialect::getStructRetAttrName()); + return false; +} + +// On x86_64, +// pair of: argument return value packed from msb to lsb +// i32 : i64 i64 (second, first) +// i64 : i64, i64 { i64, i64 } +// f32 : <2 x float> <2 x float> +// f64 : double, double { double, double } +// ptr : ptr, ptr { ptr, ptr } +// +// On aarch64, +// pair of: argument return value packed from msb to lsb +// i32 : i64 i64 (second, first) +// i64 : [2 x i64] [2 x i64] +// f32 : [2 x float] { float, float } +// f64 : [2 x double] { double, double } +// ptr : [2 x i64] [2 x i64] +bool factory::hasHiddenSRet(FunctionType funcTy) { + // If a function has more than 1 result, the results are promoted to a + // structured return argument. Otherwise, if there is 1 result and it is an + // aggregate type, then it is promoted to a structured return argument. + auto numResults = funcTy.getNumResults(); + if (numResults == 0) + return false; + if (numResults > 1) + return true; + auto resTy = funcTy.getResult(0); + if (resTy.isa()) + return true; + if (auto strTy = dyn_cast(resTy)) { + SmallVector packedTys; + bool inRegisters = shouldExpand(packedTys, strTy) || !packedTys.empty(); + return !inRegisters; + } + return false; +} + bool factory::structUsesTwoArguments(mlir::Type ty) { // Unchecked! This is only valid if target is X86-64. auto structTy = dyn_cast(ty); - if (!structTy || structTy.getBitSize() == 0 || structTy.getBitSize() > 128) + if (!structTy || structTy.getBitSize() == 0 || + structTy.getBitSize() > CommonSmallStructSize) return false; SmallVector unused; return shouldExpand(unused, structTy); @@ -486,14 +533,32 @@ FunctionType factory::toHostSideFuncType(FunctionType funcTy, bool addThisPtr, auto *ctx = funcTy.getContext(); SmallVector inputTys; bool hasSRet = false; - if (factory::hasHiddenSRet(funcTy)) { - // When the kernel is returning a std::vector result, the result is - // returned via a sret argument in the first position. When this argument - // is added, the this pointer becomes the second argument. Both are opaque - // pointers at this point. - auto eleTy = convertToHostSideType(getSRetElementType(funcTy)); - inputTys.push_back(cc::PointerType::get(eleTy)); - hasSRet = true; + Type resultTy; + if (funcTy.getNumResults() == 1) + if (auto strTy = dyn_cast(funcTy.getResult(0))) + if (strTy.getBitSize() != 0 && + strTy.getBitSize() <= CommonSmallStructSize) { + SmallVector packedTys; + if (shouldExpand(packedTys, strTy) || !packedTys.empty()) { + if (packedTys.size() == 1) + resultTy = packedTys[0]; + else + resultTy = cc::StructType::get(ctx, packedTys); + } + } + if (!resultTy && funcTy.getNumResults()) { + if (factory::hasHiddenSRet(funcTy)) { + // When the kernel is returning a std::vector result, the result is + // returned via a sret argument in the first position. When this argument + // is added, the this pointer becomes the second argument. Both are opaque + // pointers at this point. 
+ auto eleTy = convertToHostSideType(getSRetElementType(funcTy)); + inputTys.push_back(cc::PointerType::get(eleTy)); + hasSRet = true; + } else { + assert(funcTy.getNumResults() == 1); + resultTy = funcTy.getResult(0); + } } // If this kernel is a plain old function or a static member function, we // don't want to add a hidden `this` argument. @@ -509,20 +574,25 @@ FunctionType factory::toHostSideFuncType(FunctionType funcTy, bool addThisPtr, // On x86_64 and aarch64, a struct that is smaller than 128 bits may be // passed in registers as separate arguments. See classifyArgumentType() // in CodeGen/TargetInfo.cpp. - if (strTy.getBitSize() != 0 && strTy.getBitSize() <= 128) { + if (strTy.getBitSize() != 0 && + strTy.getBitSize() <= CommonSmallStructSize) { if (isX86_64(module)) { SmallVector packedTys; if (shouldExpand(packedTys, strTy)) { for (auto ty : packedTys) inputTys.push_back(ty); continue; + } else if (!packedTys.empty()) { + for (auto ty : packedTys) + inputTys.push_back(ty); + continue; } } else { assert(isAArch64(module) && "aarch64 expected"); if (onlyArithmeticMembers(strTy)) { // Empirical evidence shows that on aarch64, arguments are packed - // into a single i64 or a [2 x i64] typed value based on the size of - // the struct. This is regardless of whether the value(s) are + // into a single i64 or a [2 x i64] typed value based on the size + // of the struct. This is regardless of whether the value(s) are // floating-point or not. if (strTy.getBitSize() > 64) inputTys.push_back(cc::ArrayType::get(ctx, i64Ty, 2)); @@ -542,8 +612,8 @@ FunctionType factory::toHostSideFuncType(FunctionType funcTy, bool addThisPtr, // and it hasn't been converted to a hidden sret argument. if (funcTy.getNumResults() == 0 || hasSRet) return FunctionType::get(ctx, inputTys, {}); - assert(funcTy.getNumResults() == 1); - return FunctionType::get(ctx, inputTys, funcTy.getResults()); + assert(funcTy.getNumResults() == 1 && resultTy); + return FunctionType::get(ctx, inputTys, resultTy); } bool factory::isStdVecArg(Type type) { diff --git a/lib/Optimizer/Transforms/GenKernelExecution.cpp b/lib/Optimizer/Transforms/GenKernelExecution.cpp index 7d693921f1..a4667ce7b5 100644 --- a/lib/Optimizer/Transforms/GenKernelExecution.cpp +++ b/lib/Optimizer/Transforms/GenKernelExecution.cpp @@ -251,7 +251,7 @@ class GenerateKernelExecution builder, loc, cudaq::cc::PointerType::get(i8Ty), fromBuff); builder.create( loc, std::nullopt, cudaq::llvmMemCopyIntrinsic, - SmallVector{outputBuffer, vecFromBuff, bytes, notVolatile}); + ValueRange{outputBuffer, vecFromBuff, bytes, notVolatile}); auto i8ArrTy = cudaq::cc::PointerType::get(cudaq::cc::ArrayType::get(i8Ty)); auto buf1 = cudaq::opt::factory::createCast(builder, loc, i8ArrTy, outputBuffer); @@ -538,80 +538,6 @@ class GenerateKernelExecution return argsCreatorFunc; } - /// If the kernel has an sret argument, then we rewrite the kernel's signature - /// on the target. Note that this requires that the target has the ability to - /// pass stack pointers as function arguments. These stack pointers will - /// obviously only necessarily be valid to the target executing the kernel. - void updateQPUKernelAsSRet(OpBuilder &builder, func::FuncOp funcOp, - FunctionType newFuncTy) { - auto funcTy = funcOp.getFunctionType(); - // We add exactly 1 sret argument regardless of how many fields are folded - // into it. 
- assert(newFuncTy.getNumInputs() == funcTy.getNumInputs() + 1 && - "sret should be a single argument"); - auto *ctx = funcOp.getContext(); - auto eleTy = cudaq::opt::factory::getSRetElementType(funcTy); - NamedAttrList attrs; - attrs.set(LLVM::LLVMDialect::getStructRetAttrName(), TypeAttr::get(eleTy)); - funcOp.insertArgument(0, newFuncTy.getInput(0), attrs.getDictionary(ctx), - funcOp.getLoc()); - auto elePtrTy = cudaq::cc::PointerType::get(eleTy); - OpBuilder::InsertionGuard guard(builder); - SmallVector returnsToErase; - // Update all func.return to store values to the sret block. - funcOp->walk([&](func::ReturnOp retOp) { - auto loc = retOp.getLoc(); - builder.setInsertionPoint(retOp); - auto cast = builder.create(loc, elePtrTy, - funcOp.getArgument(0)); - if (funcOp.getNumResults() > 1) { - for (int i = 0, end = funcOp.getNumResults(); i != end; ++i) { - auto mem = builder.create( - loc, cudaq::cc::PointerType::get(funcTy.getResult(i)), cast, - SmallVector{i}); - builder.create(loc, retOp.getOperands()[i], mem); - } - } else if (auto stdvecTy = - dyn_cast(funcTy.getResult(0))) { - auto stdvec = retOp.getOperands()[0]; - auto eleTy = [&]() -> Type { - // TODO: Fold this conversion into the StdvecDataOp builder. We will - // never get a data buffer which is not byte addressable and where - // the width is less than 8. - if (auto intTy = dyn_cast(stdvecTy.getElementType())) - if (intTy.getWidth() < 8) - return builder.getI8Type(); - return stdvecTy.getElementType(); - }(); - auto i8Ty = cudaq::cc::PointerType::get(builder.getI8Type()); - auto ptrTy = cudaq::cc::PointerType::get(eleTy); - auto data = builder.create(loc, ptrTy, stdvec); - auto mem0 = builder.create( - loc, cudaq::cc::PointerType::get(i8Ty), cast, - SmallVector{0}); - auto mem1 = builder.create( - loc, cudaq::cc::PointerType::get(ptrTy), mem0); - builder.create(loc, data, mem1); - auto i64Ty = builder.getI64Type(); - auto size = builder.create(loc, i64Ty, stdvec); - auto mem2 = builder.create( - loc, cudaq::cc::PointerType::get(i64Ty), cast, - SmallVector{1}); - builder.create(loc, size, mem2); - } else { - builder.create(loc, retOp.getOperands()[0], cast); - } - builder.create(loc); - returnsToErase.push_back(retOp); - }); - for (auto *op : returnsToErase) - op->erase(); - for (std::size_t i = 0, end = funcOp.getNumResults(); i != end; ++i) - funcOp.eraseResult(0); - modifiedDevKernels.insert( - std::pair{funcOp.getName(), newFuncTy.getInput(0)}); - } - /// In the thunk, we need to unpack any `std::vector` objects encoded in the /// packet. Since these have dynamic size, they are encoded as trailing bytes /// by offset and size. The offset is implicit from the values of the @@ -821,58 +747,23 @@ class GenerateKernelExecution // Unpack the arguments in the struct and build the argument list for // the call to the kernel code. SmallVector args; - const bool hiddenSRet = cudaq::opt::factory::hasHiddenSRet(funcTy); - FunctionType newFuncTy = [&]() { - if (hiddenSRet) { - auto sretPtrTy = cudaq::cc::PointerType::get( - cudaq::opt::factory::getSRetElementType(funcTy)); - SmallVector inputTys = {sretPtrTy}; - inputTys.append(funcTy.getInputs().begin(), funcTy.getInputs().end()); - return FunctionType::get(ctx, inputTys, {}); - } - return funcTy; - }(); - int offset = funcTy.getNumInputs(); - if (hiddenSRet) { - // Use the end of the argument block for the return values. 
- auto eleTy = structTy.getMember(offset); - auto mem = builder.create( - loc, cudaq::cc::PointerType::get(eleTy), castOp, - SmallVector{offset}); - auto sretPtrTy = cudaq::cc::PointerType::get( - cudaq::opt::factory::getSRetElementType(funcTy)); - auto sretMem = builder.create(loc, sretPtrTy, mem); - args.push_back(sretMem); - - // Rewrite the original kernel's signature and return op(s). - updateQPUKernelAsSRet(builder, funcOp, newFuncTy); - } + const std::int32_t offset = funcTy.getNumInputs(); for (auto inp : llvm::enumerate(funcTy.getInputs())) { auto [a, t] = processInputValue(loc, builder, trailingData, val, inp.value(), inp.index(), structTy); trailingData = t; args.push_back(a); } - auto call = builder.create(loc, newFuncTy.getResults(), + auto call = builder.create(loc, funcTy.getResults(), funcOp.getName(), args); - // If and only if the kernel returns non-sret results, then take those - // values and store them in the results section of the struct. They will - // eventually be returned to the original caller. - if (!hiddenSRet && funcTy.getNumResults() == 1) { - auto eleTy = structTy.getMember(offset); - auto mem = builder.create( - loc, cudaq::cc::PointerType::get(eleTy), castOp, - SmallVector{offset}); - builder.create(loc, call.getResult(0), mem); - } - - // If the original result was a std::vector, then depending on whether - // this is client-server or not, the thunk function packs the dynamic return - // data into a message buffer or just returns a pointer to the shared heap - // allocation, resp. - bool hasVectorResult = funcTy.getNumResults() == 1 && - isa(funcTy.getResult(0)); + const bool hasVectorResult = + funcTy.getNumResults() == 1 && + isa(funcTy.getResult(0)); if (hasVectorResult) { + // If the original result was a std::vector, then depending on whether + // this is client-server or not, the thunk function packs the dynamic + // return data into a message buffer or just returns a pointer to the + // shared heap allocation, resp. auto *currentBlock = builder.getBlock(); auto *reg = currentBlock->getParent(); auto *thenBlock = builder.createBlock(reg); @@ -881,23 +772,53 @@ class GenerateKernelExecution builder.create(loc, isClientServer, thenBlock, elseBlock); builder.setInsertionPointToEnd(thenBlock); - int offset = funcTy.getNumInputs(); auto gepRes = builder.create( loc, cudaq::cc::PointerType::get(structTy.getMember(offset)), castOp, - SmallVector{offset}); - auto gepRes2 = builder.create( + ArrayRef{offset}); + auto resAsVec = builder.create( + loc, cudaq::cc::PointerType::get(funcTy.getResult(0)), gepRes); + builder.create(loc, call.getResult(0), resAsVec); + auto resAsArg = builder.create( loc, cudaq::cc::PointerType::get(thunkTy.getResults()[0]), gepRes); // createDynamicResult packs the input values and the dynamic results // into a single buffer to pass back as a message. 
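        // A rough sketch of what comes back, inferred from the QIR checks in
        // test/Quake-QIR/return_values.qke (the runtime owns the real
        // implementation of __nvqpp_createDynamicResult):
        //   newSize = structSize + <bytes of span data>
        //   buf     = malloc(newSize);
        //   memcpy(buf, msgBuffer, structSize);   // static args + result slots
        //   ...the span data is appended and {buf, newSize} is handed back as {i8*, i64}.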
auto res = builder.create( loc, thunkTy.getResults()[0], "__nvqpp_createDynamicResult", - ValueRange{thunkEntry->getArgument(0), structSize, gepRes2}); + ValueRange{thunkEntry->getArgument(0), structSize, resAsArg}); builder.create(loc, res.getResult(0)); builder.setInsertionPointToEnd(elseBlock); + auto eleTy = structTy.getMember(offset); + auto memTy = cudaq::cc::PointerType::get(eleTy); + auto mem = builder.create( + loc, memTy, castOp, SmallVector{offset}); + auto resPtrTy = cudaq::cc::PointerType::get(call.getResult(0).getType()); + auto castMem = builder.create(loc, resPtrTy, mem); + builder.create(loc, call.getResult(0), castMem); + } else { + // FIXME: Should check for recursive vector case. + // If the kernel returns non-dynamic results (no spans), then take those + // values and store them in the results section of the struct. They will + // eventually be returned to the original caller. + if (funcTy.getNumResults()) { + for (std::int32_t o = 0; + o < static_cast(funcTy.getNumResults()); ++o) { + auto eleTy = structTy.getMember(offset + o); + auto memTy = cudaq::cc::PointerType::get(eleTy); + auto mem = builder.create( + loc, memTy, castOp, + SmallVector{offset + o}); + auto resTy = call.getResult(o).getType(); + auto resPtrTy = cudaq::cc::PointerType::get(resTy); + Value castMem = mem; + if (resPtrTy != mem.getType()) + castMem = builder.create(loc, resPtrTy, mem); + builder.create(loc, call.getResult(o), castMem); + } + } } // zeroDynamicResult is used by models other than client-server. It assumes - // that no messages need to be sent, the CPU and QPU code share a memory - // space, and therefore skips making any copies. + // that no messages need to be sent and that the CPU and QPU code share a + // memory space. Therefore, making any copies can be skipped. auto zeroRes = builder.create(loc, thunkTy.getResults()[0], "__nvqpp_zeroDynamicResult", ValueRange{}); @@ -1125,11 +1046,10 @@ class GenerateKernelExecution func::FuncOp thunkFunc) { auto *ctx = builder.getContext(); auto i64Ty = builder.getI64Type(); - auto offset = devFuncTy.getNumInputs(); + std::int32_t offset = devFuncTy.getNumInputs(); auto thunkTy = getThunkType(ctx); auto structPtrTy = cudaq::cc::PointerType::get(structTy); Block *hostFuncEntryBlock = hostFunc.addEntryBlock(); - const bool hiddenSRet = cudaq::opt::factory::hasHiddenSRet(devFuncTy); OpBuilder::InsertionGuard guard(builder); builder.setInsertionPointToStart(hostFuncEntryBlock); @@ -1170,7 +1090,7 @@ class GenerateKernelExecution // launch kernel. 
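      // Callable arguments are not passed as raw pointers; the host side asks the
      // runtime for a linkable kernel key (an i64) and stores that key in the
      // message buffer instead, as the next few lines do.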
if (isa(quakeTy)) { auto kernKey = builder.create( - loc, builder.getI64Type(), cudaq::runtime::getLinkableKernelKey, + loc, i64Ty, cudaq::runtime::getLinkableKernelKey, ValueRange{arg}); stVal = builder.create( loc, stVal.getType(), stVal, kernKey.getResult(0), idx); @@ -1308,8 +1228,8 @@ class GenerateKernelExecution std::int32_t idx = inp.index(); Type quakeTy = devFuncTy.getInput(idx); if (auto stdvecTy = dyn_cast(quakeTy)) { - auto bytes = builder.create( - loc, builder.getI64Type(), stVal, idx); + auto bytes = builder.create(loc, i64Ty, + stVal, idx); assert(stdvecTy == devFuncTy.getInput(idx)); auto ptrInTy = cast(inTy); vecToBuffer = encodeVectorData(loc, builder, bytes, stdvecTy, arg, @@ -1351,7 +1271,6 @@ class GenerateKernelExecution loc, cudaq::opt::factory::stlVectorType(ptrI8Ty)); auto arrPtrTy = cudaq::cc::ArrayType::get(ctx, ptrI8Ty, count); Value buffer = builder.create(loc, arrPtrTy); - auto i64Ty = builder.getI64Type(); auto buffSize = builder.create(loc, i64Ty, arrPtrTy); auto ptrPtrTy = cudaq::cc::PointerType::get(ptrI8Ty); auto cast1 = builder.create(loc, ptrPtrTy, buffer); @@ -1458,61 +1377,76 @@ class GenerateKernelExecution // result value(s) from the struct returned by `launchKernel` and return // them to our caller. SmallVector results; - const bool multiResult = devFuncTy.getResults().size() > 1; - for (auto res : llvm::enumerate(devFuncTy.getResults())) { - int off = res.index() + offset; - if (auto vecTy = dyn_cast(res.value())) { - auto eleTy = vecTy.getElementType(); - auto ptrTy = cudaq::cc::PointerType::get(eleTy); - auto gep0 = builder.create( - loc, cudaq::cc::PointerType::get(ptrTy), temp, - SmallVector{0, off, 0}); - auto dataPtr = builder.create(loc, gep0); - auto lenPtrTy = cudaq::cc::PointerType::get(builder.getI64Type()); - auto gep1 = builder.create( - loc, lenPtrTy, temp, - SmallVector{0, off, 1}); - auto vecLen = builder.create(loc, gep1); - if (vecTy.getElementType() == builder.getI1Type()) { - genStdvecBoolFromInitList(loc, builder, - hostFuncEntryBlock->getArguments().front(), - dataPtr, vecLen); - } else { - cudaq::IRBuilder irBuilder(builder); - Value tSize = irBuilder.getByteSizeOfType(loc, eleTy); - if (!tSize) { - TODO_loc(loc, "unhandled vector element type"); - return; - } - genStdvecTFromInitList(loc, builder, - hostFuncEntryBlock->getArguments().front(), - dataPtr, tSize, vecLen); - } - offset++; + auto hostFuncTy = hostFunc.getFunctionType(); + assert((hostFuncTy.getResults().empty() || + (hostFuncTy.getNumResults() == 1)) && + "C++ function expected to have 0 or 1 return value"); + const bool resultVal = !hostFuncTy.getResults().empty(); + if (resultVal || cudaq::opt::factory::hasSRet(hostFunc)) { + // Host function returns a value. Either returning by value or via an sret + // reference. 
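    // In C-like terms, the two shapes handled below are roughly (a sketch, not
    // literal output):
    //   T    wrapper(args...);            // small result comes back by value in registers
    //   void wrapper(T *sret, args...);   // result is written through the sret pointer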
+ if (resultVal) { + Type res0Ty = structTy.getMember(offset); + auto ptrResTy = cudaq::cc::PointerType::get(res0Ty); + auto resPtr = builder.create( + loc, ptrResTy, temp, ArrayRef{0, offset}); + Type castToTy = cudaq::cc::PointerType::get(hostFuncTy.getResult(0)); + auto castResPtr = [&]() -> Value { + if (castToTy == ptrResTy) + return resPtr; + return builder.create(loc, castToTy, resPtr); + }(); + results.push_back(builder.create(loc, castResPtr)); } else { - auto gep0 = builder.create( - loc, cudaq::cc::PointerType::get(structTy.getMember(off)), temp, - SmallVector{0, off}); - auto gep = cudaq::opt::factory::createCast( - builder, loc, cudaq::cc::PointerType::get(res.value()), gep0); - Value loadVal = builder.create(loc, gep); - if (hiddenSRet) { - auto sretPtr = [&]() -> Value { - if (multiResult) - return builder.create( - loc, cudaq::cc::PointerType::get(res.value()), - hostFuncEntryBlock->getArguments().front(), - SmallVector{off}); - return builder.create( - loc, cudaq::cc::PointerType::get(res.value()), - hostFuncEntryBlock->getArguments().front()); - }(); - builder.create(loc, loadVal, sretPtr); + // Check if device is returning a span. If it is, then we will need to + // convert it to a std::vector here. The vector is constructed in-place + // on the sret memory block. + Value arg0 = hostFuncEntryBlock->getArguments().front(); + if (auto spanTy = + dyn_cast(devFuncTy.getResult(0))) { + auto eleTy = spanTy.getElementType(); + auto ptrTy = cudaq::cc::PointerType::get(eleTy); + auto gep0 = builder.create( + loc, cudaq::cc::PointerType::get(ptrTy), temp, + SmallVector{0, offset, 0}); + auto dataPtr = builder.create(loc, gep0); + auto lenPtrTy = cudaq::cc::PointerType::get(i64Ty); + auto gep1 = builder.create( + loc, lenPtrTy, temp, + SmallVector{0, offset, 1}); + auto vecLen = builder.create(loc, gep1); + if (spanTy.getElementType() == builder.getI1Type()) { + genStdvecBoolFromInitList(loc, builder, arg0, dataPtr, vecLen); + } else { + Value tSize = + builder.create(loc, i64Ty, eleTy); + genStdvecTFromInitList(loc, builder, arg0, dataPtr, tSize, vecLen); + } } else { - results.push_back(loadVal); + // Otherwise, we can just copy the aggregate into the sret memory + // block. Uses the size of the host function's sret pointer element + // type for the memcpy, so the device should return an (aggregate) + // value of suitable size. + Type res0Ty = structTy.getMember(offset); + auto ptrResTy = cudaq::cc::PointerType::get(res0Ty); + auto resPtr = builder.create( + loc, ptrResTy, temp, + ArrayRef{0, offset}); + auto castMsgBuff = + builder.create(loc, ptrI8Ty, resPtr); + Type eleTy = + cast(arg0.getType()).getElementType(); + Value bytes = builder.create(loc, i64Ty, eleTy); + auto notVolatile = builder.create(loc, 0, 1); + auto castArg0 = builder.create(loc, ptrI8Ty, arg0); + builder.create( + loc, std::nullopt, cudaq::llvmMemCopyIntrinsic, + ValueRange{castArg0, castMsgBuff, bytes, notVolatile}); } } } + + // Return the result (if any). 
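  // Putting the pieces together for the sret case, the generated host wrapper is
  // roughly the following in C terms (a sketch; the exact IR is checked in
  // test/Quake-QIR/return_values.qke):
  //   void test_N(Agg *sret, void *thisPtr) {
  //     struct { /* args..., */ Agg result; } buffer;
  //     altLaunchKernel(kernelName, test_N_thunk, &buffer, sizeof(buffer), 0);
  //     memcpy(sret, &buffer.result, sizeof(Agg));
  //   }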
builder.create(loc, results); } @@ -1779,45 +1713,11 @@ class GenerateKernelExecution cudaq::opt::factory::createGlobalCtorCall( module, FlatSymbolRefAttr::get(ctx, initFun.getName())); - SmallVector deadCalls; - module.walk([&](func::CallOp call) { - if (!call.getResults().empty()) { - auto callee = call.getCallee(); - auto iter = modifiedDevKernels.find(callee); - if (iter != modifiedDevKernels.end()) { - OpBuilder builder(call); - Type ty = call.getResult(0).getType(); - auto loc = call.getLoc(); - auto strTy = cast( - cast(iter->second).getElementType()); - auto buff = builder.create(loc, strTy); - SmallVector args = {buff}; - args.append(call.getOperands().begin(), call.getOperands().end()); - builder.create(loc, TypeRange{}, callee, args); - auto buffPtrPtr = builder.create( - loc, cudaq::cc::PointerType::get(strTy.getMember(0)), buff, - ArrayRef{0}); - auto buffPtr = builder.create(loc, buffPtrPtr); - auto buffSizePtr = builder.create( - loc, cudaq::cc::PointerType::get(strTy.getMember(1)), buff, - ArrayRef{1}); - auto buffSize = builder.create(loc, buffSizePtr); - auto sv = builder.create(loc, ty, buffPtr, - buffSize); - call.getResult(0).replaceAllUsesWith(sv); - deadCalls.push_back(call); - } - } - }); - for (auto *op : deadCalls) - op->erase(); - LLVM_DEBUG(llvm::dbgs() << "final module:\n" << module << '\n'); } out.keep(); } const DataLayout *dataLayout = nullptr; - DenseMap modifiedDevKernels; }; } // namespace diff --git a/lib/Optimizer/Transforms/LiftArrayAlloc.cpp b/lib/Optimizer/Transforms/LiftArrayAlloc.cpp index ba4a87c29e..9328b78896 100644 --- a/lib/Optimizer/Transforms/LiftArrayAlloc.cpp +++ b/lib/Optimizer/Transforms/LiftArrayAlloc.cpp @@ -191,8 +191,10 @@ class AllocaPattern : public OpRewritePattern { toErase.push_back(user); } if (toGlobal) { - rewriter.setInsertionPointAfter(alloc); - rewriter.replaceOp(alloc, conGlobal); + if (conGlobal) { + rewriter.setInsertionPointAfter(alloc); + rewriter.replaceOp(alloc, conGlobal); + } } else { toErase.push_back(alloc); } diff --git a/targettests/execution/auto_kernel-cpp17.cpp b/targettests/execution/auto_kernel-cpp17.cpp index f3b2f3dc65..04b0353113 100644 --- a/targettests/execution/auto_kernel-cpp17.cpp +++ b/targettests/execution/auto_kernel-cpp17.cpp @@ -7,7 +7,7 @@ ******************************************************************************/ // REQUIRES: c++17 -// RUN: nvq++ %cpp_std --enable-mlir %s -o %t && %t | FileCheck %s +// RUN: nvq++ -std=c++17 --enable-mlir %s -o %t && %t | FileCheck %s #include diff --git a/targettests/execution/auto_kernel.cpp b/targettests/execution/auto_kernel.cpp index f52b13a7f0..1aec262e2a 100644 --- a/targettests/execution/auto_kernel.cpp +++ b/targettests/execution/auto_kernel.cpp @@ -7,7 +7,7 @@ ******************************************************************************/ // REQUIRES: c++20 -// RUN: nvq++ %cpp_std --enable-mlir %s -o %t && %t | FileCheck %s +// RUN: nvq++ --enable-mlir %s -o %t && %t | FileCheck %s #include diff --git a/test/AST-Quake/calling_convention.cpp b/test/AST-Quake/calling_convention.cpp new file mode 100644 index 0000000000..3d2c6e2e4a --- /dev/null +++ b/test/AST-Quake/calling_convention.cpp @@ -0,0 +1,335 @@ +/******************************************************************************* + * Copyright (c) 2022 - 2024 NVIDIA Corporation & Affiliates. * + * All rights reserved. * + * * + * This source code and the accompanying materials are made available under * + * the terms of the Apache License 2.0 which accompanies this distribution. 
* + ******************************************************************************/ + +// This test is only valid for x86_64. +// RUN: if [ `uname -m` = "x86_64" ] ; then \ +// RUN: cudaq-quake %cpp_std %s | cudaq-opt | FileCheck %s ; fi + +#include +#include +#include + +// Tests the host-side signatures of various spec supported kernel arguments and +// results. This file tests the x86_64 calling convention. Other architectures +// differ in their calling conventions. + +//===----------------------------------------------------------------------===// +// test all the basic arithmetic types to deny any regressions. + +struct T0 { + void operator()() __qpu__ {} +}; + +struct T1 { + void operator()(double arg) __qpu__ {} +}; + +struct T2 { + void operator()(float arg) __qpu__ {} +}; + +struct T3 { + void operator()(long long arg) __qpu__ {} +}; + +struct T4 { + void operator()(long arg) __qpu__ {} +}; + +struct T5 { + void operator()(int arg) __qpu__ {} +}; + +struct T6 { + void operator()(short arg) __qpu__ {} +}; + +struct T7 { + void operator()(char arg) __qpu__ {} +}; + +struct T8 { + void operator()(bool arg) __qpu__ {} +}; + +// CHECK-LABEL: func.func @_ZN2T0clEv( +// CHECK-SAME: %[[VAL_0:.*]]: !cc.ptr) { +// CHECK-LABEL: func.func @_ZN2T1clEd( +// CHECK-SAME: %[[VAL_0:.*]]: !cc.ptr, %[[VAL_1:.*]]: f64) { +// CHECK-LABEL: func.func @_ZN2T2clEf( +// CHECK-SAME: %[[VAL_0:.*]]: !cc.ptr, %[[VAL_1:.*]]: f32) { +// CHECK-LABEL: func.func @_ZN2T3clEx( +// CHECK-SAME: %[[VAL_0:.*]]: !cc.ptr, %[[VAL_1:.*]]: i64) { +// CHECK-LABEL: func.func @_ZN2T4clEl( +// CHECK-SAME: %[[VAL_0:.*]]: !cc.ptr, %[[VAL_1:.*]]: i64) { +// CHECK-LABEL: func.func @_ZN2T5clEi( +// CHECK-SAME: %[[VAL_0:.*]]: !cc.ptr, %[[VAL_1:.*]]: i32) { +// CHECK-LABEL: func.func @_ZN2T6clEs( +// CHECK-SAME: %[[VAL_0:.*]]: !cc.ptr, %[[VAL_1:.*]]: i16) { +// CHECK-LABEL: func.func @_ZN2T7clEc( +// CHECK-SAME: %[[VAL_0:.*]]: !cc.ptr, %[[VAL_1:.*]]: i8) { +// CHECK-LABEL: func.func @_ZN2T8clEb( +// CHECK-SAME: %[[VAL_0:.*]]: !cc.ptr, %[[VAL_1:.*]]: i1) { + +struct R0 { + void operator()() __qpu__ {} +}; + +struct R1 { + double operator()() __qpu__ { return {}; } +}; + +struct R2 { + float operator()() __qpu__ { return {}; } +}; + +struct R3 { + long long operator()() __qpu__ { return {}; } +}; + +struct R4 { + long operator()() __qpu__ { return {}; } +}; + +struct R5 { + int operator()() __qpu__ { return {}; } +}; + +struct R6 { + short operator()() __qpu__ { return {}; } +}; + +struct R7 { + char operator()() __qpu__ { return {}; } +}; + +struct R8 { + bool operator()() __qpu__ { return {}; } +}; + +// CHECK-LABEL: func.func @_ZN2R0clEv(%arg0: !cc.ptr) { +// CHECK-LABEL: func.func @_ZN2R1clEv(%arg0: !cc.ptr) -> f64 { +// CHECK-LABEL: func.func @_ZN2R2clEv(%arg0: !cc.ptr) -> f32 { +// CHECK-LABEL: func.func @_ZN2R3clEv(%arg0: !cc.ptr) -> i64 { +// CHECK-LABEL: func.func @_ZN2R4clEv(%arg0: !cc.ptr) -> i64 { +// CHECK-LABEL: func.func @_ZN2R5clEv(%arg0: !cc.ptr) -> i32 { +// CHECK-LABEL: func.func @_ZN2R6clEv(%arg0: !cc.ptr) -> i16 { +// CHECK-LABEL: func.func @_ZN2R7clEv(%arg0: !cc.ptr) -> i8 { +// CHECK-LABEL: func.func @_ZN2R8clEv(%arg0: !cc.ptr) -> i1 { + +//===----------------------------------------------------------------------===// +// structs that are less than 128 bits. +// arguments may be merged into 1 register or passed in pair of registers. +// results are returned in registers. 
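// For intuition, the same SysV x86_64 behavior can be observed with plain C++
// (a sketch, not part of the test):
//   struct P { int a; int b; };        // 64 bits: one GPR for both argument and result
//   struct Q { double a; double b; };  // 128 bits: two SSE registers
//   P id_p(P p) { return p; }          // compiles to a single 64-bit register move
//   Q id_q(Q q) { return q; }          // passed and returned in xmm0/xmm1
// Anything wider than 128 bits falls back to memory (byval / sret), as exercised
// further below.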
+ +struct G0 { + std::pair operator()(std::pair) __qpu__ { + return {}; + } +}; + +struct G1 { + std::pair operator()(std::pair) __qpu__ { + return {}; + } +}; + +struct G2 { + std::pair operator()(std::pair, + std::pair) __qpu__ { + return {}; + } +}; + +struct G3 { + std::pair operator()(std::pair) __qpu__ { + return {}; + } +}; + +struct BB { + bool _1; + bool _2; + bool _3; +}; + +BB glue0(); + +struct G4 { + std::pair operator()(BB) __qpu__ { return {}; } +}; + +struct II { + int _1; + int _2; + int _3; +}; + +II glue1(); + +struct G5 { + std::pair operator()(II) __qpu__ { return {}; } +}; + +struct CC { + char _1; + unsigned char _2; + signed char _3; +}; + +CC glue2(); + +struct G6 { + std::pair operator()(CC) __qpu__ { return {}; } +}; + +struct G7 { + BB operator()(BB, II, CC) __qpu__ { return glue0(); } +}; + +struct G8 { + II operator()(II, CC, BB) __qpu__ { return glue1(); } +}; + +struct G9 { + CC operator()(CC, BB, II) __qpu__ { return glue2(); } +}; + +// clang-format off +// CHECK-LABEL: func.func @_ZN2G0clESt4pairIddE( +// CHECK-SAME: %[[VAL_0:.*]]: !cc.ptr, %[[VAL_1:.*]]: f64, +// CHECK-SAME: %[[VAL_2:.*]]: f64) -> i16 +// CHECK-LABEL: func.func @_ZN2G1clESt4pairIffE( +// CHECK-SAME: %[[VAL_0:.*]]: !cc.ptr, %[[VAL_1:.*]]: vector<2xf32>) +// CHECK-SAME: -> i16 +// CHECK-LABEL: func.func @_ZN2G2clESt4pairIllES0_IidE( +// CHECK-SAME: %[[VAL_0:.*]]: !cc.ptr, +// CHECK-SAME: %[[VAL_1:.*]]: i64, %[[VAL_2:.*]]: i64, %[[VAL_3:.*]]: i32, +// CHECK-SAME: %[[VAL_4:.*]]: f64) -> i24 +// CHECK-LABEL: func.func @_ZN2G3clESt4pairIdbE( +// CHECK-SAME: %[[VAL_1:.*]]: !cc.ptr, %[[VAL_2:.*]]: f64, +// CHECK-SAME: %[[VAL_3:.*]]: i8) -> i32 +// CHECK-LABEL: func.func @_ZN2G4clE2BB( +// CHECK-SAME: %[[VAL_1:.*]]: !cc.ptr, %[[VAL_2:.*]]: i24) -> i64 +// CHECK-LABEL: func.func @_ZN2G5clE2II( +// CHECK-SAME: %[[VAL_1:.*]]: !cc.ptr, %[[VAL_2:.*]]: i64, %[[VAL_3:.*]]: i32) -> !cc.struct<{i64, f32}> +// CHECK-LABEL: func.func @_ZN2G6clE2CC( +// CHECK-SAME: %[[VAL_1:.*]]: !cc.ptr, %[[VAL_2:.*]]: i24) -> !cc.struct<{i64, i64}> +// CHECK-LABEL: func.func @_ZN2G7clE2BB2II2CC( +// CHECK-SAME: %[[VAL_1:.*]]: !cc.ptr, %[[VAL_2:.*]]: i24, +// CHECK-SAME: %[[VAL_3:.*]]: i64, %[[VAL_4:.*]]: i32, +// CHECK-SAME: %[[VAL_5:.*]]: i24) -> i24 +// CHECK-LABEL: func.func @_ZN2G8clE2II2CC2BB( +// CHECK-SAME: %[[VAL_1:.*]]: !cc.ptr, %[[VAL_2:.*]]: i64, +// CHECK-SAME: %[[VAL_3:.*]]: i32, %[[VAL_4:.*]]: i24, +// CHECK-SAME: %[[VAL_5:.*]]: i24) -> !cc.struct<{i64, i32}> +// CHECK-LABEL: func.func @_ZN2G9clE2CC2BB2II( +// CHECK-SAME: %[[VAL_0:.*]]: !cc.ptr, +// CHECK-SAME: %[[VAL_1:.*]]: i24, %[[VAL_2:.*]]: i24, +// CHECK-SAME: %[[VAL_3:.*]]: i64, %[[VAL_4:.*]]: i32) -> i24 +// clang-format on + +//===----------------------------------------------------------------------===// +// std::vector - these get converted to sret and byval ptrs on host side. 
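// Conceptually, a kernel declared as `std::vector<double> f()` is seen on the
// host side roughly as (a sketch of the convention, not generated code):
//   struct VecRaw { void *begin; void *end; void *cap; };  // the {ptr, ptr, ptr} triple
//   void f(VecRaw *ret /* llvm.sret */, void *thisPtr);
// which is what the llvm.sret / llvm.byval annotations in the checks below encode.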
+ +std::vector make_believe(); + +struct V0 { + std::vector operator()() __qpu__ { return make_believe(); } +}; + +std::vector make_coffee(); + +struct V1 { + std::vector operator()(std::vector) __qpu__ { + return make_coffee(); + } +}; + +std::vector> make_crazy(); + +struct V2 { + std::vector> operator()(std::vector, + std::vector) __qpu__ { + return make_crazy(); + } +}; + +struct V3 { + void operator()(std::vector, std::vector) __qpu__ {} +}; + +// clang-format off +// CHECK-LABEL: func.func @_ZN2V0clEv( +// CHECK-SAME: %[[VAL_0:.*]]: !cc.ptr, !cc.ptr, !cc.ptr}>> {llvm.sret = !cc.struct<{!cc.ptr, !cc.ptr, !cc.ptr}>}, +// CHECK-SAME: %[[VAL_1:.*]]: !cc.ptr) +// CHECK-LABEL: func.func @_ZN2V1clESt6vectorIdSaIdEE( +// CHECK-SAME: %[[VAL_0:.*]]: !cc.ptr, !cc.ptr, !cc.ptr}>> {llvm.sret = !cc.struct<{!cc.ptr, !cc.ptr, !cc.ptr}>}, +// CHECK-SAME: %[[VAL_1:.*]]: !cc.ptr, +// CHECK-SAME: %[[VAL_2:.*]]: !cc.ptr, !cc.ptr, !cc.ptr}>>) +// CHECK-LABEL: func.func @_ZN2V2clESt6vectorIfSaIfEES0_IsSaIsEE( +// CHECK-SAME: %[[VAL_0:.*]]: !cc.ptr>, !cc.ptr>, !cc.ptr>}>> {llvm.sret = !cc.struct<{!cc.ptr>, !cc.ptr>, !cc.ptr>}>}, +// CHECK-SAME: %[[VAL_1:.*]]: !cc.ptr, +// CHECK-SAME: %[[VAL_2:.*]]: !cc.ptr, !cc.ptr, !cc.ptr}>>, +// CHECK-SAME: %[[VAL_3:.*]]: !cc.ptr, !cc.ptr, !cc.ptr}>>) +// CHECK-LABEL: func.func @_ZN2V3clESt6vectorIlSaIlEES0_IbSaIbEE( +// CHECK-SAME: %[[VAL_0:.*]]: !cc.ptr, +// CHECK-SAME: %[[VAL_1:.*]]: !cc.ptr, !cc.ptr, !cc.ptr}>>, +// CHECK-SAME: %[[VAL_2:.*]]: !cc.ptr, !cc.ptr, !cc.ptr}>>) +// clang-format on + +//===----------------------------------------------------------------------===// +// structs that are more than 128 bits. These get converted to sret or byval +// ptrs on the host side. + +struct B0 { + void operator()(std::tuple) __qpu__ {} +}; + +struct BG { + float _1[4]; + int _2[5]; +}; + +BG make_sausage(); + +struct B1 { + BG operator()() __qpu__ { return make_sausage(); } +}; + +std::tuple make_interesting(); + +struct B2 { + std::tuple operator()(BG) __qpu__ { + return make_interesting(); + } +}; + +struct BA { + bool _1[64]; +}; + +struct B3 { + BA operator()(BA arg) __qpu__ { return arg; } +}; + +// clang-format off +// CHECK-LABEL: func.func @_ZN2B0clESt5tupleIJdicfsEE( +// CHECK-SAME: %[[VAL_0:.*]]: !cc.ptr, %[[VAL_1:.*]]: !cc.ptr>) { +// CHECK-LABEL: func.func @_ZN2B1clEv( +// CHECK-SAME: %[[VAL_0:.*]]: !cc.ptr, !cc.array} [288,4]>> {llvm.sret = !cc.struct<"BG" {!cc.array, !cc.array} [288,4]>}, +// CHECK-SAME: %[[VAL_1:.*]]: !cc.ptr) +// CHECK-LABEL: func.func @_ZN2B2clE2BG( +// CHECK-SAME: %[[VAL_0:.*]]: !cc.ptr> {llvm.sret = !cc.struct<{f64, f64, i16, f32, i8, i32}>}, +// CHECK-SAME: %[[VAL_1:.*]]: !cc.ptr, +// CHECK-SAME: %[[VAL_2:.*]]: !cc.ptr, !cc.array} [288,4]>> {llvm.byval = !cc.struct<"BG" {!cc.array, !cc.array} [288,4]>}) +// CHECK-LABEL: func.func @_ZN2B3clE2BA( +// CHECK-SAME: %[[VAL_0:.*]]: !cc.ptr} [512,1]>> {llvm.sret = !cc.struct<"BA" {!cc.array} [512,1]>}, +// CHECK-SAME: %[[VAL_1:.*]]: !cc.ptr, +// CHECK-SAME: %[[VAL_2:.*]]: !cc.ptr} [512,1]>> {llvm.byval = !cc.struct<"BA" {!cc.array} [512,1]>}) +// clang-format on diff --git a/test/AST-Quake/vector_int-1.cpp b/test/AST-Quake/vector_int-1.cpp index 3bdfae634f..a5a989f6bf 100644 --- a/test/AST-Quake/vector_int-1.cpp +++ b/test/AST-Quake/vector_int-1.cpp @@ -22,8 +22,7 @@ __qpu__ void touringLondon() { return; } -// CHECK-LABEL: func.func @__nvqpp__mlirgen__function_doubleDeckerBus._Z15doubleDeckerBusv( -// CHECK-SAME: %[[VAL_0:.*]]: !cc.ptr, i64}>> {llvm.sret = !cc.struct<{!cc.ptr, i64}>}) 
attributes {"cudaq-entrypoint", "cudaq-kernel", no_this} { +// CHECK-LABEL: func.func @__nvqpp__mlirgen__function_doubleDeckerBus._Z15doubleDeckerBusv() -> !cc.stdvec attributes {"cudaq-entrypoint", "cudaq-kernel", no_this} { // CHECK: %[[VAL_1:.*]] = arith.constant 2 : i64 // CHECK: %[[VAL_2:.*]] = arith.constant 4 : i64 // CHECK: %[[VAL_3:.*]] = arith.constant 2 : i32 @@ -33,29 +32,15 @@ __qpu__ void touringLondon() { // CHECK: %[[VAL_6:.*]] = cc.cast %[[VAL_4]] : (!cc.ptr>) -> !cc.ptr // CHECK: %[[VAL_7:.*]] = call @__nvqpp_vectorCopyCtor(%[[VAL_6]], %[[VAL_1]], %[[VAL_2]]) : (!cc.ptr, i64, i64) -> !cc.ptr // CHECK: %[[VAL_8:.*]] = cc.stdvec_init %[[VAL_7]], %[[VAL_1]] : (!cc.ptr, i64) -> !cc.stdvec -// CHECK: %[[VAL_9:.*]] = cc.cast %[[VAL_0]] : (!cc.ptr, i64}>>) -> !cc.ptr, i64}>> -// CHECK: %[[VAL_10:.*]] = cc.stdvec_data %[[VAL_8]] : (!cc.stdvec) -> !cc.ptr -// CHECK: %[[VAL_11:.*]] = cc.compute_ptr %[[VAL_9]][0] : (!cc.ptr, i64}>>) -> !cc.ptr> -// CHECK: %[[VAL_12:.*]] = cc.cast %[[VAL_11]] : (!cc.ptr>) -> !cc.ptr> -// CHECK: cc.store %[[VAL_10]], %[[VAL_12]] : !cc.ptr> -// CHECK: %[[VAL_13:.*]] = cc.stdvec_size %[[VAL_8]] : (!cc.stdvec) -> i64 -// CHECK: %[[VAL_14:.*]] = cc.compute_ptr %[[VAL_9]][1] : (!cc.ptr, i64}>>) -> !cc.ptr -// CHECK: cc.store %[[VAL_13]], %[[VAL_14]] : !cc.ptr -// CHECK: return +// CHECK: return %[[VAL_8]] : !cc.stdvec // CHECK: } // CHECK-LABEL: func.func @__nvqpp__mlirgen__function_touringLondon._Z13touringLondonv() attributes {"cudaq-entrypoint", "cudaq-kernel", no_this} { -// CHECK: %[[VAL_0:.*]] = cc.alloca !cc.struct<{!cc.ptr, i64}> -// CHECK: call @__nvqpp__mlirgen__function_doubleDeckerBus._Z15doubleDeckerBusv(%[[VAL_0]]) : (!cc.ptr, i64}>>) -> () -// CHECK: %[[VAL_10:.*]] = cc.compute_ptr %[[VAL_0]][0] : (!cc.ptr, i64}>>) -> !cc.ptr> -// CHECK: %[[VAL_1:.*]] = cc.load %[[VAL_10]] : !cc.ptr> -// CHECK: %[[VAL_2:.*]] = cc.compute_ptr %[[VAL_0]][1] : (!cc.ptr, i64}>>) -> !cc.ptr -// CHECK: %[[VAL_3:.*]] = cc.load %[[VAL_2]] : !cc.ptr -// CHECK: %[[VAL_4:.*]] = cc.stdvec_init %[[VAL_1]], %[[VAL_3]] : (!cc.ptr, i64) -> !cc.stdvec -// CHECK: %[[VAL_5:.*]] = cc.stdvec_data %[[VAL_4]] : (!cc.stdvec) -> !cc.ptr> +// CHECK: %[[VAL_0:.*]] = call @__nvqpp__mlirgen__function_doubleDeckerBus._Z15doubleDeckerBusv() : () -> !cc.stdvec +// CHECK: %[[VAL_5:.*]] = cc.stdvec_data %[[VAL_0]] : (!cc.stdvec) -> !cc.ptr> // CHECK: %[[VAL_6:.*]] = cc.cast %[[VAL_5]] : (!cc.ptr>) -> !cc.ptr // CHECK: %[[VAL_7:.*]] = cc.load %[[VAL_6]] : !cc.ptr // CHECK: %[[VAL_8:.*]] = cc.cast signed %[[VAL_7]] : (i32) -> i64 -// CHECK: %[[VAL_9:.*]] = quake.alloca !quake.veq{{\[}}%[[VAL_8]] : i64] +// CHECK: %[[VAL_9:.*]] = quake.alloca !quake.veq[%[[VAL_8]] : i64] // CHECK: return // CHECK: } diff --git a/test/Quake-QIR/return_values.qke b/test/Quake-QIR/return_values.qke index a4fbfa7477..085b9fec97 100644 --- a/test/Quake-QIR/return_values.qke +++ b/test/Quake-QIR/return_values.qke @@ -6,20 +6,22 @@ // the terms of the Apache License 2.0 which accompanies this distribution. // // ========================================================================== // -// RUN: cudaq-opt --add-dealloc --kernel-execution=codegen=1 --canonicalize %s | \ -// RUN: cudaq-translate --convert-to=qir | FileCheck %s +// RUN: cudaq-opt --add-dealloc --kernel-execution=codegen=1 --canonicalize %s | cudaq-translate --convert-to=qir | FileCheck %s // NB: the mangled name map is required for the kernel-execution pass. +// QIR codegen requires the target triple. 
module attributes{ quake.mangled_name_map = { __nvqpp__mlirgen__test_0 = "test_0", __nvqpp__mlirgen__test_1 = "test_1", __nvqpp__mlirgen__test_2 = "test_2", __nvqpp__mlirgen__test_3 = "test_3", __nvqpp__mlirgen__test_4 = "test_4", - __nvqpp__mlirgen__test_5 = "test_5" }} { + __nvqpp__mlirgen__test_5 = "test_5" }, + llvm.triple = "x86_64-unknown-linux-gnu"} { func.func private @__nvqpp_vectorCopyCtor(%arg0: !cc.ptr , %arg1: i64 , %arg2: i64 ) -> !cc.ptr +// vector -> struct ptr sret func.func @__nvqpp__mlirgen__test_0(%arg0: i32) -> !cc.stdvec { %c1_i64 = arith.constant 1 : i64 %c1 = arith.constant 1 : i64 @@ -56,8 +58,8 @@ func.func @test_0(%1: !cc.ptr, !cc.ptr, !cc.ptr} return } -// CHECK-LABEL: define void @__nvqpp__mlirgen__test_0({ i8*, i64 }* nocapture writeonly sret({ i8*, i64 }) -// CHECK-SAME: %[[VAL_0:.*]], i32 %[[VAL_1:.*]]) {{.*}}{ +// CHECK-LABEL: define { i1*, i64 } @__nvqpp__mlirgen__test_0( +// CHECK-SAME: i32 %[[VAL_1:.*]]) {{.*}}{ // CHECK: %[[VAL_2:.*]] = sext i32 %[[VAL_1]] to i64 // CHECK: %[[VAL_3:.*]] = tail call %[[VAL_4:.*]]* @__quantum__rt__qubit_allocate_array(i64 %[[VAL_2]]) // CHECK: %[[VAL_5:.*]] = tail call i64 @__quantum__rt__array_get_size_1d(%[[VAL_4]]* %[[VAL_3]]) @@ -95,12 +97,11 @@ func.func @test_0(%1: !cc.ptr, !cc.ptr, !cc.ptr} // CHECK: ._crit_edge5: ; preds = %[[VAL_21]], %[[VAL_8]], %[[VAL_19]] // CHECK: %[[VAL_34:.*]] = phi i8* [ %[[VAL_10]], %[[VAL_8]] ], [ %[[VAL_20]], %[[VAL_19]] ], [ %[[VAL_20]], %[[VAL_21]] ] // CHECK: %[[VAL_35:.*]] = call i8* @__nvqpp_vectorCopyCtor(i8* nonnull %[[VAL_34]], i64 %[[VAL_5]], i64 1) -// CHECK: call void @__quantum__rt__qubit_release_array(%[[VAL_4]]* %[[VAL_3]]) -// CHECK: %[[VAL_36:.*]] = getelementptr inbounds { i8*, i64 }, { i8*, i64 }* %[[VAL_0]], i64 0, i32 0 -// CHECK: store i8* %[[VAL_35]], i8** %[[VAL_36]], align 8 -// CHECK: %[[VAL_37:.*]] = getelementptr { i8*, i64 }, { i8*, i64 }* %[[VAL_0]], i64 0, i32 1 -// CHECK: store i64 %[[VAL_5]], i64* %[[VAL_37]], align 8 -// CHECK: ret void +// CHECK: %[[VAL_36:.*]] = bitcast i8* %[[VAL_35]] to i1* +// CHECK: %[[VAL_37:.*]] = insertvalue { i1*, i64 } undef, i1* %[[VAL_36]], 0 +// CHECK: %[[VAL_38:.*]] = insertvalue { i1*, i64 } %[[VAL_37]], i64 %[[VAL_5]], 1 +// CHECK: call void @__quantum__rt__qubit_release_array(%Array* %[[VAL_3]]) +// CHECK: ret { i1*, i64 } %[[VAL_38]] // CHECK: } // CHECK-LABEL: define void @test_0({ i8*, i8*, i8* }* sret({ i8*, i8*, i8* }) @@ -120,6 +121,7 @@ func.func @test_0(%1: !cc.ptr, !cc.ptr, !cc.ptr} // CHECK: ret void // CHECK: } +// struct{bool, bool} -> i16 func.func @__nvqpp__mlirgen__test_1() -> !cc.struct<{i1, i1}> { %qubits = quake.alloca !quake.veq<2> %q0 = quake.extract_ref %qubits[0] : (!quake.veq<2>) -> !quake.ref @@ -136,12 +138,12 @@ func.func @__nvqpp__mlirgen__test_1() -> !cc.struct<{i1, i1}> { return %rv2 : !cc.struct<{i1, i1}> } -func.func @test_1(%1: !cc.ptr> {llvm.sret = !cc.struct<{i1, i1}>}, %this: !cc.ptr) { - return +func.func @test_1(%this: !cc.ptr) -> i16 { + %0 = cc.undef i16 + return %0 : i16 } -// CHECK-LABEL: define void @__nvqpp__mlirgen__test_1({ i1, i1 }* nocapture writeonly sret({ i1, i1 }) -// CHECK-SAME: %[[VAL_0:.*]]) {{.*}}{ +// CHECK-LABEL: define { i1, i1 } @__nvqpp__mlirgen__test_1() // CHECK: %[[VAL_1:.*]] = tail call %[[VAL_2:.*]]* @__quantum__rt__qubit_allocate_array(i64 2) // CHECK: %[[VAL_3:.*]] = tail call i8* @__quantum__rt__array_get_element_ptr_1d(%[[VAL_2]]* %[[VAL_1]], i64 0) // CHECK: %[[VAL_4:.*]] = bitcast i8* %[[VAL_3]] to %[[VAL_5:.*]]** @@ -152,37 +154,27 @@ func.func 
@test_1(%1: !cc.ptr> {llvm.sret = !cc.struct<{i1, // CHECK: tail call void @__quantum__qis__h(%[[VAL_5]]* %[[VAL_6]]) // CHECK: tail call void (i64, void (%[[VAL_2]]*, %[[VAL_5]]*)*, ...) @invokeWithControlQubits(i64 1, void (%[[VAL_2]]*, %[[VAL_5]]*)* nonnull @__quantum__qis__x__ctl, %[[VAL_5]]* %[[VAL_6]], %[[VAL_5]]* %[[VAL_9]]) // CHECK: %[[VAL_10:.*]] = tail call %[[VAL_11:.*]]* @__quantum__qis__mz(%[[VAL_5]]* %[[VAL_6]]) -// CHECK: %[[VAL_12:.*]] = bitcast %[[VAL_11]]* %[[VAL_10]] to i1* +// CHECK: %[[VAL_12:.*]] = bitcast %Result* %[[VAL_10]] to i1* // CHECK: %[[VAL_13:.*]] = load i1, i1* %[[VAL_12]], align 1 // CHECK: %[[VAL_14:.*]] = tail call %[[VAL_11]]* @__quantum__qis__mz(%[[VAL_5]]* %[[VAL_9]]) -// CHECK: %[[VAL_15:.*]] = bitcast %[[VAL_11]]* %[[VAL_14]] to i1* +// CHECK: %[[VAL_15:.*]] = bitcast %Result* %[[VAL_14]] to i1* // CHECK: %[[VAL_16:.*]] = load i1, i1* %[[VAL_15]], align 1 -// CHECK: %[[VAL_17:.*]] = getelementptr inbounds { i1, i1 }, { i1, i1 }* %[[VAL_0]], i64 0, i32 0 -// CHECK: store i1 %[[VAL_13]], i1* %[[VAL_17]], align 1 -// CHECK: %[[VAL_18:.*]] = getelementptr inbounds { i1, i1 }, { i1, i1 }* %[[VAL_0]], i64 0, i32 1 -// CHECK: store i1 %[[VAL_16]], i1* %[[VAL_18]], align 1 +// CHECK: %[[VAL_20:.*]] = insertvalue { i1, i1 } undef, i1 %[[VAL_13]], 0 +// CHECK: %[[VAL_19:.*]] = insertvalue { i1, i1 } %[[VAL_20]], i1 %[[VAL_16]], 1 // CHECK: tail call void @__quantum__rt__qubit_release_array(%[[VAL_2]]* %[[VAL_1]]) -// CHECK: ret void +// CHECK: ret { i1, i1 } %[[VAL_19]] // CHECK: } -// CHECK-LABEL: define void @test_1({ i1, i1 }* nocapture writeonly sret({ i1, i1 }) -// CHECK-SAME: %[[VAL_0:.*]], i8* nocapture readnone %[[VAL_1:.*]]) {{.*}}{ -// CHECK: %[[VAL_2:.*]] = alloca [2 x i8], align 8 -// CHECK: %[[VAL_3:.*]] = getelementptr inbounds [2 x i8], [2 x i8]* %[[VAL_2]], i64 0, i64 0 +// CHECK-LABEL: define i16 @test_1(i8* nocapture readnone +// CHECK-SAME: %[[VAL_0:.*]]) {{.*}}{ +// CHECK-NEXT: %[[VAL_2:.*]] = alloca i16, align 8 +// CHECK: %[[VAL_3:.*]] = bitcast i16* %[[VAL_2]] to i8* // CHECK: call void @altLaunchKernel(i8* nonnull getelementptr inbounds ([7 x i8], [7 x i8]* @test_1.kernelName, i64 0, i64 0), i8* nonnull bitcast ({ i8*, i64 } (i8*, i1)* @test_1.thunk to i8*), i8* nonnull %[[VAL_3]], i64 2, i64 0) -// CHECK: %[[VAL_4:.*]] = bitcast [2 x i8]* %[[VAL_2]] to i1* -// CHECK: %[[VAL_5:.*]] = load i1, i1* %[[VAL_4]], align 8 -// CHECK: %[[VAL_6:.*]] = getelementptr inbounds [2 x i8], [2 x i8]* %[[VAL_2]], i64 0, i64 1 -// CHECK: %[[VAL_7:.*]] = bitcast i8* %[[VAL_6]] to i1* -// CHECK: %[[VAL_8:.*]] = load i1, i1* %[[VAL_7]], align 1 -// CHECK: %[[VAL_9:.*]] = getelementptr inbounds { i1, i1 }, { i1, i1 }* %[[VAL_0]], i64 0, i32 0 -// CHECK: store i1 %[[VAL_5]], i1* %[[VAL_9]], align 1 -// CHECK: %[[VAL_10:.*]] = getelementptr inbounds { i1, i1 }, { i1, i1 }* %[[VAL_0]], i64 0, i32 1 -// CHECK: store i1 %[[VAL_8]], i1* %[[VAL_10]], align 1 -// CHECK: ret void +// CHECK: %[[VAL_4:.*]] = load i16, i16* %[[VAL_2]], align 8 +// CHECK: ret i16 %[[VAL_4]] // CHECK: } - +// struct{i16, f32, f64, i64} -> sret ptr func.func @__nvqpp__mlirgen__test_2() -> !cc.struct<{i16, f32, f64, i64}> { %rv = cc.undef !cc.struct<{i16, f32, f64, i64}> %c1 = arith.constant 8 : i16 @@ -200,10 +192,8 @@ func.func @test_2(%1: !cc.ptr> {llvm.sret = !cc return } -// CHECK-LABEL: define void @__nvqpp__mlirgen__test_2({ i16, float, double, i64 }* nocapture writeonly sret({ i16, float, double, i64 }) -// CHECK-SAME: %[[VAL_0:.*]]) {{.*}}{ -// CHECK: store { i16, float, double, 
i64 } { i16 8, float 0x40159999A0000000, double 3.783000e+01, i64 1479 }, { i16, float, double, i64 }* %[[VAL_0]], align 8 -// CHECK: ret void +// CHECK-LABEL: define { i16, float, double, i64 } @__nvqpp__mlirgen__test_2() +// CHECK: ret { i16, float, double, i64 } { i16 8, float 0x40159999A0000000, double 3.783000e+01, i64 1479 } // CHECK: } // CHECK-LABEL: define void @test_2({ i16, float, double, i64 }* nocapture writeonly sret({ i16, float, double, i64 }) @@ -211,22 +201,12 @@ func.func @test_2(%1: !cc.ptr> {llvm.sret = !cc // CHECK: %[[VAL_2:.*]] = alloca { { i16, float, double, i64 } }, align 8 // CHECK: %[[VAL_3:.*]] = bitcast { { i16, float, double, i64 } }* %[[VAL_2]] to i8* // CHECK: call void @altLaunchKernel(i8* nonnull getelementptr inbounds ([7 x i8], [7 x i8]* @test_2.kernelName, i64 0, i64 0), i8* nonnull bitcast ({ i8*, i64 } (i8*, i1)* @test_2.thunk to i8*), i8* nonnull %[[VAL_3]], i64 24, i64 0) -// CHECK: %[[VAL_4:.*]] = getelementptr inbounds { { i16, float, double, i64 } }, { { i16, float, double, i64 } }* %[[VAL_2]], i64 0, i32 0, i32 0 -// CHECK: %[[VAL_5:.*]] = load i16, i16* %[[VAL_4]], align 8 -// CHECK: %[[VAL_6:.*]] = insertvalue { i16, float, double, i64 } poison, i16 %[[VAL_5]], 0 -// CHECK: %[[VAL_7:.*]] = getelementptr inbounds { { i16, float, double, i64 } }, { { i16, float, double, i64 } }* %[[VAL_2]], i64 0, i32 0, i32 1 -// CHECK: %[[VAL_8:.*]] = load float, float* %[[VAL_7]], align 4 -// CHECK: %[[VAL_9:.*]] = insertvalue { i16, float, double, i64 } %[[VAL_6]], float %[[VAL_8]], 1 -// CHECK: %[[VAL_10:.*]] = getelementptr inbounds { { i16, float, double, i64 } }, { { i16, float, double, i64 } }* %[[VAL_2]], i64 0, i32 0, i32 2 -// CHECK: %[[VAL_11:.*]] = load double, double* %[[VAL_10]], align 8 -// CHECK: %[[VAL_12:.*]] = insertvalue { i16, float, double, i64 } %[[VAL_9]], double %[[VAL_11]], 2 -// CHECK: %[[VAL_13:.*]] = getelementptr inbounds { { i16, float, double, i64 } }, { { i16, float, double, i64 } }* %[[VAL_2]], i64 0, i32 0, i32 3 -// CHECK: %[[VAL_14:.*]] = load i64, i64* %[[VAL_13]], align 8 -// CHECK: %[[VAL_15:.*]] = insertvalue { i16, float, double, i64 } %[[VAL_12]], i64 %[[VAL_14]], 3 -// CHECK: store { i16, float, double, i64 } %[[VAL_15]], { i16, float, double, i64 }* %[[VAL_0]], align 8 +// CHECK: %[[VAL_4:.*]] = bitcast { i16, float, double, i64 }* %[[VAL_0]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* noundef nonnull align 8 dereferenceable(24) %[[VAL_4]], i8* noundef nonnull align 8 dereferenceable(24) %[[VAL_3]], i64 24, i1 false) // CHECK: ret void // CHECK: } +// array -> sret ptr func.func @__nvqpp__mlirgen__test_3() -> !cc.array { %rv = cc.undef !cc.array %c1 = arith.constant 5 : i64 @@ -246,19 +226,8 @@ func.func @test_3(%1: !cc.ptr> {llvm.sret = !cc.array> {llvm.sret = !cc.array { i64, f64 } func.func @__nvqpp__mlirgen__test_4() -> (i64, f64) { %c1 = arith.constant 537892 : i64 %c2 = arith.constant 94.2134 : f64 return %c1, %c2 : i64, f64 } -func.func @test_4(%1: !cc.ptr> {llvm.sret = !cc.struct<{i64, f64}>}, %this: !cc.ptr) { +func.func @test_4(%sret: !cc.ptr> {llvm.sret = !cc.struct<{i64, f64}>}, %this: !cc.ptr) { return } -// CHECK-LABEL: define void @__nvqpp__mlirgen__test_4({ i64, double }* nocapture writeonly sret({ i64, double }) -// CHECK-SAME: %[[VAL_0:.*]]) {{.*}}{ -// CHECK: %[[VAL_1:.*]] = getelementptr inbounds { i64, double }, { i64, double }* %[[VAL_0]], i64 0, i32 0 -// CHECK: store i64 537892, i64* %[[VAL_1]], align 8 -// CHECK: %[[VAL_2:.*]] = getelementptr { i64, double }, { i64, double 
}* %[[VAL_0]], i64 0, i32 1
-// CHECK: store double 0x40578DA858793DD9, double* %[[VAL_2]], align 8
-// CHECK: ret void
+// CHECK-LABEL: define { i64, double } @__nvqpp__mlirgen__test_4() {{.*}}{
+// CHECK: ret { i64, double } { i64 537892, double 0x40578DA858793DD9 }
 // CHECK: }
 // CHECK-LABEL: define void @test_4({ i64, double }* nocapture writeonly sret({ i64, double })
@@ -313,14 +260,8 @@ func.func @test_4(%1: !cc.ptr<!cc.struct<{i64, f64}>> {llvm.sret = !cc.struct<{i
 // CHECK: %[[VAL_2:.*]] = alloca { i64, double }, align 8
 // CHECK: %[[VAL_3:.*]] = bitcast { i64, double }* %[[VAL_2]] to i8*
 // CHECK: call void @altLaunchKernel(i8* nonnull getelementptr inbounds ([7 x i8], [7 x i8]* @test_4.kernelName, i64 0, i64 0), i8* nonnull bitcast ({ i8*, i64 } (i8*, i1)* @test_4.thunk to i8*), i8* nonnull %[[VAL_3]], i64 16, i64 0)
-// CHECK: %[[VAL_4:.*]] = getelementptr inbounds { i64, double }, { i64, double }* %[[VAL_2]], i64 0, i32 0
-// CHECK: %[[VAL_5:.*]] = load i64, i64* %[[VAL_4]], align 8
-// CHECK: %[[VAL_6:.*]] = getelementptr inbounds { i64, double }, { i64, double }* %[[VAL_0]], i64 0, i32 0
-// CHECK: store i64 %[[VAL_5]], i64* %[[VAL_6]], align 8
-// CHECK: %[[VAL_7:.*]] = getelementptr inbounds { i64, double }, { i64, double }* %[[VAL_2]], i64 0, i32 1
-// CHECK: %[[VAL_8:.*]] = load double, double* %[[VAL_7]], align 8
-// CHECK: %[[VAL_9:.*]] = getelementptr { i64, double }, { i64, double }* %[[VAL_0]], i64 0, i32 1
-// CHECK: store double %[[VAL_8]], double* %[[VAL_9]], align 8
+// CHECK: %[[VAL_4:.*]] = bitcast { i64, double }* %[[VAL_0]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* noundef nonnull align 8 dereferenceable(16) %[[VAL_4]], i8* noundef nonnull align 8 dereferenceable(16) %[[VAL_3]], i64 16, i1 false)
 // CHECK: ret void
 // CHECK: }
@@ -330,17 +271,12 @@ func.func @__nvqpp__mlirgen__test_5() -> (i64, f64) attributes {no_this} {
 return %c1, %c2 : i64, f64
 }
-func.func @test_5(%0: !cc.ptr<!cc.struct<{i64, f64}>> {llvm.sret = !cc.struct<{i64, f64}>}) {
+func.func @test_5(%sret: !cc.ptr<!cc.struct<{i64, f64}>> {llvm.sret = !cc.struct<{i64, f64}>}) {
 return
 }
-// CHECK-LABEL: define void @__nvqpp__mlirgen__test_5({ i64, double }* nocapture writeonly sret({ i64, double })
-// CHECK-SAME: %[[VAL_0:.*]]) {{.*}}{
-// CHECK: %[[VAL_1:.*]] = getelementptr inbounds { i64, double }, { i64, double }* %[[VAL_0]], i64 0, i32 0
-// CHECK: store i64 537892, i64* %[[VAL_1]], align 8
-// CHECK: %[[VAL_2:.*]] = getelementptr { i64, double }, { i64, double }* %[[VAL_0]], i64 0, i32 1
-// CHECK: store double 0x40578DA858793DD9, double* %[[VAL_2]], align 8
-// CHECK: ret void
+// CHECK-LABEL: define { i64, double } @__nvqpp__mlirgen__test_5() {{.*}}{
+// CHECK: ret { i64, double } { i64 537892, double 0x40578DA858793DD9 }
 // CHECK: }
 // CHECK-LABEL: define void @test_5({ i64, double }* nocapture writeonly sret({ i64, double })
@@ -348,14 +284,8 @@ func.func @test_5(%0: !cc.ptr<!cc.struct<{i64, f64}>> {llvm.sret = !cc.struct<{i
 // CHECK: %[[VAL_1:.*]] = alloca { i64, double }, align 8
 // CHECK: %[[VAL_2:.*]] = bitcast { i64, double }* %[[VAL_1]] to i8*
 // CHECK: call void @altLaunchKernel(i8* nonnull getelementptr inbounds ([7 x i8], [7 x i8]* @test_5.kernelName, i64 0, i64 0), i8* nonnull bitcast ({ i8*, i64 } (i8*, i1)* @test_5.thunk to i8*), i8* nonnull %[[VAL_2]], i64 16, i64 0)
-// CHECK: %[[VAL_3:.*]] = getelementptr inbounds { i64, double }, { i64, double }* %[[VAL_1]], i64 0, i32 0
-// CHECK: %[[VAL_4:.*]] = load i64, i64* %[[VAL_3]], align 8
-// CHECK: %[[VAL_5:.*]] = getelementptr inbounds { i64, double }, { i64, double }* %[[VAL_0]], i64 0, i32 0
-// CHECK: store i64 %[[VAL_4]], i64* %[[VAL_5]], align 8
-// CHECK: %[[VAL_6:.*]] = getelementptr inbounds { i64, double }, { i64, double }* %[[VAL_1]], i64 0, i32 1
-// CHECK: %[[VAL_7:.*]] = load double, double* %[[VAL_6]], align 8
-// CHECK: %[[VAL_8:.*]] = getelementptr { i64, double }, { i64, double }* %[[VAL_0]], i64 0, i32 1
-// CHECK: store double %[[VAL_7]], double* %[[VAL_8]], align 8
+// CHECK: %[[VAL_3:.*]] = bitcast { i64, double }* %[[VAL_0]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* noundef nonnull align 8 dereferenceable(16) %[[VAL_3]], i8* noundef nonnull align 8 dereferenceable(16) %[[VAL_2]], i64 16, i1 false)
 // CHECK: ret void
 // CHECK: }
@@ -371,7 +301,6 @@ func.func @test_5(%0: !cc.ptr<!cc.struct<{i64, f64}>> {llvm.sret = !cc.struct<{i
 // CHECK-SAME: %[[VAL_0:.*]], i1 %[[VAL_1:.*]]) {
 // CHECK: %[[VAL_2:.*]] = bitcast i8* %[[VAL_0]] to i32*
 // CHECK: %[[VAL_3:.*]] = load i32, i32* %[[VAL_2]], align 4
-// CHECK: %[[VAL_4:.*]] = getelementptr i8, i8* %[[VAL_0]], i64 8
 // CHECK: %[[VAL_5:.*]] = sext i32 %[[VAL_3]] to i64
 // CHECK: %[[VAL_6:.*]] = tail call %[[VAL_7:.*]]* @__quantum__rt__qubit_allocate_array(i64 %[[VAL_5]])
 // CHECK: %[[VAL_8:.*]] = tail call i64 @__quantum__rt__array_get_size_1d(%[[VAL_7]]* %[[VAL_6]])
@@ -392,7 +321,7 @@ func.func @test_5(%0: !cc.ptr<!cc.struct<{i64, f64}>> {llvm.sret = !cc.struct<{i
 // CHECK: ._crit_edge: ; preds = %[[VAL_10]]
 // CHECK: %[[VAL_23:.*]] = alloca i8, i64 %[[VAL_8]], align 1
 // CHECK: br i1 %[[VAL_9]], label %[[VAL_24:.*]], label %[[VAL_14]]
-// CHECK: .lr.ph4: ; preds = %[[VAL_22]], %[[VAL_24]]
+// CHECK: [[VAL_24]]: ; preds = %[[VAL_22]], %[[VAL_24]]
 // CHECK: %[[VAL_25:.*]] = phi i64 [ %[[VAL_26:.*]], %[[VAL_24]] ], [ 0, %[[VAL_22]] ]
 // CHECK: %[[VAL_27:.*]] = tail call i8* @__quantum__rt__array_get_element_ptr_1d(%[[VAL_7]]* %[[VAL_6]], i64 %[[VAL_25]])
 // CHECK: %[[VAL_28:.*]] = bitcast i8* %[[VAL_27]] to %[[VAL_19]]**
@@ -406,20 +335,21 @@ func.func @test_5(%0: !cc.ptr<!cc.struct<{i64, f64}>> {llvm.sret = !cc.struct<{i
 // CHECK: %[[VAL_26]] = add nuw nsw i64 %[[VAL_25]], 1
 // CHECK: %[[VAL_36:.*]] = icmp eq i64 %[[VAL_26]], %[[VAL_8]]
 // CHECK: br i1 %[[VAL_36]], label %[[VAL_14]], label %[[VAL_24]]
-// CHECK: ._crit_edge5: ; preds = %[[VAL_24]], %[[VAL_11]], %[[VAL_22]]
+// CHECK: [[VAL_14]]: ; preds = %[[VAL_24]], %[[VAL_11]], %[[VAL_22]]
 // CHECK: %[[VAL_37:.*]] = phi i8* [ %[[VAL_13]], %[[VAL_11]] ], [ %[[VAL_23]], %[[VAL_22]] ], [ %[[VAL_23]], %[[VAL_24]] ]
 // CHECK: %[[VAL_38:.*]] = call i8* @__nvqpp_vectorCopyCtor(i8* nonnull %[[VAL_37]], i64 %[[VAL_8]], i64 1)
 // CHECK: call void @__quantum__rt__qubit_release_array(%[[VAL_7]]* %[[VAL_6]])
-// CHECK: %[[VAL_39:.*]] = bitcast i8* %[[VAL_4]] to i8**
-// CHECK: store i8* %[[VAL_38]], i8** %[[VAL_39]], align 8
-// CHECK: %[[VAL_40:.*]] = getelementptr i8, i8* %[[VAL_0]], i64 16
-// CHECK: %[[VAL_41:.*]] = bitcast i8* %[[VAL_40]] to i64*
-// CHECK: store i64 %[[VAL_8]], i64* %[[VAL_41]], align 4
+// CHECK: %[[VAL_50:.*]] = getelementptr i8, i8* %[[VAL_0]], i64 8
+// CHECK: %[[VAL_51:.*]] = bitcast i8* %[[VAL_50]] to i8**
+// CHECK: store i8* %[[VAL_38]], i8** %[[VAL_51]], align 8
+// CHECK: %[[VAL_52:.*]] = getelementptr i8, i8* %[[VAL_0]], i64 16
+// CHECK: %[[VAL_53:.*]] = bitcast i8* %[[VAL_52]] to i64*
+// CHECK: store i64 %[[VAL_8]], i64* %[[VAL_53]], align 8
 // CHECK: br i1 %[[VAL_1]], label %[[VAL_42:.*]], label %[[VAL_43:.*]]
-// CHECK: common.ret: ; preds = %[[VAL_14]], %[[VAL_42]]
+// CHECK: [[VAL_43]]: ; preds = %[[VAL_14]], %[[VAL_42]]
 // CHECK: %[[VAL_44:.*]] = phi { i8*, i64 } [ %[[VAL_45:.*]], %[[VAL_42]] ], [ zeroinitializer, %[[VAL_14]] ]
 // CHECK: ret { i8*, i64 } %[[VAL_44]]
-// CHECK: 32: ; preds = %[[VAL_14]]
+// CHECK: [[VAL_42]]: ; preds = %[[VAL_14]]
 // CHECK: %[[VAL_46:.*]] = add i64 %[[VAL_8]], 24
 // CHECK: %[[VAL_47:.*]] = call i8* @malloc(i64 %[[VAL_46]])
 // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* noundef nonnull align 1 dereferenceable(24) %[[VAL_47]], i8* noundef nonnull align 1 dereferenceable(24) %[[VAL_0]], i64 24, i1 false)
diff --git a/test/Quake/return_vector.qke b/test/Quake/return_vector.qke
index 23a718bcc5..a13d0b6abe 100644
--- a/test/Quake/return_vector.qke
+++ b/test/Quake/return_vector.qke
@@ -29,16 +29,12 @@ func.func @test_0(%0: !cc.ptr, !cc.ptr, !cc.ptr, i64}>> {llvm.sret = !cc.struct<{!cc.ptr, i64}>}, %[[VAL_1:.*]]: i32) {
+// CHECK-SAME: %[[VAL_1:.*]]: i32) -> !cc.stdvec {
 // CHECK-DAG: %[[VAL_2:.*]] = arith.constant 8 : i64
 // CHECK-DAG: %[[VAL_3:.*]] = arith.constant 256 : i64
 // CHECK: %[[VAL_4:.*]] = call @malloc(%[[VAL_3]]) : (i64) -> !cc.ptr
-// CHECK: %[[VAL_5:.*]] = cc.cast %[[VAL_4]] : (!cc.ptr) -> !cc.ptr
-// CHECK: %[[VAL_9:.*]] = cc.cast %[[VAL_0]] : (!cc.ptr, i64}>>) -> !cc.ptr>
-// CHECK: cc.store %[[VAL_5]], %[[VAL_9]] : !cc.ptr>
-// CHECK: %[[VAL_7:.*]] = cc.compute_ptr %[[VAL_0]][1] : (!cc.ptr, i64}>>) -> !cc.ptr
-// CHECK: cc.store %[[VAL_2]], %[[VAL_7]] : !cc.ptr
-// CHECK: return
+// CHECK: %[[VAL_5:.*]] = cc.stdvec_init %[[VAL_4]], %[[VAL_2]] : (!cc.ptr, i64) -> !cc.stdvec
+// CHECK: return %[[VAL_5]] : !cc.stdvec
 // CHECK: }
 // CHECK-LABEL: func.func @test_0(
@@ -93,15 +89,11 @@ func.func @test_1(%0: !cc.ptr, !cc.ptr, !cc.ptr, i64}>> {llvm.sret = !cc.struct<{!cc.ptr, i64}>}, %[[VAL_1:.*]]: i32) {
+// CHECK-SAME: %[[VAL_1:.*]]: i32) -> !cc.stdvec {
 // CHECK: %[[VAL_2:.*]] = arith.constant 9 : i64
 // CHECK: %[[VAL_3:.*]] = arith.constant 520 : i64
 // CHECK: %[[VAL_4:.*]] = call @malloc(%[[VAL_3]]) : (i64) -> !cc.ptr
-// CHECK: %[[VAL_5:.*]] = cc.cast %[[VAL_4]] : (!cc.ptr) -> !cc.ptr
-// CHECK: %[[VAL_8:.*]] = cc.cast %[[VAL_0]] : (!cc.ptr, i64}>>) -> !cc.ptr>
-// CHECK: cc.store %[[VAL_5]], %[[VAL_8]] : !cc.ptr>
-// CHECK: %[[VAL_7:.*]] = cc.compute_ptr %[[VAL_0]][1] : (!cc.ptr, i64}>>) -> !cc.ptr
-// CHECK: cc.store %[[VAL_2]], %[[VAL_7]] : !cc.ptr
+// CHECK: %[[VAL_5:.*]] = cc.stdvec_init %[[VAL_4]], %[[VAL_2]] : (!cc.ptr, i64) -> !cc.stdvec
 // CHECK: return
 // CHECK: }
@@ -151,13 +143,13 @@ func.func @test_1(%0: !cc.ptr, !cc.ptr, !cc.ptr) -> !cc.ptr, i64}>}>>
 // CHECK: %[[VAL_4:.*]] = cc.load %[[VAL_3]] : !cc.ptr, i64}>}>>
 // CHECK: %[[VAL_7:.*]] = cc.sizeof !cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}> : i64
-// CHECK: %[[VAL_8:.*]] = cc.compute_ptr %[[VAL_3]][1] : (!cc.ptr, i64}>}>>) -> !cc.ptr, i64}>>
-// CHECK: %[[VAL_9:.*]] = cc.cast %[[VAL_8]] : (!cc.ptr, i64}>>) -> !cc.ptr, i64}>>
 // CHECK: %[[VAL_10:.*]] = cc.extract_value %[[VAL_4]][0] : (!cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}>) -> i32
-// CHECK: call @__nvqpp__mlirgen__test_0(%[[VAL_9]], %[[VAL_10]]) : (!cc.ptr, i64}>>, i32) -> ()
+// CHECK: %[[VAL_15:.*]] = call @__nvqpp__mlirgen__test_0(%[[VAL_10]]) : (i32) -> !cc.stdvec
 // CHECK: cf.cond_br %[[VAL_1]], ^bb1, ^bb2
 // CHECK: ^bb1:
 // CHECK: %[[VAL_11:.*]] = cc.compute_ptr %[[VAL_3]][1] : (!cc.ptr, i64}>}>>) -> !cc.ptr, i64}>
+// CHECK: %[[VAL_16:.*]] = cc.cast %[[VAL_11]] : (!cc.ptr, i64}>>) -> !cc.ptr>
+// CHECK: cc.store %[[VAL_15]], %[[VAL_16]] : !cc.ptr>
 // CHECK: %[[VAL_12:.*]] = cc.cast %[[VAL_11]] : (!cc.ptr, i64}>>) -> !cc.ptr, i64}>>
 // CHECK: %[[VAL_13:.*]] = call @__nvqpp_createDynamicResult(%[[VAL_0]], %[[VAL_7]], %[[VAL_12]]) : (!cc.ptr, i64, !cc.ptr, i64}>>) -> !cc.struct<{!cc.ptr, i64}>
 // CHECK: return %[[VAL_13]] : !cc.struct<{!cc.ptr, i64}>
@@ -171,13 +163,13 @@ func.func @test_1(%0: !cc.ptr, !cc.ptr, !cc.ptr) -> !cc.ptr, i64}>}>>
 // CHECK: %[[VAL_4:.*]] = cc.load %[[VAL_3]] : !cc.ptr, i64}>}>>
 // CHECK: %[[VAL_7:.*]] = cc.sizeof !cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}> : i64
-// CHECK: %[[VAL_8:.*]] = cc.compute_ptr %[[VAL_3]][1] : (!cc.ptr, i64}>}>>) -> !cc.ptr, i64}>>
-// CHECK: %[[VAL_9:.*]] = cc.cast %[[VAL_8]] : (!cc.ptr, i64}>>) -> !cc.ptr, i64}>>
 // CHECK: %[[VAL_10:.*]] = cc.extract_value %[[VAL_4]][0] : (!cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}>) -> i32
-// CHECK: call @__nvqpp__mlirgen__test_1(%[[VAL_9]], %[[VAL_10]]) : (!cc.ptr, i64}>>, i32) -> ()
+// CHECK: %[[VAL_15:.*]] = call @__nvqpp__mlirgen__test_1(%[[VAL_10]]) : (i32) -> !cc.stdvec
 // CHECK: cf.cond_br %[[VAL_1]], ^bb1, ^bb2
 // CHECK: ^bb1:
 // CHECK: %[[VAL_11:.*]] = cc.compute_ptr %[[VAL_3]][1] : (!cc.ptr, i64}>}>>) -> !cc.ptr, i64}>>
+// CHECK: %[[VAL_16:.*]] = cc.cast %[[VAL_11]] : (!cc.ptr, i64}>>) -> !cc.ptr>
+// CHECK: cc.store %[[VAL_15]], %[[VAL_16]] : !cc.ptr>
 // CHECK: %[[VAL_12:.*]] = cc.cast %[[VAL_11]] : (!cc.ptr, i64}>>) -> !cc.ptr, i64}>>
 // CHECK: %[[VAL_13:.*]] = call @__nvqpp_createDynamicResult(%[[VAL_0]], %[[VAL_7]], %[[VAL_12]]) : (!cc.ptr, i64, !cc.ptr, i64}>>) -> !cc.struct<{!cc.ptr, i64}>
 // CHECK: return %[[VAL_13]] : !cc.struct<{!cc.ptr, i64}>