[core, runtime] Modify the launchers to support returning results. #2277

Merged · 3 commits · Oct 18, 2024
4 changes: 4 additions & 0 deletions docs/sphinx/api/languages/cpp_api.rst
@@ -190,6 +190,10 @@ Platform

.. doxygentypedef:: cudaq::KernelExecutionTask

.. doxygenstruct:: cudaq::KernelThunkResultType

.. doxygentypedef:: cudaq::KernelThunkType

Utilities
=========

14 changes: 11 additions & 3 deletions lib/Optimizer/Builder/Intrinsics.cpp
@@ -232,9 +232,14 @@ static constexpr IntrinsicCode intrinsicTable[] = {
})#"},

{"__nvqpp_createDynamicResult",
/* arguments:
arg0: original buffer ptr
arg1: original buffer size
arg2: ptr to span of the return data: {ptr, bytes}
arg3: offset to result slot in buffer */
{cudaq::llvmMemCopyIntrinsic, "malloc"},
R"#(
func.func private @__nvqpp_createDynamicResult(%arg0: !cc.ptr<i8>, %arg1: i64, %arg2: !cc.ptr<!cc.struct<{!cc.ptr<i8>, i64}>>) -> !cc.struct<{!cc.ptr<i8>, i64}> {
func.func private @__nvqpp_createDynamicResult(%arg0: !cc.ptr<i8>, %arg1: i64, %arg2: !cc.ptr<!cc.struct<{!cc.ptr<i8>, i64}>>, %arg3: i64) -> !cc.struct<{!cc.ptr<i8>, i64}> {
%0 = cc.compute_ptr %arg2[1] : (!cc.ptr<!cc.struct<{!cc.ptr<i8>, i64}>>) -> !cc.ptr<i64>
%1 = cc.load %0 : !cc.ptr<i64>
%2 = arith.addi %arg1, %1 : i64
@@ -249,6 +254,9 @@ static constexpr IntrinsicCode intrinsicTable[] = {
%7 = cc.undef !cc.struct<{!cc.ptr<i8>, i64}>
%8 = cc.insert_value %3, %7[0] : (!cc.struct<{!cc.ptr<i8>, i64}>, !cc.ptr<i8>) -> !cc.struct<{!cc.ptr<i8>, i64}>
%9 = cc.insert_value %2, %8[1] : (!cc.struct<{!cc.ptr<i8>, i64}>, i64) -> !cc.struct<{!cc.ptr<i8>, i64}>
%11 = cc.compute_ptr %10[%arg3] : (!cc.ptr<!cc.array<i8 x ?>>, i64) -> !cc.ptr<i8>
%12 = cc.cast %11 : (!cc.ptr<i8>) -> !cc.ptr<!cc.ptr<i8>>
cc.store %6, %12 : !cc.ptr<!cc.ptr<i8>>
return %9 : !cc.struct<{!cc.ptr<i8>, i64}>
})#"},

@@ -319,7 +327,7 @@ static constexpr IntrinsicCode intrinsicTable[] = {
{cudaq::runtime::launchKernelFuncName,
{},
R"#(
func.func private @altLaunchKernel(!cc.ptr<i8>, !cc.ptr<i8>, !cc.ptr<i8>, i64, i64) -> ())#"},
func.func private @altLaunchKernel(!cc.ptr<i8>, !cc.ptr<i8>, !cc.ptr<i8>, i64, i64) -> !cc.struct<{!cc.ptr<i8>, i64}>)#"},

{cudaq::runtime::CudaqRegisterArgsCreator,
{},
@@ -346,7 +354,7 @@ static constexpr IntrinsicCode intrinsicTable[] = {
{cudaq::runtime::launchKernelHybridFuncName,
{},
R"#(
func.func private @hybridLaunchKernel(!cc.ptr<i8>, !cc.ptr<i8>, !cc.ptr<i8>, i64, i64, !cc.ptr<i8>) -> ())#"},
func.func private @hybridLaunchKernel(!cc.ptr<i8>, !cc.ptr<i8>, !cc.ptr<i8>, i64, i64, !cc.ptr<i8>) -> !cc.struct<{!cc.ptr<i8>, i64}>)#"},

{cudaq::llvmMemCopyIntrinsic, // llvm.memcpy.p0i8.p0i8.i64
{},
146 changes: 106 additions & 40 deletions lib/Optimizer/Transforms/GenKernelExecution.cpp
@@ -48,6 +48,12 @@ static bool isCodegenArgumentGather(std::size_t kind) {
return kind == 0 || kind == 2;
}

/// This pass adds a `<kernel name>.thunk` function and a rewritten C++ host
/// side (mangled) stub to the code for every entry-point kernel in the module.
/// It may also generate a `<kernel name>.argsCreator` function. Finally, it
/// creates registration hooks for the CUDA-Q runtime to be able to find the
/// kernel by name and, as appropriate, the `<kernel name>.argsCreator`
/// function.
namespace {
class GenerateKernelExecution
: public cudaq::opt::impl::GenerateKernelExecutionBase<
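To make the pass description above concrete, here is a pseudo-C++ sketch of the `<kernel name>.thunk` the pass conceptually emits for a hypothetical entry-point kernel `ghz(int) -> double`. The buffer layout and all names other than the `.thunk` suffix are illustrative assumptions; the pass itself emits MLIR, not C++.

// Pseudo-C++ sketch of a generated thunk; illustrative only.
#include <cstdint>

struct ThunkResult {     // the {ptr, i64} span every thunk returns
  void *data;
  std::uint64_t size;
};

// Pointer-free argument/result buffer for a hypothetical ghz(int) -> double.
struct GhzBuffer {
  int arg0;
  double result;
};

double ghz_kernel(int n); // the device-side kernel entry (assumed)

// `<kernel name>.thunk`: uniform entry point the runtime can call by name.
extern "C" ThunkResult ghz_thunk(void *rawBuffer, bool isClientServer) {
  (void)isClientServer; // true when launching into a remote process
  auto *buf = reinterpret_cast<GhzBuffer *>(rawBuffer);
  buf->result = ghz_kernel(buf->arg0); // static result stays in the buffer
  // A dynamically sized result (e.g. a std::vector) would instead be packed
  // with __nvqpp_createDynamicResult and returned as a non-null span.
  return {nullptr, 0};
}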
@@ -57,6 +63,19 @@ class GenerateKernelExecution

/// Creates the function signature for a thunk function. The signature is
/// always the same for all thunk functions.
///
/// Every thunk function has an identical signature, making it callable from a
/// generic "kernel launcher" in the CUDA-Q runtime.
///
/// This signature is defined as: `(ptr, bool) -> {ptr, i64}`.
///
/// The first argument is a pointer to a data buffer that encodes all the
/// argument (and static return) values passed to (and from) the kernel in the
/// pointer-free encoding. The second argument indicates if this call is to a
/// remote process (if true). The result is a pointer and size (span) if the
/// kernel returns a dynamically sized result, otherwise it will be
/// `{nullptr, 0}`. It is the responsibility of calling code to free any
/// dynamic result buffer(s) and convert those to `std::vector` objects.
FunctionType getThunkType(MLIRContext *ctx) {
auto ptrTy = cudaq::cc::PointerType::get(IntegerType::get(ctx, 8));
return FunctionType::get(ctx, {ptrTy, IntegerType::get(ctx, 1)},
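This uniform signature is what lets a generic launcher in the runtime invoke any kernel's thunk without knowing its C++ signature; the docs change above exposes the corresponding cudaq::KernelThunkResultType and cudaq::KernelThunkType. A minimal sketch of the idea, with `lookupThunk` as a hypothetical registry helper rather than an actual CUDA-Q API:

// Sketch of a generic launch through the uniform thunk signature.
#include <cstdint>

struct ThunkResult {     // the {ptr, i64} span described above
  void *data;
  std::uint64_t size;
};
using KernelThunk = ThunkResult (*)(void *, bool);

KernelThunk lookupThunk(const char *kernelName); // hypothetical registry

ThunkResult launchByName(const char *kernelName, void *argBuffer,
                         bool isRemote) {
  KernelThunk thunk = lookupThunk(kernelName);
  ThunkResult res = thunk(argBuffer, isRemote);
  // res.data == nullptr: any statically sized result is already in argBuffer.
  // res.data != nullptr: the caller owns the buffer, must read the dynamic
  // result out of it (e.g. into a std::vector), and must free it.
  return res;
}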
@@ -769,31 +788,32 @@ class GenerateKernelExecution
auto *thenBlock = builder.createBlock(reg);
auto *elseBlock = builder.createBlock(reg);
builder.setInsertionPointToEnd(currentBlock);
auto eleTy = structTy.getMember(offset);
auto memTy = cudaq::cc::PointerType::get(eleTy);
auto mem = builder.create<cudaq::cc::ComputePtrOp>(
loc, memTy, castOp, SmallVector<cudaq::cc::ComputePtrArg>{offset});
auto resPtrTy = cudaq::cc::PointerType::get(call.getResult(0).getType());
auto castMem = builder.create<cudaq::cc::CastOp>(loc, resPtrTy, mem);
builder.create<cudaq::cc::StoreOp>(loc, call.getResult(0), castMem);
builder.create<cf::CondBranchOp>(loc, isClientServer, thenBlock,
elseBlock);
builder.setInsertionPointToEnd(thenBlock);
auto gepRes = builder.create<cudaq::cc::ComputePtrOp>(
loc, cudaq::cc::PointerType::get(structTy.getMember(offset)), castOp,
ArrayRef<cudaq::cc::ComputePtrArg>{offset});
auto resAsVec = builder.create<cudaq::cc::CastOp>(
loc, cudaq::cc::PointerType::get(funcTy.getResult(0)), gepRes);
builder.create<cudaq::cc::StoreOp>(loc, call.getResult(0), resAsVec);
auto resAsArg = builder.create<cudaq::cc::CastOp>(
loc, cudaq::cc::PointerType::get(thunkTy.getResults()[0]), gepRes);
// createDynamicResult packs the input values and the dynamic results
// into a single buffer to pass back as a message.
loc, cudaq::cc::PointerType::get(thunkTy.getResults()[0]), mem);
auto retOffset = genComputeReturnOffset(loc, builder, funcTy, structTy);
// createDynamicResult allocates a new buffer and packs the input values
// and the dynamic results into this single new buffer to pass back as a
// message.
// NB: This code only handles one-dimensional vectors of static types. It
// will have to be changed if there is a need to return recursively
// dynamic structures, i.e., vectors of vectors.
auto res = builder.create<func::CallOp>(
loc, thunkTy.getResults()[0], "__nvqpp_createDynamicResult",
ValueRange{thunkEntry->getArgument(0), structSize, resAsArg});
ValueRange{thunkEntry->getArgument(0), structSize, resAsArg,
retOffset});
builder.create<func::ReturnOp>(loc, res.getResult(0));
builder.setInsertionPointToEnd(elseBlock);
auto eleTy = structTy.getMember(offset);
auto memTy = cudaq::cc::PointerType::get(eleTy);
auto mem = builder.create<cudaq::cc::ComputePtrOp>(
loc, memTy, castOp, SmallVector<cudaq::cc::ComputePtrArg>{offset});
auto resPtrTy = cudaq::cc::PointerType::get(call.getResult(0).getType());
auto castMem = builder.create<cudaq::cc::CastOp>(loc, resPtrTy, mem);
builder.create<cudaq::cc::StoreOp>(loc, call.getResult(0), castMem);
// For the else case, the span was already copied to the block.
} else {
// FIXME: Should check for recursive vector case.
// If the kernel returns non-dynamic results (no spans), then take those
@@ -854,8 +874,6 @@ class GenerateKernelExecution
auto ptrPtrTy = cudaq::cc::PointerType::get(ptrTy);
auto sret0 = builder.create<cudaq::cc::ComputePtrOp>(
loc, ptrPtrTy, castSret, SmallVector<cudaq::cc::ComputePtrArg>{0});
Value vecPtr = builder.create<cudaq::cc::LoadOp>(loc, ptrTy, sret0);
builder.create<func::CallOp>(loc, std::nullopt, "free", ValueRange{vecPtr});
auto arrI8Ty = cudaq::cc::ArrayType::get(i8Ty);
auto ptrArrTy = cudaq::cc::PointerType::get(arrI8Ty);
auto buffPtr0 = builder.create<cudaq::cc::CastOp>(loc, ptrTy, data);
@@ -1338,21 +1356,72 @@ class GenerateKernelExecution
auto castLoadKernName =
builder.create<cudaq::cc::CastOp>(loc, ptrI8Ty, loadKernName);

auto hostFuncTy = hostFunc.getFunctionType();
assert((hostFuncTy.getResults().empty() ||
(hostFuncTy.getNumResults() == 1)) &&
"C++ function expected to have 0 or 1 return value");
const bool resultVal = !hostFuncTy.getResults().empty();
const bool kernelReturnsValue =
resultVal || cudaq::opt::factory::hasSRet(hostFunc);
Value launchResult;
Value launchResultToFree;
auto decodeLaunchResults = [&](Value spanReturned) {
if (!kernelReturnsValue)
return;
Type res0Ty = structTy.getMember(offset);
auto ptrResTy = cudaq::cc::PointerType::get(res0Ty);
auto rptr = builder.create<cudaq::cc::ExtractValueOp>(loc, ptrI8Ty,
spanReturned, 0);
launchResultToFree = rptr;
auto rIntPtr = builder.create<cudaq::cc::CastOp>(loc, i64Ty, rptr);
auto zero = builder.create<arith::ConstantIntOp>(loc, 0, 64);
auto cmp = builder.create<arith::CmpIOp>(loc, arith::CmpIPredicate::ne,
rIntPtr, zero);
auto *currentBlock = builder.getBlock();
auto *reg = currentBlock->getParent();
auto *thenBlock = builder.createBlock(reg);
auto *elseBlock = builder.createBlock(reg);
auto *endifBlock = builder.createBlock(
reg, reg->end(), TypeRange{ptrResTy}, SmallVector<Location>(1, loc));
builder.setInsertionPointToEnd(currentBlock);
builder.create<cf::CondBranchOp>(loc, cmp, thenBlock, elseBlock);
builder.setInsertionPointToEnd(thenBlock);
// dynamic result was returned.
// We need to free() this buffer before the end of this function.
auto rStructPtr =
builder.create<cudaq::cc::CastOp>(loc, structPtrTy, rptr);
Value lRes = builder.create<cudaq::cc::ComputePtrOp>(
loc, ptrResTy, rStructPtr,
ArrayRef<cudaq::cc::ComputePtrArg>{offset});
builder.create<cf::BranchOp>(loc, endifBlock, ArrayRef<Value>{lRes});
builder.setInsertionPointToEnd(elseBlock);
// span was returned in the original buffer.
Value mRes = builder.create<cudaq::cc::ComputePtrOp>(
loc, ptrResTy, temp, ArrayRef<cudaq::cc::ComputePtrArg>{0, offset});
builder.create<cf::BranchOp>(loc, endifBlock, ArrayRef<Value>{mRes});
builder.setInsertionPointToEnd(endifBlock);
launchResult = endifBlock->getArgument(0);
};

// Generate the call to `launchKernel`.
switch (codegenKind) {
case 0: {
assert(vecArgPtrs && castLoadThunk);
builder.create<func::CallOp>(
loc, std::nullopt, cudaq::runtime::launchKernelHybridFuncName,
auto launch = builder.create<func::CallOp>(
loc, cudaq::opt::factory::getDynamicBufferType(ctx),
cudaq::runtime::launchKernelHybridFuncName,
ArrayRef<Value>{castLoadKernName, castLoadThunk, castTemp,
extendedStructSize, resultOffset, vecArgPtrs});
decodeLaunchResults(launch.getResult(0));
} break;
case 1: {
assert(!vecArgPtrs && castLoadThunk);
builder.create<func::CallOp>(
loc, std::nullopt, cudaq::runtime::launchKernelFuncName,
auto launch = builder.create<func::CallOp>(
loc, cudaq::opt::factory::getDynamicBufferType(ctx),
cudaq::runtime::launchKernelFuncName,
ArrayRef<Value>{castLoadKernName, castLoadThunk, castTemp,
extendedStructSize, resultOffset});
decodeLaunchResults(launch.getResult(0));
} break;
case 2: {
assert(vecArgPtrs && !castLoadThunk);
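At runtime, the code built by `decodeLaunchResults` above behaves roughly like this sketch; the names and the treatment of `offset` (the byte offset of the kernel's return slot in the argument buffer) are illustrative assumptions.

// Illustrative sketch of the generated decode logic, not literal output.
#include <cstdint>

struct LaunchSpan {      // the {ptr, i64} value returned by the launcher
  void *data;
  std::uint64_t size;
};

// Returns a pointer to the kernel's result slot after the launch call.
void *decodeLaunchResult(LaunchSpan launched, char *argBuffer,
                         std::uint64_t offset, void **toFree) {
  // The span's data pointer is freed at the end of the stub either way;
  // free(nullptr) is defined to be a no-op.
  *toFree = launched.data;
  if (launched.data != nullptr) {
    // Dynamic result: the launcher allocated a fresh message buffer and the
    // result slot lives inside it at `offset`.
    return static_cast<char *>(launched.data) + offset;
  }
  // Otherwise the result was written in place into the original buffer.
  return argBuffer + offset;
}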
@@ -1377,17 +1446,13 @@ class GenerateKernelExecution
// result value(s) from the struct returned by `launchKernel` and return
// them to our caller.
SmallVector<Value> results;
auto hostFuncTy = hostFunc.getFunctionType();
assert((hostFuncTy.getResults().empty() ||
(hostFuncTy.getNumResults() == 1)) &&
"C++ function expected to have 0 or 1 return value");
const bool resultVal = !hostFuncTy.getResults().empty();
if (resultVal || cudaq::opt::factory::hasSRet(hostFunc)) {
if (kernelReturnsValue) {
Type res0Ty = structTy.getMember(offset);
auto ptrResTy = cudaq::cc::PointerType::get(res0Ty);
// Host function returns a value. Either returning by value or via an sret
// reference.
if (resultVal) {
Type res0Ty = structTy.getMember(offset);
auto ptrResTy = cudaq::cc::PointerType::get(res0Ty);
// Static values. std::vector are necessarily sret, see below.
auto resPtr = builder.create<cudaq::cc::ComputePtrOp>(
loc, ptrResTy, temp, ArrayRef<cudaq::cc::ComputePtrArg>{0, offset});
Type castToTy = cudaq::cc::PointerType::get(hostFuncTy.getResult(0));
@@ -1398,22 +1463,22 @@ class GenerateKernelExecution
}();
results.push_back(builder.create<cudaq::cc::LoadOp>(loc, castResPtr));
} else {
// Check if device is returning a span. If it is, then we will need to
// convert it to a std::vector here. The vector is constructed in-place
// on the sret memory block.
// This is an sret return. Check if the device is returning a span. If it
// is, then we will need to convert it to a std::vector here. The vector
// is constructed in-place on the sret memory block.
Value arg0 = hostFuncEntryBlock->getArguments().front();
if (auto spanTy =
dyn_cast<cudaq::cc::SpanLikeType>(devFuncTy.getResult(0))) {
auto eleTy = spanTy.getElementType();
auto ptrTy = cudaq::cc::PointerType::get(eleTy);
auto gep0 = builder.create<cudaq::cc::ComputePtrOp>(
loc, cudaq::cc::PointerType::get(ptrTy), temp,
SmallVector<cudaq::cc::ComputePtrArg>{0, offset, 0});
loc, cudaq::cc::PointerType::get(ptrTy), launchResult,
SmallVector<cudaq::cc::ComputePtrArg>{0});
auto dataPtr = builder.create<cudaq::cc::LoadOp>(loc, gep0);
auto lenPtrTy = cudaq::cc::PointerType::get(i64Ty);
auto gep1 = builder.create<cudaq::cc::ComputePtrOp>(
loc, lenPtrTy, temp,
SmallVector<cudaq::cc::ComputePtrArg>{0, offset, 1});
loc, lenPtrTy, launchResult,
SmallVector<cudaq::cc::ComputePtrArg>{1});
auto vecLen = builder.create<cudaq::cc::LoadOp>(loc, gep1);
if (spanTy.getElementType() == builder.getI1Type()) {
genStdvecBoolFromInitList(loc, builder, arg0, dataPtr, vecLen);
@@ -1422,13 +1487,14 @@ class GenerateKernelExecution
builder.create<cudaq::cc::SizeOfOp>(loc, i64Ty, eleTy);
genStdvecTFromInitList(loc, builder, arg0, dataPtr, tSize, vecLen);
}
// free(nullptr) is defined to be a nop in the standard.
builder.create<func::CallOp>(loc, std::nullopt, "free",
ArrayRef<Value>{launchResultToFree});
} else {
// Otherwise, we can just copy the aggregate into the sret memory
// block. Uses the size of the host function's sret pointer element
// type for the memcpy, so the device should return an (aggregate)
// value of suitable size.
Type res0Ty = structTy.getMember(offset);
auto ptrResTy = cudaq::cc::PointerType::get(res0Ty);
auto resPtr = builder.create<cudaq::cc::ComputePtrOp>(
loc, ptrResTy, temp,
ArrayRef<cudaq::cc::ComputePtrArg>{0, offset});
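For the sret path, the generated epilogue then turns the decoded span into a `std::vector` constructed in place on the caller's sret storage and releases any launcher-allocated buffer. A hedged sketch, assuming the span's second field is an element count:

// Illustrative sketch of the sret epilogue for a std::vector<T> result.
#include <cstdint>
#include <cstdlib>
#include <new>
#include <vector>

struct ResultSpan {      // the decoded {ptr, i64} result slot
  void *data;
  std::uint64_t length;  // treated as an element count in this sketch
};

template <typename T>
void finishVectorReturn(void *sret, const ResultSpan *resultSlot,
                        void *dynamicBufferToFree) {
  auto *elements = static_cast<const T *>(resultSlot->data);
  // Construct the std::vector in place on the caller-provided sret storage.
  new (sret) std::vector<T>(elements, elements + resultSlot->length);
  // Release the launcher-allocated buffer, if any; free(nullptr) is a no-op.
  std::free(dynamicBufferToFree);
}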