Skip to content

Commit

Permalink
[core, runtime] Modify the launchers to support returning results. (#…
Browse files Browse the repository at this point in the history
…2277)

* Modify the launchers to support returning results.

At some point, the launchers stopped properly supporting dynamic
kernel results. This PR adds that functionality back into the
mix. This support is added pervasively across the runtime
library calls.

Some notes:

  - Return values of static size continue to be supported as
    they were before. Specifically, these values are stored
    into the data buffer by the thunk adaptor so they can be
    returned, ultimately, to the original caller.

  - Return values of dynamic size follow exactly 1 of 2
    possible calling conventions. The convention must be
    selected by the runtime layers.

    1. Everything is running within a single process; i.e.,
       this is a simulation. In this case, the kernel will
       create a span of data and that span will be returned
       to the original caller which will use it to construct
       the std::vector result object.

    2. There are multiple processes and/or memory spaces
       involved. The result span will be appended to the
       original data packet and the new data packet will be
       returned as a new span by the runtime. The calling
       code will follow a similar process, but the data will
       be passed to the runtime in a pointer-free encoding.

Make the cast more robust to sneak it past -Werror.

Update another launchKernel override.

Add some doxygen goop to try to evade CI issues.

Fix the python builder errors.

Signed-off-by: Eric Schweitz <[email protected]>

* Sachin's fix.

Signed-off-by: Eric Schweitz <[email protected]>

* Fix build.

Signed-off-by: Eric Schweitz <[email protected]>

---------

Signed-off-by: Eric Schweitz <[email protected]>
  • Loading branch information
schweitzpgi authored Oct 18, 2024
1 parent 1a05616 commit 4fe3c4b
Show file tree
Hide file tree
Showing 25 changed files with 627 additions and 349 deletions.
4 changes: 4 additions & 0 deletions docs/sphinx/api/languages/cpp_api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -190,6 +190,10 @@ Platform

.. doxygentypedef:: cudaq::KernelExecutionTask

.. doxygenstruct:: cudaq::KernelThunkResultType

.. doxygentypedef:: cudaq::KernelThunkType

Utilities
=========

Expand Down
14 changes: 11 additions & 3 deletions lib/Optimizer/Builder/Intrinsics.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -232,9 +232,14 @@ static constexpr IntrinsicCode intrinsicTable[] = {
})#"},

{"__nvqpp_createDynamicResult",
/* arguments:
arg0: original buffer ptr
arg1: original buffer size
arg2: ptr to span of the return data: {ptr, bytes}
arg3: offset to result slot in buffer */
{cudaq::llvmMemCopyIntrinsic, "malloc"},
R"#(
func.func private @__nvqpp_createDynamicResult(%arg0: !cc.ptr<i8>, %arg1: i64, %arg2: !cc.ptr<!cc.struct<{!cc.ptr<i8>, i64}>>) -> !cc.struct<{!cc.ptr<i8>, i64}> {
func.func private @__nvqpp_createDynamicResult(%arg0: !cc.ptr<i8>, %arg1: i64, %arg2: !cc.ptr<!cc.struct<{!cc.ptr<i8>, i64}>>, %arg3: i64) -> !cc.struct<{!cc.ptr<i8>, i64}> {
%0 = cc.compute_ptr %arg2[1] : (!cc.ptr<!cc.struct<{!cc.ptr<i8>, i64}>>) -> !cc.ptr<i64>
%1 = cc.load %0 : !cc.ptr<i64>
%2 = arith.addi %arg1, %1 : i64
Expand All @@ -249,6 +254,9 @@ static constexpr IntrinsicCode intrinsicTable[] = {
%7 = cc.undef !cc.struct<{!cc.ptr<i8>, i64}>
%8 = cc.insert_value %3, %7[0] : (!cc.struct<{!cc.ptr<i8>, i64}>, !cc.ptr<i8>) -> !cc.struct<{!cc.ptr<i8>, i64}>
%9 = cc.insert_value %2, %8[1] : (!cc.struct<{!cc.ptr<i8>, i64}>, i64) -> !cc.struct<{!cc.ptr<i8>, i64}>
%11 = cc.compute_ptr %10[%arg3] : (!cc.ptr<!cc.array<i8 x ?>>, i64) -> !cc.ptr<i8>
%12 = cc.cast %11 : (!cc.ptr<i8>) -> !cc.ptr<!cc.ptr<i8>>
cc.store %6, %12 : !cc.ptr<!cc.ptr<i8>>
return %9 : !cc.struct<{!cc.ptr<i8>, i64}>
})#"},

Expand Down Expand Up @@ -319,7 +327,7 @@ static constexpr IntrinsicCode intrinsicTable[] = {
{cudaq::runtime::launchKernelFuncName,
{},
R"#(
func.func private @altLaunchKernel(!cc.ptr<i8>, !cc.ptr<i8>, !cc.ptr<i8>, i64, i64) -> ())#"},
func.func private @altLaunchKernel(!cc.ptr<i8>, !cc.ptr<i8>, !cc.ptr<i8>, i64, i64) -> !cc.struct<{!cc.ptr<i8>, i64}>)#"},

{cudaq::runtime::CudaqRegisterArgsCreator,
{},
Expand All @@ -346,7 +354,7 @@ static constexpr IntrinsicCode intrinsicTable[] = {
{cudaq::runtime::launchKernelHybridFuncName,
{},
R"#(
func.func private @hybridLaunchKernel(!cc.ptr<i8>, !cc.ptr<i8>, !cc.ptr<i8>, i64, i64, !cc.ptr<i8>) -> ())#"},
func.func private @hybridLaunchKernel(!cc.ptr<i8>, !cc.ptr<i8>, !cc.ptr<i8>, i64, i64, !cc.ptr<i8>) -> !cc.struct<{!cc.ptr<i8>, i64}>)#"},

{cudaq::llvmMemCopyIntrinsic, // llvm.memcpy.p0i8.p0i8.i64
{},
Expand Down
146 changes: 106 additions & 40 deletions lib/Optimizer/Transforms/GenKernelExecution.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,12 @@ static bool isCodegenArgumentGather(std::size_t kind) {
return kind == 0 || kind == 2;
}

/// This pass adds a `<kernel name>.thunk` function and a rewritten C++ host
/// side (mangled) stub to the code for every entry-point kernel in the module.
/// It may also generate a `<kernel name>.argsCreator` function. Finally, it
/// creates registration hooks for the CUDA-Q runtime to be able to find the
/// kernel by name and, as appropriate, the `<kernel name>.argsCreator`
/// function.
namespace {
class GenerateKernelExecution
: public cudaq::opt::impl::GenerateKernelExecutionBase<
Expand All @@ -57,6 +63,19 @@ class GenerateKernelExecution

/// Creates the function signature for a thunk function. The signature is
/// always the same for all thunk functions.
///
/// Every thunk function has an identical signature, making it callable from a
/// generic "kernel launcher" in the CUDA-Q runtime.
///
/// This signature is defined as: `(ptr, bool) -> {ptr, i64}`.
///
/// The first argument is a pointer to a data buffer that encodes all the
/// arguments (and static return) values to (and from) the kernel in the
/// pointer-free encoding. The second argument indicates if this call is to a
/// remote process (if true). The result is a pointer and size (span) if the
/// kernel returns a dynamically sized result, otherwise it will be
/// `{nullptr, 0}`. It is the responsibility of calling code to free any
/// dynamic result buffer(s) and convert those to `std::vector` objects.
FunctionType getThunkType(MLIRContext *ctx) {
auto ptrTy = cudaq::cc::PointerType::get(IntegerType::get(ctx, 8));
return FunctionType::get(ctx, {ptrTy, IntegerType::get(ctx, 1)},
Expand Down Expand Up @@ -769,31 +788,32 @@ class GenerateKernelExecution
auto *thenBlock = builder.createBlock(reg);
auto *elseBlock = builder.createBlock(reg);
builder.setInsertionPointToEnd(currentBlock);
auto eleTy = structTy.getMember(offset);
auto memTy = cudaq::cc::PointerType::get(eleTy);
auto mem = builder.create<cudaq::cc::ComputePtrOp>(
loc, memTy, castOp, SmallVector<cudaq::cc::ComputePtrArg>{offset});
auto resPtrTy = cudaq::cc::PointerType::get(call.getResult(0).getType());
auto castMem = builder.create<cudaq::cc::CastOp>(loc, resPtrTy, mem);
builder.create<cudaq::cc::StoreOp>(loc, call.getResult(0), castMem);
builder.create<cf::CondBranchOp>(loc, isClientServer, thenBlock,
elseBlock);
builder.setInsertionPointToEnd(thenBlock);
auto gepRes = builder.create<cudaq::cc::ComputePtrOp>(
loc, cudaq::cc::PointerType::get(structTy.getMember(offset)), castOp,
ArrayRef<cudaq::cc::ComputePtrArg>{offset});
auto resAsVec = builder.create<cudaq::cc::CastOp>(
loc, cudaq::cc::PointerType::get(funcTy.getResult(0)), gepRes);
builder.create<cudaq::cc::StoreOp>(loc, call.getResult(0), resAsVec);
auto resAsArg = builder.create<cudaq::cc::CastOp>(
loc, cudaq::cc::PointerType::get(thunkTy.getResults()[0]), gepRes);
// createDynamicResult packs the input values and the dynamic results
// into a single buffer to pass back as a message.
loc, cudaq::cc::PointerType::get(thunkTy.getResults()[0]), mem);
auto retOffset = genComputeReturnOffset(loc, builder, funcTy, structTy);
// createDynamicResult allocates a new buffer and packs the input values
// and the dynamic results into this single new buffer to pass back as a
// message.
// NB: This code only handles one dimensional vectors of static types. It
// will have to be changed if there is a need to return recursively
// dynamic structures, i.e., vectors of vectors.
auto res = builder.create<func::CallOp>(
loc, thunkTy.getResults()[0], "__nvqpp_createDynamicResult",
ValueRange{thunkEntry->getArgument(0), structSize, resAsArg});
ValueRange{thunkEntry->getArgument(0), structSize, resAsArg,
retOffset});
builder.create<func::ReturnOp>(loc, res.getResult(0));
builder.setInsertionPointToEnd(elseBlock);
auto eleTy = structTy.getMember(offset);
auto memTy = cudaq::cc::PointerType::get(eleTy);
auto mem = builder.create<cudaq::cc::ComputePtrOp>(
loc, memTy, castOp, SmallVector<cudaq::cc::ComputePtrArg>{offset});
auto resPtrTy = cudaq::cc::PointerType::get(call.getResult(0).getType());
auto castMem = builder.create<cudaq::cc::CastOp>(loc, resPtrTy, mem);
builder.create<cudaq::cc::StoreOp>(loc, call.getResult(0), castMem);
// For the else case, the span was already copied to the block.
} else {
// FIXME: Should check for recursive vector case.
// If the kernel returns non-dynamic results (no spans), then take those
Expand Down Expand Up @@ -854,8 +874,6 @@ class GenerateKernelExecution
auto ptrPtrTy = cudaq::cc::PointerType::get(ptrTy);
auto sret0 = builder.create<cudaq::cc::ComputePtrOp>(
loc, ptrPtrTy, castSret, SmallVector<cudaq::cc::ComputePtrArg>{0});
Value vecPtr = builder.create<cudaq::cc::LoadOp>(loc, ptrTy, sret0);
builder.create<func::CallOp>(loc, std::nullopt, "free", ValueRange{vecPtr});
auto arrI8Ty = cudaq::cc::ArrayType::get(i8Ty);
auto ptrArrTy = cudaq::cc::PointerType::get(arrI8Ty);
auto buffPtr0 = builder.create<cudaq::cc::CastOp>(loc, ptrTy, data);
Expand Down Expand Up @@ -1338,21 +1356,72 @@ class GenerateKernelExecution
auto castLoadKernName =
builder.create<cudaq::cc::CastOp>(loc, ptrI8Ty, loadKernName);

auto hostFuncTy = hostFunc.getFunctionType();
assert((hostFuncTy.getResults().empty() ||
(hostFuncTy.getNumResults() == 1)) &&
"C++ function expected to have 0 or 1 return value");
const bool resultVal = !hostFuncTy.getResults().empty();
const bool kernelReturnsValue =
resultVal || cudaq::opt::factory::hasSRet(hostFunc);
Value launchResult;
Value launchResultToFree;
auto decodeLaunchResults = [&](Value spanReturned) {
if (!kernelReturnsValue)
return;
Type res0Ty = structTy.getMember(offset);
auto ptrResTy = cudaq::cc::PointerType::get(res0Ty);
auto rptr = builder.create<cudaq::cc::ExtractValueOp>(loc, ptrI8Ty,
spanReturned, 0);
launchResultToFree = rptr;
auto rIntPtr = builder.create<cudaq::cc::CastOp>(loc, i64Ty, rptr);
auto zero = builder.create<arith::ConstantIntOp>(loc, 0, 64);
auto cmp = builder.create<arith::CmpIOp>(loc, arith::CmpIPredicate::ne,
rIntPtr, zero);
auto *currentBlock = builder.getBlock();
auto *reg = currentBlock->getParent();
auto *thenBlock = builder.createBlock(reg);
auto *elseBlock = builder.createBlock(reg);
auto *endifBlock = builder.createBlock(
reg, reg->end(), TypeRange{ptrResTy}, SmallVector<Location>(1, loc));
builder.setInsertionPointToEnd(currentBlock);
builder.create<cf::CondBranchOp>(loc, cmp, thenBlock, elseBlock);
builder.setInsertionPointToEnd(thenBlock);
// dynamic result was returned.
// We need to free() this buffer before the end of this function.
auto rStructPtr =
builder.create<cudaq::cc::CastOp>(loc, structPtrTy, rptr);
Value lRes = builder.create<cudaq::cc::ComputePtrOp>(
loc, ptrResTy, rStructPtr,
ArrayRef<cudaq::cc::ComputePtrArg>{offset});
builder.create<cf::BranchOp>(loc, endifBlock, ArrayRef<Value>{lRes});
builder.setInsertionPointToEnd(elseBlock);
// span was returned in the original buffer.
Value mRes = builder.create<cudaq::cc::ComputePtrOp>(
loc, ptrResTy, temp, ArrayRef<cudaq::cc::ComputePtrArg>{0, offset});
builder.create<cf::BranchOp>(loc, endifBlock, ArrayRef<Value>{mRes});
builder.setInsertionPointToEnd(endifBlock);
launchResult = endifBlock->getArgument(0);
};

// Generate the call to `launchKernel`.
switch (codegenKind) {
case 0: {
assert(vecArgPtrs && castLoadThunk);
builder.create<func::CallOp>(
loc, std::nullopt, cudaq::runtime::launchKernelHybridFuncName,
auto launch = builder.create<func::CallOp>(
loc, cudaq::opt::factory::getDynamicBufferType(ctx),
cudaq::runtime::launchKernelHybridFuncName,
ArrayRef<Value>{castLoadKernName, castLoadThunk, castTemp,
extendedStructSize, resultOffset, vecArgPtrs});
decodeLaunchResults(launch.getResult(0));
} break;
case 1: {
assert(!vecArgPtrs && castLoadThunk);
builder.create<func::CallOp>(
loc, std::nullopt, cudaq::runtime::launchKernelFuncName,
auto launch = builder.create<func::CallOp>(
loc, cudaq::opt::factory::getDynamicBufferType(ctx),
cudaq::runtime::launchKernelFuncName,
ArrayRef<Value>{castLoadKernName, castLoadThunk, castTemp,
extendedStructSize, resultOffset});
decodeLaunchResults(launch.getResult(0));
} break;
case 2: {
assert(vecArgPtrs && !castLoadThunk);
Expand All @@ -1377,17 +1446,13 @@ class GenerateKernelExecution
// result value(s) from the struct returned by `launchKernel` and return
// them to our caller.
SmallVector<Value> results;
auto hostFuncTy = hostFunc.getFunctionType();
assert((hostFuncTy.getResults().empty() ||
(hostFuncTy.getNumResults() == 1)) &&
"C++ function expected to have 0 or 1 return value");
const bool resultVal = !hostFuncTy.getResults().empty();
if (resultVal || cudaq::opt::factory::hasSRet(hostFunc)) {
if (kernelReturnsValue) {
Type res0Ty = structTy.getMember(offset);
auto ptrResTy = cudaq::cc::PointerType::get(res0Ty);
// Host function returns a value. Either returning by value or via an sret
// reference.
if (resultVal) {
Type res0Ty = structTy.getMember(offset);
auto ptrResTy = cudaq::cc::PointerType::get(res0Ty);
// Static values. std::vector are necessarily sret, see below.
auto resPtr = builder.create<cudaq::cc::ComputePtrOp>(
loc, ptrResTy, temp, ArrayRef<cudaq::cc::ComputePtrArg>{0, offset});
Type castToTy = cudaq::cc::PointerType::get(hostFuncTy.getResult(0));
Expand All @@ -1398,22 +1463,22 @@ class GenerateKernelExecution
}();
results.push_back(builder.create<cudaq::cc::LoadOp>(loc, castResPtr));
} else {
// Check if device is returning a span. If it is, then we will need to
// convert it to a std::vector here. The vector is constructed in-place
// on the sret memory block.
// This is an sret return. Check if device is returning a span. If it
// is, then we will need to convert it to a std::vector here. The vector
// is constructed in-place on the sret memory block.
Value arg0 = hostFuncEntryBlock->getArguments().front();
if (auto spanTy =
dyn_cast<cudaq::cc::SpanLikeType>(devFuncTy.getResult(0))) {
auto eleTy = spanTy.getElementType();
auto ptrTy = cudaq::cc::PointerType::get(eleTy);
auto gep0 = builder.create<cudaq::cc::ComputePtrOp>(
loc, cudaq::cc::PointerType::get(ptrTy), temp,
SmallVector<cudaq::cc::ComputePtrArg>{0, offset, 0});
loc, cudaq::cc::PointerType::get(ptrTy), launchResult,
SmallVector<cudaq::cc::ComputePtrArg>{0});
auto dataPtr = builder.create<cudaq::cc::LoadOp>(loc, gep0);
auto lenPtrTy = cudaq::cc::PointerType::get(i64Ty);
auto gep1 = builder.create<cudaq::cc::ComputePtrOp>(
loc, lenPtrTy, temp,
SmallVector<cudaq::cc::ComputePtrArg>{0, offset, 1});
loc, lenPtrTy, launchResult,
SmallVector<cudaq::cc::ComputePtrArg>{1});
auto vecLen = builder.create<cudaq::cc::LoadOp>(loc, gep1);
if (spanTy.getElementType() == builder.getI1Type()) {
genStdvecBoolFromInitList(loc, builder, arg0, dataPtr, vecLen);
Expand All @@ -1422,13 +1487,14 @@ class GenerateKernelExecution
builder.create<cudaq::cc::SizeOfOp>(loc, i64Ty, eleTy);
genStdvecTFromInitList(loc, builder, arg0, dataPtr, tSize, vecLen);
}
// free(nullptr) is defined to be a nop in the standard.
builder.create<func::CallOp>(loc, std::nullopt, "free",
ArrayRef<Value>{launchResultToFree});
} else {
// Otherwise, we can just copy the aggregate into the sret memory
// block. Uses the size of the host function's sret pointer element
// type for the memcpy, so the device should return an (aggregate)
// value of suitable size.
Type res0Ty = structTy.getMember(offset);
auto ptrResTy = cudaq::cc::PointerType::get(res0Ty);
auto resPtr = builder.create<cudaq::cc::ComputePtrOp>(
loc, ptrResTy, temp,
ArrayRef<cudaq::cc::ComputePtrArg>{0, offset});
Expand Down
Loading

0 comments on commit 4fe3c4b

Please sign in to comment.