diff --git a/docs/sphinx/api/languages/cpp_api.rst b/docs/sphinx/api/languages/cpp_api.rst index 1a26fb2d41..7f4728199e 100644 --- a/docs/sphinx/api/languages/cpp_api.rst +++ b/docs/sphinx/api/languages/cpp_api.rst @@ -190,6 +190,10 @@ Platform .. doxygentypedef:: cudaq::KernelExecutionTask +.. doxygenstruct:: cudaq::KernelThunkResultType + +.. doxygentypedef:: cudaq::KernelThunkType + Utilities ========= diff --git a/lib/Optimizer/Builder/Intrinsics.cpp b/lib/Optimizer/Builder/Intrinsics.cpp index 12e430dc03..a6cc0ae477 100644 --- a/lib/Optimizer/Builder/Intrinsics.cpp +++ b/lib/Optimizer/Builder/Intrinsics.cpp @@ -232,9 +232,14 @@ static constexpr IntrinsicCode intrinsicTable[] = { })#"}, {"__nvqpp_createDynamicResult", + /* arguments: + arg0: original buffer ptr + arg1: original buffer size + arg2: ptr to span of the return data: {ptr, bytes} + arg3: offset to result slot in buffer */ {cudaq::llvmMemCopyIntrinsic, "malloc"}, R"#( - func.func private @__nvqpp_createDynamicResult(%arg0: !cc.ptr, %arg1: i64, %arg2: !cc.ptr, i64}>>) -> !cc.struct<{!cc.ptr, i64}> { + func.func private @__nvqpp_createDynamicResult(%arg0: !cc.ptr, %arg1: i64, %arg2: !cc.ptr, i64}>>, %arg3: i64) -> !cc.struct<{!cc.ptr, i64}> { %0 = cc.compute_ptr %arg2[1] : (!cc.ptr, i64}>>) -> !cc.ptr %1 = cc.load %0 : !cc.ptr %2 = arith.addi %arg1, %1 : i64 @@ -249,6 +254,9 @@ static constexpr IntrinsicCode intrinsicTable[] = { %7 = cc.undef !cc.struct<{!cc.ptr, i64}> %8 = cc.insert_value %3, %7[0] : (!cc.struct<{!cc.ptr, i64}>, !cc.ptr) -> !cc.struct<{!cc.ptr, i64}> %9 = cc.insert_value %2, %8[1] : (!cc.struct<{!cc.ptr, i64}>, i64) -> !cc.struct<{!cc.ptr, i64}> + %11 = cc.compute_ptr %10[%arg3] : (!cc.ptr>, i64) -> !cc.ptr + %12 = cc.cast %11 : (!cc.ptr) -> !cc.ptr> + cc.store %6, %12 : !cc.ptr> return %9 : !cc.struct<{!cc.ptr, i64}> })#"}, @@ -319,7 +327,7 @@ static constexpr IntrinsicCode intrinsicTable[] = { {cudaq::runtime::launchKernelFuncName, {}, R"#( - func.func private @altLaunchKernel(!cc.ptr, !cc.ptr, !cc.ptr, i64, i64) -> ())#"}, + func.func private @altLaunchKernel(!cc.ptr, !cc.ptr, !cc.ptr, i64, i64) -> !cc.struct<{!cc.ptr, i64}>)#"}, {cudaq::runtime::CudaqRegisterArgsCreator, {}, @@ -346,7 +354,7 @@ static constexpr IntrinsicCode intrinsicTable[] = { {cudaq::runtime::launchKernelHybridFuncName, {}, R"#( - func.func private @hybridLaunchKernel(!cc.ptr, !cc.ptr, !cc.ptr, i64, i64, !cc.ptr) -> ())#"}, + func.func private @hybridLaunchKernel(!cc.ptr, !cc.ptr, !cc.ptr, i64, i64, !cc.ptr) -> !cc.struct<{!cc.ptr, i64}>)#"}, {cudaq::llvmMemCopyIntrinsic, // llvm.memcpy.p0i8.p0i8.i64 {}, diff --git a/lib/Optimizer/Transforms/GenKernelExecution.cpp b/lib/Optimizer/Transforms/GenKernelExecution.cpp index a4667ce7b5..2e45c8df96 100644 --- a/lib/Optimizer/Transforms/GenKernelExecution.cpp +++ b/lib/Optimizer/Transforms/GenKernelExecution.cpp @@ -48,6 +48,12 @@ static bool isCodegenArgumentGather(std::size_t kind) { return kind == 0 || kind == 2; } +/// This pass adds a `.thunk` function and a rewritten C++ host +/// side (mangled) stub to the code for every entry-point kernel in the module. +/// It may also generate a `.argsCreator` function. Finally, it +/// creates registration hooks for the CUDA-Q runtime to be able to find the +/// kernel by name and, as appropriate, the `.argsCreator` +/// function. namespace { class GenerateKernelExecution : public cudaq::opt::impl::GenerateKernelExecutionBase< @@ -57,6 +63,19 @@ class GenerateKernelExecution /// Creates the function signature for a thunk function. 
The signature is /// always the same for all thunk functions. + /// + /// Every thunk function has an identical signature, making it callable from a + /// generic "kernel launcher" in the CUDA-Q runtime. + /// + /// This signature is defined as: `(ptr, bool) -> {ptr, i64}`. + /// + /// The first argument is a pointer to a data buffer that encodes all the + /// arguments (and static return) values to (and from) the kernel in the + /// pointer-free encoding. The second argument indicates if this call is to a + /// remote process (if true). The result is a pointer and size (span) if the + /// kernel returns a dynamically sized result, otherwise it will be + /// `{nullptr, 0}`. It is the responsibility of calling code to free any + /// dynamic result buffer(s) and convert those to `std::vector` objects. FunctionType getThunkType(MLIRContext *ctx) { auto ptrTy = cudaq::cc::PointerType::get(IntegerType::get(ctx, 8)); return FunctionType::get(ctx, {ptrTy, IntegerType::get(ctx, 1)}, @@ -769,31 +788,32 @@ class GenerateKernelExecution auto *thenBlock = builder.createBlock(reg); auto *elseBlock = builder.createBlock(reg); builder.setInsertionPointToEnd(currentBlock); + auto eleTy = structTy.getMember(offset); + auto memTy = cudaq::cc::PointerType::get(eleTy); + auto mem = builder.create( + loc, memTy, castOp, SmallVector{offset}); + auto resPtrTy = cudaq::cc::PointerType::get(call.getResult(0).getType()); + auto castMem = builder.create(loc, resPtrTy, mem); + builder.create(loc, call.getResult(0), castMem); builder.create(loc, isClientServer, thenBlock, elseBlock); builder.setInsertionPointToEnd(thenBlock); - auto gepRes = builder.create( - loc, cudaq::cc::PointerType::get(structTy.getMember(offset)), castOp, - ArrayRef{offset}); - auto resAsVec = builder.create( - loc, cudaq::cc::PointerType::get(funcTy.getResult(0)), gepRes); - builder.create(loc, call.getResult(0), resAsVec); auto resAsArg = builder.create( - loc, cudaq::cc::PointerType::get(thunkTy.getResults()[0]), gepRes); - // createDynamicResult packs the input values and the dynamic results - // into a single buffer to pass back as a message. + loc, cudaq::cc::PointerType::get(thunkTy.getResults()[0]), mem); + auto retOffset = genComputeReturnOffset(loc, builder, funcTy, structTy); + // createDynamicResult allocates a new buffer and packs the input values + // and the dynamic results into this single new buffer to pass back as a + // message. + // NB: This code only handles one dimensional vectors of static types. It + // will have to be changed if there is a need to return recursively + // dynamic structures, i.e., vectors of vectors. auto res = builder.create( loc, thunkTy.getResults()[0], "__nvqpp_createDynamicResult", - ValueRange{thunkEntry->getArgument(0), structSize, resAsArg}); + ValueRange{thunkEntry->getArgument(0), structSize, resAsArg, + retOffset}); builder.create(loc, res.getResult(0)); builder.setInsertionPointToEnd(elseBlock); - auto eleTy = structTy.getMember(offset); - auto memTy = cudaq::cc::PointerType::get(eleTy); - auto mem = builder.create( - loc, memTy, castOp, SmallVector{offset}); - auto resPtrTy = cudaq::cc::PointerType::get(call.getResult(0).getType()); - auto castMem = builder.create(loc, resPtrTy, mem); - builder.create(loc, call.getResult(0), castMem); + // For the else case, the span was already copied to the block. } else { // FIXME: Should check for recursive vector case. 
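To make the repacking step concrete: a behavioral C++ sketch of what `__nvqpp_createDynamicResult` now does with its new fourth argument, reconstructed from the argument comments in Intrinsics.cpp above. The `Span` struct and the function signature are illustrative stand-ins for the `{ptr, i64}` ABI type, not the runtime's actual declarations.

```cpp
#include <cstdint>
#include <cstdlib>
#include <cstring>

// Illustrative stand-in for the {ptr, i64} span type used by the ABI.
struct Span {
  char *ptr;
  std::uint64_t bytes;
};

// Behavioral sketch of __nvqpp_createDynamicResult: build a single
// self-contained message consisting of the original argument/result
// buffer followed by the dynamically sized payload.
Span createDynamicResult(char *origBuffer, std::uint64_t origSize,
                         Span *dynRes, std::uint64_t resultOffset) {
  std::uint64_t total = origSize + dynRes->bytes;
  char *buf = static_cast<char *>(std::malloc(total));
  std::memcpy(buf, origBuffer, origSize);        // copy original buffer
  char *tail = buf + origSize;
  std::memcpy(tail, dynRes->ptr, dynRes->bytes); // append dynamic data
  // New in this patch: use the offset argument (%arg3) to patch the
  // result slot inside the copied buffer so that it references the
  // appended payload rather than pointing into memory that will be
  // invalid on the receiving side.
  auto *slot = reinterpret_cast<Span *>(buf + resultOffset);
  slot->ptr = tail;
  return {buf, total};
}
```

Per the comment on `getThunkType`, the code that invokes the thunk owns the returned buffer and must `free` it after converting the span into a `std::vector`.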
// If the kernel returns non-dynamic results (no spans), then take those @@ -854,8 +874,6 @@ class GenerateKernelExecution auto ptrPtrTy = cudaq::cc::PointerType::get(ptrTy); auto sret0 = builder.create( loc, ptrPtrTy, castSret, SmallVector{0}); - Value vecPtr = builder.create(loc, ptrTy, sret0); - builder.create(loc, std::nullopt, "free", ValueRange{vecPtr}); auto arrI8Ty = cudaq::cc::ArrayType::get(i8Ty); auto ptrArrTy = cudaq::cc::PointerType::get(arrI8Ty); auto buffPtr0 = builder.create(loc, ptrTy, data); @@ -1338,21 +1356,72 @@ class GenerateKernelExecution auto castLoadKernName = builder.create(loc, ptrI8Ty, loadKernName); + auto hostFuncTy = hostFunc.getFunctionType(); + assert((hostFuncTy.getResults().empty() || + (hostFuncTy.getNumResults() == 1)) && + "C++ function expected to have 0 or 1 return value"); + const bool resultVal = !hostFuncTy.getResults().empty(); + const bool kernelReturnsValue = + resultVal || cudaq::opt::factory::hasSRet(hostFunc); + Value launchResult; + Value launchResultToFree; + auto decodeLaunchResults = [&](Value spanReturned) { + if (!kernelReturnsValue) + return; + Type res0Ty = structTy.getMember(offset); + auto ptrResTy = cudaq::cc::PointerType::get(res0Ty); + auto rptr = builder.create(loc, ptrI8Ty, + spanReturned, 0); + launchResultToFree = rptr; + auto rIntPtr = builder.create(loc, i64Ty, rptr); + auto zero = builder.create(loc, 0, 64); + auto cmp = builder.create(loc, arith::CmpIPredicate::ne, + rIntPtr, zero); + auto *currentBlock = builder.getBlock(); + auto *reg = currentBlock->getParent(); + auto *thenBlock = builder.createBlock(reg); + auto *elseBlock = builder.createBlock(reg); + auto *endifBlock = builder.createBlock( + reg, reg->end(), TypeRange{ptrResTy}, SmallVector(1, loc)); + builder.setInsertionPointToEnd(currentBlock); + builder.create(loc, cmp, thenBlock, elseBlock); + builder.setInsertionPointToEnd(thenBlock); + // dynamic result was returned. + // We need to free() this buffer before the end of this function. + auto rStructPtr = + builder.create(loc, structPtrTy, rptr); + Value lRes = builder.create( + loc, ptrResTy, rStructPtr, + ArrayRef{offset}); + builder.create(loc, endifBlock, ArrayRef{lRes}); + builder.setInsertionPointToEnd(elseBlock); + // span was returned in the original buffer. + Value mRes = builder.create( + loc, ptrResTy, temp, ArrayRef{0, offset}); + builder.create(loc, endifBlock, ArrayRef{mRes}); + builder.setInsertionPointToEnd(endifBlock); + launchResult = endifBlock->getArgument(0); + }; + // Generate the call to `launchKernel`. 
switch (codegenKind) { case 0: { assert(vecArgPtrs && castLoadThunk); - builder.create( - loc, std::nullopt, cudaq::runtime::launchKernelHybridFuncName, + auto launch = builder.create( + loc, cudaq::opt::factory::getDynamicBufferType(ctx), + cudaq::runtime::launchKernelHybridFuncName, ArrayRef{castLoadKernName, castLoadThunk, castTemp, extendedStructSize, resultOffset, vecArgPtrs}); + decodeLaunchResults(launch.getResult(0)); } break; case 1: { assert(!vecArgPtrs && castLoadThunk); - builder.create( - loc, std::nullopt, cudaq::runtime::launchKernelFuncName, + auto launch = builder.create( + loc, cudaq::opt::factory::getDynamicBufferType(ctx), + cudaq::runtime::launchKernelFuncName, ArrayRef{castLoadKernName, castLoadThunk, castTemp, extendedStructSize, resultOffset}); + decodeLaunchResults(launch.getResult(0)); } break; case 2: { assert(vecArgPtrs && !castLoadThunk); @@ -1377,17 +1446,13 @@ class GenerateKernelExecution // result value(s) from the struct returned by `launchKernel` and return // them to our caller. SmallVector results; - auto hostFuncTy = hostFunc.getFunctionType(); - assert((hostFuncTy.getResults().empty() || - (hostFuncTy.getNumResults() == 1)) && - "C++ function expected to have 0 or 1 return value"); - const bool resultVal = !hostFuncTy.getResults().empty(); - if (resultVal || cudaq::opt::factory::hasSRet(hostFunc)) { + if (kernelReturnsValue) { + Type res0Ty = structTy.getMember(offset); + auto ptrResTy = cudaq::cc::PointerType::get(res0Ty); // Host function returns a value. Either returning by value or via an sret // reference. if (resultVal) { - Type res0Ty = structTy.getMember(offset); - auto ptrResTy = cudaq::cc::PointerType::get(res0Ty); + // Static values. std::vector are necessarily sret, see below. auto resPtr = builder.create( loc, ptrResTy, temp, ArrayRef{0, offset}); Type castToTy = cudaq::cc::PointerType::get(hostFuncTy.getResult(0)); @@ -1398,22 +1463,22 @@ class GenerateKernelExecution }(); results.push_back(builder.create(loc, castResPtr)); } else { - // Check if device is returning a span. If it is, then we will need to - // convert it to a std::vector here. The vector is constructed in-place - // on the sret memory block. + // This is an sret return. Check if device is returning a span. If it + // is, then we will need to convert it to a std::vector here. The vector + // is constructed in-place on the sret memory block. Value arg0 = hostFuncEntryBlock->getArguments().front(); if (auto spanTy = dyn_cast(devFuncTy.getResult(0))) { auto eleTy = spanTy.getElementType(); auto ptrTy = cudaq::cc::PointerType::get(eleTy); auto gep0 = builder.create( - loc, cudaq::cc::PointerType::get(ptrTy), temp, - SmallVector{0, offset, 0}); + loc, cudaq::cc::PointerType::get(ptrTy), launchResult, + SmallVector{0}); auto dataPtr = builder.create(loc, gep0); auto lenPtrTy = cudaq::cc::PointerType::get(i64Ty); auto gep1 = builder.create( - loc, lenPtrTy, temp, - SmallVector{0, offset, 1}); + loc, lenPtrTy, launchResult, + SmallVector{1}); auto vecLen = builder.create(loc, gep1); if (spanTy.getElementType() == builder.getI1Type()) { genStdvecBoolFromInitList(loc, builder, arg0, dataPtr, vecLen); @@ -1422,13 +1487,14 @@ class GenerateKernelExecution builder.create(loc, i64Ty, eleTy); genStdvecTFromInitList(loc, builder, arg0, dataPtr, tSize, vecLen); } + // free(nullptr) is defined to be a nop in the standard. + builder.create(loc, std::nullopt, "free", + ArrayRef{launchResultToFree}); } else { // Otherwise, we can just copy the aggregate into the sret memory // block. 
Uses the size of the host function's sret pointer element // type for the memcpy, so the device should return an (aggregate) // value of suitable size. - Type res0Ty = structTy.getMember(offset); - auto ptrResTy = cudaq::cc::PointerType::get(res0Ty); auto resPtr = builder.create( loc, ptrResTy, temp, ArrayRef{0, offset}); diff --git a/lib/Optimizer/Transforms/LiftArrayAlloc.cpp b/lib/Optimizer/Transforms/LiftArrayAlloc.cpp index 9328b78896..689be49998 100644 --- a/lib/Optimizer/Transforms/LiftArrayAlloc.cpp +++ b/lib/Optimizer/Transforms/LiftArrayAlloc.cpp @@ -72,19 +72,17 @@ LogicalResult genVectorOfConstantsFromAttributes(cudaq::IRBuilder irBuilder, return success(); } } - } else if (auto floatTy = dyn_cast(eleTy)) { - if (floatTy == irBuilder.getF64Type()) { - auto vals = readConstantValues(values, floatTy); - if (vals.size() == values.size()) { - irBuilder.genVectorOfConstants(loc, module, name, vals); - return success(); - } - } else if (floatTy == irBuilder.getF32Type()) { - auto vals = readConstantValues(values, floatTy); - if (vals.size() == values.size()) { - irBuilder.genVectorOfConstants(loc, module, name, vals); - return success(); - } + } else if (eleTy == irBuilder.getF64Type()) { + auto vals = readConstantValues(values, eleTy); + if (vals.size() == values.size()) { + irBuilder.genVectorOfConstants(loc, module, name, vals); + return success(); + } + } else if (eleTy == irBuilder.getF32Type()) { + auto vals = readConstantValues(values, eleTy); + if (vals.size() == values.size()) { + irBuilder.genVectorOfConstants(loc, module, name, vals); + return success(); } } return failure(); @@ -147,7 +145,9 @@ class AllocaPattern : public OpRewritePattern { rewriter.create(loc, arrTy, valuesAttr); } - SmallVector toErase; + assert(conArr && "must have created the constant array"); + LLVM_DEBUG(llvm::dbgs() << "constant array is:\n" << conArr << '\n'); + bool cannotEraseAlloc = false; // Rewalk all the uses of alloc, u, which must be cc.cast or cc.compute_ptr. // For each,u, remove a store and replace a load with a cc.extract_value. @@ -176,8 +176,9 @@ class AllocaPattern : public OpRewritePattern { continue; } if (isa(useuser)) - toErase.push_back(useuser); - isLive = true; + rewriter.eraseOp(useuser); + LLVM_DEBUG(llvm::dbgs() << "alloc is live\n"); + cannotEraseAlloc = isLive = true; } if (auto ist = dyn_cast(user)) { rewriter.setInsertionPointAfter(user); @@ -188,20 +189,20 @@ class AllocaPattern : public OpRewritePattern { continue; } if (!isLive) - toErase.push_back(user); - } - if (toGlobal) { - if (conGlobal) { - rewriter.setInsertionPointAfter(alloc); - rewriter.replaceOp(alloc, conGlobal); - } - } else { - toErase.push_back(alloc); + rewriter.eraseOp(user); } - for (auto *op : toErase) - rewriter.eraseOp(op); - + if (toGlobal && conGlobal) { + rewriter.setInsertionPointAfter(alloc); + rewriter.replaceOp(alloc, conGlobal); + return success(); + } + if (cannotEraseAlloc) { + rewriter.setInsertionPointAfter(alloc); + rewriter.create(loc, conArr, alloc); + return success(); + } + rewriter.eraseOp(alloc); return success(); } @@ -305,12 +306,16 @@ class AllocaPattern : public OpRewritePattern { } // Process casts that are used in quake.init_state. 
if (cast.getType() == ptrUnsizedArrTy) { - if (getWriteOp(cast, 0)) - LLVM_DEBUG( - llvm::dbgs() - << "unexpected use of array size removing cast in a store" - << *op << '\n'); - continue; + if (cast->hasOneUse()) { + auto &use = *cast->getUses().begin(); + Operation *u = use.getOwner(); + if (isa_and_present(u)) { + toGlobalUses.push_back(op); + toGlobal = true; + continue; + } + } + return false; } LLVM_DEBUG(llvm::dbgs() << "unexpected cast: " << *op << '\n'); toGlobalUses.push_back(op); diff --git a/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp b/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp index b91627de9f..5a197f97a6 100644 --- a/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp +++ b/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp @@ -254,7 +254,7 @@ pyAltLaunchKernelBase(const std::string &name, MlirModule module, if (!thunkPtr) throw std::runtime_error("cudaq::builder failed to get thunk function"); - auto thunk = reinterpret_cast(*thunkPtr); + auto thunk = reinterpret_cast(*thunkPtr); std::string properName = name; @@ -327,15 +327,21 @@ pyAltLaunchKernelBase(const std::string &name, MlirModule module, if (launch) { auto &platform = cudaq::get_platform(); + auto uReturnOffset = static_cast(returnOffset); if (platform.is_remote() || platform.is_emulated()) { auto *wrapper = new cudaq::ArgWrapper{mod, names, rawArgs}; - cudaq::altLaunchKernel(name.c_str(), thunk, - reinterpret_cast(wrapper), size, - (uint64_t)returnOffset); + auto dynamicResult = cudaq::altLaunchKernel( + name.c_str(), thunk, reinterpret_cast(wrapper), size, + uReturnOffset); + if (dynamicResult.data_buffer || dynamicResult.size) + throw std::runtime_error("not implemented: support dynamic results"); delete wrapper; - } else - cudaq::altLaunchKernel(name.c_str(), thunk, rawArgs, size, - (uint64_t)returnOffset); + } else { + auto dynamicResult = cudaq::altLaunchKernel(name.c_str(), thunk, rawArgs, + size, uReturnOffset); + if (dynamicResult.data_buffer || dynamicResult.size) + throw std::runtime_error("not implemented: support dynamic results"); + } } return std::make_tuple(rawArgs, size, returnOffset); diff --git a/python/runtime/utils/PyRemoteSimulatorQPU.cpp b/python/runtime/utils/PyRemoteSimulatorQPU.cpp index 4cc998c363..f767bb652a 100644 --- a/python/runtime/utils/PyRemoteSimulatorQPU.cpp +++ b/python/runtime/utils/PyRemoteSimulatorQPU.cpp @@ -130,16 +130,19 @@ class PyRemoteSimulatorQPU : public cudaq::BaseRemoteSimulatorQPU { kernelArgs, gradient, H, optimizer, n_params, shots); } - void launchKernel(const std::string &name, void (*kernelFunc)(void *), - void *args, std::uint64_t voidStarSize, - std::uint64_t resultOffset, - const std::vector &rawArgs) override { + cudaq::KernelThunkResultType + launchKernel(const std::string &name, cudaq::KernelThunkType kernelFunc, + void *args, std::uint64_t voidStarSize, + std::uint64_t resultOffset, + const std::vector &rawArgs) override { cudaq::info("PyRemoteSimulatorQPU: Launch kernel named '{}' remote QPU {} " "(simulator = {})", name, qpu_id, m_simName); ::launchKernelImpl(getExecutionContextForMyThread(), m_client, m_simName, - name, kernelFunc, args, voidStarSize, resultOffset, - rawArgs); + name, make_degenerate_kernel_type(kernelFunc), args, + voidStarSize, resultOffset, rawArgs); + // TODO: Python should probably support return values too. 
+ return {}; } void launchKernel(const std::string &name, @@ -178,16 +181,19 @@ class PyNvcfSimulatorQPU : public cudaq::BaseNvcfSimulatorQPU { kernelArgs, gradient, H, optimizer, n_params, shots); } - void launchKernel(const std::string &name, void (*kernelFunc)(void *), - void *args, std::uint64_t voidStarSize, - std::uint64_t resultOffset, - const std::vector &rawArgs) override { + cudaq::KernelThunkResultType + launchKernel(const std::string &name, cudaq::KernelThunkType kernelFunc, + void *args, std::uint64_t voidStarSize, + std::uint64_t resultOffset, + const std::vector &rawArgs) override { cudaq::info("PyNvcfSimulatorQPU: Launch kernel named '{}' remote QPU {} " "(simulator = {})", name, qpu_id, m_simName); ::launchKernelImpl(getExecutionContextForMyThread(), m_client, m_simName, - name, kernelFunc, args, voidStarSize, resultOffset, - rawArgs); + name, make_degenerate_kernel_type(kernelFunc), args, + voidStarSize, resultOffset, rawArgs); + // TODO: Python should probably support return values too. + return {}; } void launchKernel(const std::string &name, diff --git a/runtime/common/BaseRemoteRESTQPU.h b/runtime/common/BaseRemoteRESTQPU.h index 61c26dc791..c65e94c3b6 100644 --- a/runtime/common/BaseRemoteRESTQPU.h +++ b/runtime/common/BaseRemoteRESTQPU.h @@ -578,10 +578,11 @@ class BaseRemoteRESTQPU : public cudaq::QPU { /// the representation required by the targeted backend. Handle all pertinent /// modifications for the execution context as well as asynchronous or /// synchronous invocation. - void launchKernel(const std::string &kernelName, void (*kernelFunc)(void *), - void *args, std::uint64_t voidStarSize, - std::uint64_t resultOffset, - const std::vector &rawArgs) override { + KernelThunkResultType + launchKernel(const std::string &kernelName, KernelThunkType kernelFunc, + void *args, std::uint64_t voidStarSize, + std::uint64_t resultOffset, + const std::vector &rawArgs) override { cudaq::info("launching remote rest kernel ({})", kernelName); // TODO future iterations of this should support non-void return types. @@ -597,6 +598,9 @@ class BaseRemoteRESTQPU : public cudaq::QPU { auto codes = rawArgs.empty() ? lowerQuakeCode(kernelName, args) : lowerQuakeCode(kernelName, rawArgs); completeLaunchKernel(kernelName, std::move(codes)); + + // NB: Kernel should/will never return dynamic results. + return {}; } void completeLaunchKernel(const std::string &kernelName, diff --git a/runtime/common/BaseRemoteSimulatorQPU.h b/runtime/common/BaseRemoteSimulatorQPU.h index 667fba5941..6260f334c1 100644 --- a/runtime/common/BaseRemoteSimulatorQPU.h +++ b/runtime/common/BaseRemoteSimulatorQPU.h @@ -107,22 +107,24 @@ class BaseRemoteSimulatorQPU : public cudaq::QPU { void launchKernel(const std::string &name, const std::vector &rawArgs) override { - launchKernelImpl(name, nullptr, nullptr, 0, 0, &rawArgs); + [[maybe_unused]] auto dynamicResult = + launchKernelImpl(name, nullptr, nullptr, 0, 0, &rawArgs); } - void launchKernel(const std::string &name, void (*kernelFunc)(void *), - void *args, std::uint64_t voidStarSize, - std::uint64_t resultOffset, - const std::vector &rawArgs) override { + KernelThunkResultType + launchKernel(const std::string &name, KernelThunkType kernelFunc, void *args, + std::uint64_t voidStarSize, std::uint64_t resultOffset, + const std::vector &rawArgs) override { // Remote simulation cannot deal with rawArgs. Drop them on the floor. 
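These remote paths keep handing a plain `void (*)(void *)` to the existing client API by shedding the new thunk shape. A usage sketch of `make_degenerate_kernel_type` (defined in the new `ThunkInterface.h` later in this diff), with `legacyLaunch` as a hypothetical stand-in for such a legacy slot:

```cpp
#include "common/ThunkInterface.h"

// Hypothetical legacy entry point that only understands the
// degenerate void(*)(void*) kernel shape.
extern void legacyLaunch(void (*kernel)(void *), void *args);

// Sound only under the caveats spelled out in ThunkInterface.h: the
// callee must ignore the extra bool argument, and any span the kernel
// returns is dropped on the floor anyway.
void launchViaLegacySlot(cudaq::KernelThunkType thunk, void *args) {
  cudaq::KernelDegenerateType degenerate =
      cudaq::make_degenerate_kernel_type(thunk);
  legacyLaunch(degenerate, args);
}
```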
- launchKernelImpl(name, kernelFunc, args, voidStarSize, resultOffset, - nullptr); + return launchKernelImpl(name, kernelFunc, args, voidStarSize, resultOffset, + nullptr); } - void launchKernelImpl(const std::string &name, void (*kernelFunc)(void *), - void *args, std::uint64_t voidStarSize, - std::uint64_t resultOffset, - const std::vector *rawArgs) { + [[nodiscard]] KernelThunkResultType + launchKernelImpl(const std::string &name, KernelThunkType kernelFunc, + void *args, std::uint64_t voidStarSize, + std::uint64_t resultOffset, + const std::vector *rawArgs) { cudaq::info( "BaseRemoteSimulatorQPU: Launch kernel named '{}' remote QPU {} " "(simulator = {})", @@ -132,7 +134,7 @@ class BaseRemoteSimulatorQPU : public cudaq::QPU { getExecutionContextForMyThread(); if (executionContextPtr && executionContextPtr->name == "tracer") { - return; + return {}; } // Default context for a 'fire-and-ignore' kernel launch; i.e., no context @@ -155,7 +157,8 @@ class BaseRemoteSimulatorQPU : public cudaq::QPU { const bool requestOkay = m_client->sendRequest( *m_mlirContext, executionContext, /*serializedCodeContext=*/nullptr, /*vqe_gradient=*/nullptr, /*vqe_optimizer=*/nullptr, /*vqe_n_params=*/0, - m_simName, name, kernelFunc, args, voidStarSize, &errorMsg, rawArgs); + m_simName, name, make_degenerate_kernel_type(kernelFunc), args, + voidStarSize, &errorMsg, rawArgs); if (!requestOkay) throw std::runtime_error("Failed to launch kernel. Error: " + errorMsg); if (isDirectInvocation && @@ -182,6 +185,9 @@ class BaseRemoteSimulatorQPU : public cudaq::QPU { executionContext.invocationResultBuffer.size()); executionContext.invocationResultBuffer.clear(); } + + // Assumes kernel has no dynamic results. (Static result handled above.) + return {}; } void diff --git a/runtime/common/KernelWrapper.h b/runtime/common/KernelWrapper.h index 6c30efa58d..2e82522d91 100644 --- a/runtime/common/KernelWrapper.h +++ b/runtime/common/KernelWrapper.h @@ -537,17 +537,19 @@ std::invoke_result_t invokeKernel(QuantumKernel &&fn, // For raw function pointers, i.e., kernels described as free functions, we // send on the function pointer to the platform to retrieve the symbol name // since the typeid of a function only contains signature info. - if constexpr (std::is_class_v>) + if constexpr (std::is_class_v>) { // FIXME: this shouldn't use the serialization code any longer. It should // build a vector of void* and pass that instead. cudaq::get_platform().launchKernel(cudaq::getKernelName(fn), nullptr, (void *)serializedArgsBuffer.data(), serializedArgsBuffer.size(), 0, {}); - else + } else { cudaq::get_platform().launchKernel( - cudaq::getKernelName(fn), reinterpret_cast(&fn), + cudaq::getKernelName(fn), + reinterpret_cast(&fn), (void *)serializedArgsBuffer.data(), serializedArgsBuffer.size(), 0, {}); + } } #else return fn(std::forward(args)...); diff --git a/runtime/common/ThunkInterface.h b/runtime/common/ThunkInterface.h new file mode 100644 index 0000000000..05aeec37a3 --- /dev/null +++ b/runtime/common/ThunkInterface.h @@ -0,0 +1,44 @@ +/****************************************************************-*- C++ -*-**** + * Copyright (c) 2022 - 2024 NVIDIA Corporation & Affiliates. * + * All rights reserved. * + * * + * This source code and the accompanying materials are made available under * + * the terms of the Apache License 2.0 which accompanies this distribution. 
* + ******************************************************************************/ + +#pragma once + +#include + +namespace cudaq { + +/// A kernel may return results dynamically if the size of the result is not a +/// constant at compile-time. +struct KernelThunkResultType { + void *data_buffer; ///< Pointer to the first element of an array. + std::uint64_t size; ///< The size of the buffer in bytes. +}; + +/// The universal signature of a kernel thunk. +using KernelThunkType = KernelThunkResultType (*)(void *, bool); + +/// The degenerate form of a kernel call. In some launch cases, it may be +/// predetermined that the kernel can be called without a thunk. +using KernelDegenerateType = void (*)(void *); + +/// In some cases, the launcher will bypass the thunk function and call a +/// degenerate stub. That means that the extra `bool` argument will be ignored +/// by the called kernel and the kernel will not return a dynamic result. +/// +/// This is a terrible idea, generally speaking. However, if the launcher +/// neither looks for nor attempts to use the second `bool` argument at all, and +/// the launcher will drop any results returned from the kernel (regardless of +/// type) on the floor anyway, then one may be able to get away with using a +/// degenerate kernel type. +inline KernelDegenerateType +make_degenerate_kernel_type(KernelThunkType func_type) { + return reinterpret_cast( + reinterpret_cast(func_type)); +} + +} // namespace cudaq diff --git a/runtime/cudaq/platform/default/DefaultQuantumPlatform.cpp b/runtime/cudaq/platform/default/DefaultQuantumPlatform.cpp index 1b8d0b1141..df8a89e6f4 100644 --- a/runtime/cudaq/platform/default/DefaultQuantumPlatform.cpp +++ b/runtime/cudaq/platform/default/DefaultQuantumPlatform.cpp @@ -33,11 +33,12 @@ class DefaultQPU : public cudaq::QPU { execution_queue->enqueue(task); } - void launchKernel(const std::string &name, void (*kernelFunc)(void *), - void *args, std::uint64_t, std::uint64_t, - const std::vector &rawArgs) override { + cudaq::KernelThunkResultType + launchKernel(const std::string &name, cudaq::KernelThunkType kernelFunc, + void *args, std::uint64_t argsSize, std::uint64_t resultOffset, + const std::vector &rawArgs) override { ScopedTraceWithContext(cudaq::TIMING_LAUNCH, "QPU::launchKernel"); - kernelFunc(args); + return kernelFunc(args, /*isRemote=*/false); } /// Overrides setExecutionContext to forward it to the ExecutionManager diff --git a/runtime/cudaq/platform/mqpu/custatevec/GPUEmulatedQPU.cpp b/runtime/cudaq/platform/mqpu/custatevec/GPUEmulatedQPU.cpp index 1243e9f480..38b26f2a98 100644 --- a/runtime/cudaq/platform/mqpu/custatevec/GPUEmulatedQPU.cpp +++ b/runtime/cudaq/platform/mqpu/custatevec/GPUEmulatedQPU.cpp @@ -37,12 +37,13 @@ class GPUEmulatedQPU : public cudaq::QPU { execution_queue->enqueue(task); } - void launchKernel(const std::string &name, void (*kernelFunc)(void *), - void *args, std::uint64_t, std::uint64_t, - const std::vector &rawArgs) override { + cudaq::KernelThunkResultType + launchKernel(const std::string &name, cudaq::KernelThunkType kernelFunc, + void *args, std::uint64_t, std::uint64_t, + const std::vector &rawArgs) override { cudaq::info("QPU::launchKernel GPU {}", qpu_id); cudaSetDevice(qpu_id); - kernelFunc(args); + return kernelFunc(args, /*differentMemorySpace=*/false); } /// Overrides setExecutionContext to forward it to the ExecutionManager diff --git a/runtime/cudaq/platform/orca/OrcaQPU.cpp b/runtime/cudaq/platform/orca/OrcaQPU.cpp index 63883a7af3..fdb053bf87 100644 --- 
a/runtime/cudaq/platform/orca/OrcaQPU.cpp
+++ b/runtime/cudaq/platform/orca/OrcaQPU.cpp
@@ -23,8 +23,8 @@ cudaq::sample_result runSampling(TBIParameters &parameters,
   platform.set_exec_ctx(ctx.get(), qpu_id);
   platform.set_current_qpu(qpu_id);
 
-  cudaq::altLaunchKernel("orca_launch", nullptr, &parameters,
-                         sizeof(TBIParameters), 0);
+  [[maybe_unused]] auto dynamicResult = cudaq::altLaunchKernel(
+      "orca_launch", nullptr, &parameters, sizeof(TBIParameters), 0);
 
   platform.reset_exec_ctx(qpu_id);
   return ctx->result;
@@ -43,8 +43,8 @@ async_sample_result runAsyncSampling(TBIParameters &parameters,
   platform.set_exec_ctx(ctx.get(), qpu_id);
   platform.set_current_qpu(qpu_id);
 
-  cudaq::altLaunchKernel("orca_launch", nullptr, &parameters,
-                         sizeof(TBIParameters), 0);
+  [[maybe_unused]] auto dynamicResult = cudaq::altLaunchKernel(
+      "orca_launch", nullptr, &parameters, sizeof(TBIParameters), 0);
 
   // If we have a non-null future, set it
   futureResult = ctx->futureResult;
diff --git a/runtime/cudaq/platform/orca/OrcaRemoteRESTQPU.cpp b/runtime/cudaq/platform/orca/OrcaRemoteRESTQPU.cpp
index f834136fc4..1c63c92c2b 100644
--- a/runtime/cudaq/platform/orca/OrcaRemoteRESTQPU.cpp
+++ b/runtime/cudaq/platform/orca/OrcaRemoteRESTQPU.cpp
@@ -59,11 +59,10 @@ void OrcaRemoteRESTQPU::setTargetBackend(const std::string &backend) {
 }
 
 /// @brief Launch the experiment.
-void OrcaRemoteRESTQPU::launchKernel(const std::string &kernelName,
-                                     void (*kernelFunc)(void *), void *args,
-                                     std::uint64_t voidStarSize,
-                                     std::uint64_t resultOffset,
-                                     const std::vector<void *> &rawArgs) {
+KernelThunkResultType OrcaRemoteRESTQPU::launchKernel(
+    const std::string &kernelName, KernelThunkType kernelFunc, void *args,
+    std::uint64_t voidStarSize, std::uint64_t resultOffset,
+    const std::vector<void *> &rawArgs) {
   cudaq::info("OrcaRemoteRESTQPU: Launch kernel named '{}' remote QPU {}",
               kernelName, qpu_id);
 
@@ -88,12 +87,15 @@ void OrcaRemoteRESTQPU::launchKernel(const std::string &kernelName,
   // Keep this asynchronous if requested
   if (ctx->asyncExec) {
     ctx->futureResult = future;
-    return;
+    return {};
   }
 
   // Otherwise make this synchronous
   ctx->result = future.get();
+
+  // TODO: support dynamic result types.
+  return {};
 }
 } // namespace cudaq
 
-CUDAQ_REGISTER_TYPE(cudaq::QPU, cudaq::OrcaRemoteRESTQPU, orca)
\ No newline at end of file
+CUDAQ_REGISTER_TYPE(cudaq::QPU, cudaq::OrcaRemoteRESTQPU, orca)
diff --git a/runtime/cudaq/platform/orca/OrcaRemoteRESTQPU.h b/runtime/cudaq/platform/orca/OrcaRemoteRESTQPU.h
index 80d2df5726..d14a5f4e3c 100644
--- a/runtime/cudaq/platform/orca/OrcaRemoteRESTQPU.h
+++ b/runtime/cudaq/platform/orca/OrcaRemoteRESTQPU.h
@@ -120,10 +120,11 @@ class OrcaRemoteRESTQPU : public cudaq::QPU {
   /// @brief Launch the kernel. Handle all pertinent modifications for the
   /// execution context.

- void launchKernel(const std::string &kernelName, void (*kernelFunc)(void *), - void *args, std::uint64_t voidStarSize, - std::uint64_t resultOffset, - const std::vector &rawArgs) override; + KernelThunkResultType + launchKernel(const std::string &kernelName, KernelThunkType kernelFunc, + void *args, std::uint64_t voidStarSize, + std::uint64_t resultOffset, + const std::vector &rawArgs) override; void launchKernel(const std::string &kernelName, const std::vector &rawArgs) override { throw std::runtime_error("launch kernel on raw args not implemented"); diff --git a/runtime/cudaq/platform/qpu.h b/runtime/cudaq/platform/qpu.h index 13a6d7da25..d104094a1e 100644 --- a/runtime/cudaq/platform/qpu.h +++ b/runtime/cudaq/platform/qpu.h @@ -11,12 +11,12 @@ #include "QuantumExecutionQueue.h" #include "common/Logger.h" #include "common/Registry.h" +#include "common/ThunkInterface.h" #include "common/Timing.h" #include "cudaq/qis/execution_manager.h" #include "cudaq/qis/qubit_qis.h" #include "cudaq/remote_capabilities.h" #include "cudaq/utils/cudaq_utils.h" - #include namespace cudaq { @@ -172,9 +172,10 @@ class QPU : public registry::RegisteredType { /// Launch the kernel with given name (to extract its Quake representation). /// The raw function pointer is also provided, as are the runtime arguments, /// as a struct-packed void pointer and its corresponding size. - virtual void launchKernel(const std::string &name, void (*kernelFunc)(void *), - void *args, std::uint64_t, std::uint64_t, - const std::vector &rawArgs) = 0; + [[nodiscard]] virtual KernelThunkResultType + launchKernel(const std::string &name, KernelThunkType kernelFunc, void *args, + std::uint64_t, std::uint64_t, + const std::vector &rawArgs) = 0; /// Launch the kernel with given name and argument arrays. // This is intended for remote QPUs whereby we need to JIT-compile the kernel diff --git a/runtime/cudaq/platform/quantum_platform.cpp b/runtime/cudaq/platform/quantum_platform.cpp index 00e259c389..46f248c690 100644 --- a/runtime/cudaq/platform/quantum_platform.cpp +++ b/runtime/cudaq/platform/quantum_platform.cpp @@ -30,11 +30,11 @@ namespace cudaq { std::string get_quake(const std::string &); static quantum_platform *platform; -inline static constexpr std::string_view GetQuantumPlatformSymbol = +static constexpr std::string_view GetQuantumPlatformSymbol = "getQuantumPlatform"; void setQuantumPlatformInternal(quantum_platform *p) { - cudaq::info("external caller setting the platform."); + info("external caller setting the platform."); platform = p; } @@ -43,8 +43,8 @@ void setQuantumPlatformInternal(quantum_platform *p) { quantum_platform *getQuantumPlatformInternal() { if (platform) return platform; - platform = cudaq::getUniquePluginInstance( - GetQuantumPlatformSymbol); + platform = + getUniquePluginInstance(GetQuantumPlatformSymbol); return platform; } @@ -94,8 +94,7 @@ std::size_t quantum_platform::get_current_qpu() { return platformCurrentQPU; } // Specify the execution context for this platform. 
// This delegates to the targeted QPU -void quantum_platform::set_exec_ctx(cudaq::ExecutionContext *ctx, - std::size_t qid) { +void quantum_platform::set_exec_ctx(ExecutionContext *ctx, std::size_t qid) { executionContext = ctx; auto &platformQPU = platformQPUs[qid]; platformQPU->setExecutionContext(ctx); @@ -130,9 +129,8 @@ bool quantum_platform::supports_conditional_feedback( } void quantum_platform::launchVQE(const std::string kernelName, - const void *kernelArgs, - cudaq::gradient *gradient, cudaq::spin_op H, - cudaq::optimizer &optimizer, + const void *kernelArgs, gradient *gradient, + spin_op H, optimizer &optimizer, const int n_params, const std::size_t shots) { std::size_t qpu_id = 0; @@ -151,11 +149,10 @@ quantum_platform::get_remote_capabilities(const std::size_t qpu_id) const { return platformQPUs[qpu_id]->getRemoteCapabilities(); } -void quantum_platform::launchKernel(std::string kernelName, - void (*kernelFunc)(void *), void *args, - std::uint64_t voidStarSize, - std::uint64_t resultOffset, - const std::vector &rawArgs) { +KernelThunkResultType quantum_platform::launchKernel( + std::string kernelName, KernelThunkType kernelFunc, void *args, + std::uint64_t voidStarSize, std::uint64_t resultOffset, + const std::vector &rawArgs) { std::size_t qpu_id = 0; auto tid = std::hash{}(std::this_thread::get_id()); @@ -164,8 +161,8 @@ void quantum_platform::launchKernel(std::string kernelName, qpu_id = iter->second; auto &qpu = platformQPUs[qpu_id]; - qpu->launchKernel(kernelName, kernelFunc, args, voidStarSize, resultOffset, - rawArgs); + return qpu->launchKernel(kernelName, kernelFunc, args, voidStarSize, + resultOffset, rawArgs); } void quantum_platform::launchKernel(std::string kernelName, @@ -183,7 +180,7 @@ void quantum_platform::launchKernel(std::string kernelName, void quantum_platform::launchSerializedCodeExecution( const std::string &name, - cudaq::SerializedCodeExecutionContext &serializeCodeExecutionObject) { + SerializedCodeExecutionContext &serializeCodeExecutionObject) { std::size_t qpu_id = 0; auto tid = std::hash{}(std::this_thread::get_id()); @@ -208,37 +205,46 @@ std::ostream *quantum_platform::getLogStream() { return platformLogStream; } void quantum_platform::setLogStream(std::ostream &logStream) { platformLogStream = &logStream; } -} // namespace cudaq -void cudaq::altLaunchKernel(const char *kernelName, void (*kernelFunc)(void *), - void *kernelArgs, std::uint64_t argsSize, - std::uint64_t resultOffset) { +KernelThunkResultType altLaunchKernel(const char *kernelName, + KernelThunkType kernelFunc, + void *kernelArgs, std::uint64_t argsSize, + std::uint64_t resultOffset) { ScopedTraceWithContext("altLaunchKernel", kernelName, argsSize); - auto &platform = *cudaq::getQuantumPlatformInternal(); + auto &platform = *getQuantumPlatformInternal(); std::string kernName = kernelName; - platform.launchKernel(kernName, kernelFunc, kernelArgs, argsSize, - resultOffset, {}); + return platform.launchKernel(kernName, kernelFunc, kernelArgs, argsSize, + resultOffset, {}); } -void cudaq::streamlinedLaunchKernel(const char *kernelName, - const std::vector &rawArgs) { +KernelThunkResultType +streamlinedLaunchKernel(const char *kernelName, + const std::vector &rawArgs) { std::size_t argsSize = rawArgs.size(); ScopedTraceWithContext("streamlinedLaunchKernel", kernelName, argsSize); - auto &platform = *cudaq::getQuantumPlatformInternal(); + auto &platform = *getQuantumPlatformInternal(); std::string kernName = kernelName; platform.launchKernel(kernName, rawArgs); + // NB: The streamlined 
launch will never return results. Use alt or hybrid if + // the kernel returns results. + return {}; } -void cudaq::hybridLaunchKernel(const char *kernelName, void (*kernel)(void *), - void *args, std::uint64_t argsSize, - std::uint64_t resultOffset, - const std::vector &rawArgs) { +KernelThunkResultType hybridLaunchKernel(const char *kernelName, + KernelThunkType kernel, void *args, + std::uint64_t argsSize, + std::uint64_t resultOffset, + const std::vector &rawArgs) { ScopedTraceWithContext("hybridLaunchKernel", kernelName); - auto &platform = *cudaq::getQuantumPlatformInternal(); + auto &platform = *getQuantumPlatformInternal(); const std::string kernName = kernelName; - if (platform.is_remote(platform.get_current_qpu())) + if (platform.is_remote(platform.get_current_qpu())) { + // This path should never call a kernel that returns results. platform.launchKernel(kernName, rawArgs); - else - platform.launchKernel(kernName, kernel, args, argsSize, resultOffset, - rawArgs); + return {}; + } + return platform.launchKernel(kernName, kernel, args, argsSize, resultOffset, + rawArgs); } + +} // namespace cudaq diff --git a/runtime/cudaq/platform/quantum_platform.h b/runtime/cudaq/platform/quantum_platform.h index e9598bf051..e16071890a 100644 --- a/runtime/cudaq/platform/quantum_platform.h +++ b/runtime/cudaq/platform/quantum_platform.h @@ -11,6 +11,7 @@ #include "common/ExecutionContext.h" #include "common/NoiseModel.h" #include "common/ObserveResult.h" +#include "common/ThunkInterface.h" #include "cudaq/remote_capabilities.h" #include "cudaq/utils/cudaq_utils.h" #include @@ -142,10 +143,10 @@ class quantum_platform { // This method is the hook for the kernel rewrites to invoke // quantum kernels. - void launchKernel(std::string kernelName, void (*kernelFunc)(void *), - void *args, std::uint64_t voidStarSize, - std::uint64_t resultOffset, - const std::vector &rawArgs); + [[nodiscard]] KernelThunkResultType + launchKernel(std::string kernelName, KernelThunkType kernelFunc, void *args, + std::uint64_t voidStarSize, std::uint64_t resultOffset, + const std::vector &rawArgs); void launchKernel(std::string kernelName, const std::vector &); // This method is the hook for executing SerializedCodeExecutionContext @@ -215,19 +216,21 @@ class quantum_platform { /// provide that information. extern "C" { // Client-server (legacy) interface. -void altLaunchKernel(const char *kernelName, void (*kernel)(void *), void *args, - std::uint64_t argsSize, std::uint64_t resultOffset); +[[nodiscard]] KernelThunkResultType +altLaunchKernel(const char *kernelName, KernelThunkType kernel, void *args, + std::uint64_t argsSize, std::uint64_t resultOffset); // Streamlined interface for launching kernels. Argument synthesis and JIT // compilation *must* happen on the local machine. -void streamlinedLaunchKernel(const char *kernelName, - const std::vector &rawArgs); +[[nodiscard]] KernelThunkResultType +streamlinedLaunchKernel(const char *kernelName, + const std::vector &rawArgs); // Hybrid of the client-server and streamlined approaches. Letting JIT // compilation happen either early or late and can handle return values from // each kernel launch. 
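For callers that do consume these spans, the host stubs emitted by GenKernelExecution select between the two possible result locations. A hand-written C++ analogue of that decode step (hypothetical helper, modeled on the `decodeLaunchResults` lambda earlier in this diff):

```cpp
#include <cstdint>
#include <cstdlib>

// Illustrative mirror of cudaq::KernelThunkResultType.
struct LaunchSpan {
  void *data_buffer;
  std::uint64_t size;
};

// The kernel's result slot lives either in the malloc'ed dynamic
// buffer returned by the launch, or in the original argument buffer
// when no dynamic buffer came back.
char *selectResultSlot(LaunchSpan span, char *argsBuffer,
                       std::uint64_t resultOffset) {
  if (auto *rptr = static_cast<char *>(span.data_buffer))
    return rptr + resultOffset;     // dynamic: caller must free rptr later
  return argsBuffer + resultOffset; // static: written in place
}
```

Once the `std::vector` has been materialized from the slot, the stub frees the dynamic buffer; since `free(nullptr)` is a no-op, the static path needs no special casing.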
-void hybridLaunchKernel(const char *kernelName, void (*kernel)(void *), - void *args, std::uint64_t argsSize, - std::uint64_t resultOffset, - const std::vector &rawArgs); +[[nodiscard]] KernelThunkResultType +hybridLaunchKernel(const char *kernelName, KernelThunkType kernel, void *args, + std::uint64_t argsSize, std::uint64_t resultOffset, + const std::vector &rawArgs); } } // namespace cudaq diff --git a/runtime/cudaq/qis/remote_state.cpp b/runtime/cudaq/qis/remote_state.cpp index 713a462e46..97f907027a 100644 --- a/runtime/cudaq/qis/remote_state.cpp +++ b/runtime/cudaq/qis/remote_state.cpp @@ -184,7 +184,8 @@ RemoteSimulationState::overlap(const cudaq::SimulationState &other) { std::make_pair(static_cast(this), static_cast(&otherState)); platform.set_exec_ctx(&context); - platform.launchKernel(kernelName, nullptr, nullptr, 0, 0, {}); + [[maybe_unused]] auto dynamicResult = + platform.launchKernel(kernelName, nullptr, nullptr, 0, 0, {}); platform.reset_exec_ctx(); assert(context.overlapResult.has_value()); return context.overlapResult.value(); diff --git a/targettests/execution/vector_result.cpp b/targettests/execution/vector_result.cpp new file mode 100644 index 0000000000..ecc09fce5c --- /dev/null +++ b/targettests/execution/vector_result.cpp @@ -0,0 +1,52 @@ +/******************************************************************************* + * Copyright (c) 2022 - 2024 NVIDIA Corporation & Affiliates. * + * All rights reserved. * + * * + * This source code and the accompanying materials are made available under * + * the terms of the Apache License 2.0 which accompanies this distribution. * + ******************************************************************************/ + +// RUN: nvq++ %cpp_std --enable-mlir %s -o %t && %t + +#include "cudaq.h" +#include + +struct VectorBoolResult { + std::vector operator()() __qpu__ { + std::vector result(3); + result[0] = true; + result[1] = false; + result[2] = true; + return result; + } +}; + +struct VectorIntResult { + std::vector operator()() __qpu__ { + std::vector result(2); + result[0] = 42; + result[1] = -23479; + return result; + } +}; + +struct VectorDoubleResult { + std::vector operator()() __qpu__ { + std::vector result(2); + result[0] = 543.0; + result[1] = -234234.0; + return result; + } +}; + +int main() { + auto retb{VectorBoolResult{}()}; + printf("%d %d %d\n", static_cast(retb[0]), static_cast(retb[1]), + static_cast(retb[2])); + auto ret = VectorIntResult{}(); + printf("%d %d\n", ret[0], ret[1]); + std::vector retd{VectorDoubleResult{}()}; + printf("%f %f\n", retd[0], retd[1]); + return !(retb[0] && !retb[1] && retb[2] && ret[0] == 42 && ret[1] == -23479 && + retd[0] == 543.0 && retd[1] == -234234.0); +} diff --git a/test/Quake-QIR/argument.qke b/test/Quake-QIR/argument.qke index 61d737d5ce..6a3532805a 100644 --- a/test/Quake-QIR/argument.qke +++ b/test/Quake-QIR/argument.qke @@ -55,7 +55,7 @@ func.func @test_0(%0: !cc.ptr, %1: !cc.ptr, %1: !cc.ptr, %1: !cc.ptr, %1: !cc.ptr, !cc.ptr, !cc.ptr} // CHECK: %[[VAL_4:.*]] = bitcast { i32, { i1*, i64 } }* %[[VAL_3]] to i8* // CHECK: %[[VAL_5:.*]] = getelementptr inbounds { i32, { i1*, i64 } }, { i32, { i1*, i64 } }* %[[VAL_3]], i64 0, i32 0 // CHECK: store i32 %[[VAL_2]], i32* %[[VAL_5]], align 8 -// CHECK: call void @altLaunchKernel(i8* nonnull getelementptr inbounds ([7 x i8], [7 x i8]* @test_0.kernelName, i64 0, i64 0), i8* nonnull bitcast ({ i8*, i64 } (i8*, i1)* @test_0.thunk to i8*), i8* nonnull %[[VAL_4]], i64 24, i64 8) -// CHECK: %[[VAL_6:.*]] = getelementptr inbounds { i32, { 
i1*, i64 } }, { i32, { i1*, i64 } }* %[[VAL_3]], i64 0, i32 1, i32 0 -// CHECK: %[[VAL_7:.*]] = bitcast i1** %[[VAL_6]] to i8** -// CHECK: %[[VAL_8:.*]] = load i8*, i8** %[[VAL_7]], align 8 -// CHECK: %[[VAL_9:.*]] = getelementptr inbounds { i32, { i1*, i64 } }, { i32, { i1*, i64 } }* %[[VAL_3]], i64 0, i32 1, i32 1 -// CHECK: %[[VAL_10:.*]] = load i64, i64* %[[VAL_9]], align 8 -// CHECK: %[[VAL_11:.*]] = bitcast { i8*, i8*, i8* }* %[[VAL_0]] to i8* -// CHECK: call void @__nvqpp_initializer_list_to_vector_bool(i8* %[[VAL_11]], i8* %[[VAL_8]], i64 %[[VAL_10]]) +// CHECK: %[[VAL_6:.*]] = call { i8*, i64 } @altLaunchKernel(i8* nonnull getelementptr inbounds ([7 x i8], [7 x i8]* @test_0.kernelName, i64 0, i64 0), i8* nonnull bitcast ({ i8*, i64 } (i8*, i1)* @test_0.thunk to i8*), i8* nonnull %[[VAL_4]], i64 24, i64 8) +// CHECK: %[[VAL_7:.*]] = extractvalue { i8*, i64 } %[[VAL_6]], 0 +// CHECK: %[[VAL_8:.*]] = icmp eq i8* %[[VAL_7]], null +// CHECK: %[[VAL_9:.*]] = getelementptr i8, i8* %[[VAL_7]], i64 8 +// CHECK: %[[VAL_10:.*]] = bitcast i8* %[[VAL_9]] to { i1*, i64 }* +// CHECK: %[[VAL_11:.*]] = getelementptr inbounds { i32, { i1*, i64 } }, { i32, { i1*, i64 } }* %[[VAL_3]], i64 0, i32 1 +// CHECK: %[[VAL_12:.*]] = select i1 %[[VAL_8]], { i1*, i64 }* %[[VAL_11]], { i1*, i64 }* %[[VAL_10]] +// CHECK: %[[VAL_13:.*]] = bitcast { i1*, i64 }* %[[VAL_12]] to i8** +// CHECK: %[[VAL_14:.*]] = load i8*, i8** %[[VAL_13]], align 8 +// CHECK: %[[VAL_15:.*]] = getelementptr inbounds { i32, { i1*, i64 } }, { i32, { i1*, i64 } }* %[[VAL_3]], i64 0, i32 1, i32 1 +// CHECK: %[[VAL_16:.*]] = getelementptr i8, i8* %[[VAL_7]], i64 16 +// CHECK: %[[VAL_17:.*]] = bitcast i8* %[[VAL_16]] to i64* +// CHECK: %[[VAL_18:.*]] = select i1 %[[VAL_8]], i64* %[[VAL_15]], i64* %[[VAL_17]] +// CHECK: %[[VAL_19:.*]] = load i64, i64* %[[VAL_18]], align 4 +// CHECK: %[[VAL_20:.*]] = bitcast { i8*, i8*, i8* }* %[[VAL_0]] to i8* +// CHECK: call void @__nvqpp_initializer_list_to_vector_bool(i8* %[[VAL_20]], i8* %[[VAL_14]], i64 %[[VAL_19]]) +// CHECK: call void @free(i8* %[[VAL_7]]) // CHECK: ret void // CHECK: } @@ -169,7 +178,7 @@ func.func @test_1(%this: !cc.ptr) -> i16 { // CHECK-SAME: %[[VAL_0:.*]]) {{.*}}{ // CHECK-NEXT: %[[VAL_2:.*]] = alloca i16, align 8 // CHECK: %[[VAL_3:.*]] = bitcast i16* %[[VAL_2]] to i8* -// CHECK: call void @altLaunchKernel(i8* nonnull getelementptr inbounds ([7 x i8], [7 x i8]* @test_1.kernelName, i64 0, i64 0), i8* nonnull bitcast ({ i8*, i64 } (i8*, i1)* @test_1.thunk to i8*), i8* nonnull %[[VAL_3]], i64 2, i64 0) +// CHECK: call { i8*, i64 } @altLaunchKernel(i8* nonnull getelementptr inbounds ([7 x i8], [7 x i8]* @test_1.kernelName, i64 0, i64 0), i8* nonnull bitcast ({ i8*, i64 } (i8*, i1)* @test_1.thunk to i8*), i8* nonnull %[[VAL_3]], i64 2, i64 0) // CHECK: %[[VAL_4:.*]] = load i16, i16* %[[VAL_2]], align 8 // CHECK: ret i16 %[[VAL_4]] // CHECK: } @@ -200,7 +209,7 @@ func.func @test_2(%1: !cc.ptr> {llvm.sret = !cc // CHECK-SAME: %[[VAL_0:.*]], i8* nocapture readnone %[[VAL_1:.*]]) {{.*}}{ // CHECK: %[[VAL_2:.*]] = alloca { { i16, float, double, i64 } }, align 8 // CHECK: %[[VAL_3:.*]] = bitcast { { i16, float, double, i64 } }* %[[VAL_2]] to i8* -// CHECK: call void @altLaunchKernel(i8* nonnull getelementptr inbounds ([7 x i8], [7 x i8]* @test_2.kernelName, i64 0, i64 0), i8* nonnull bitcast ({ i8*, i64 } (i8*, i1)* @test_2.thunk to i8*), i8* nonnull %[[VAL_3]], i64 24, i64 0) +// CHECK: call { i8*, i64 } @altLaunchKernel(i8* nonnull getelementptr inbounds ([7 x i8], [7 x i8]* 
@test_2.kernelName, i64 0, i64 0), i8* nonnull bitcast ({ i8*, i64 } (i8*, i1)* @test_2.thunk to i8*), i8* nonnull %[[VAL_3]], i64 24, i64 0) // CHECK: %[[VAL_4:.*]] = bitcast { i16, float, double, i64 }* %[[VAL_0]] to i8* // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* noundef nonnull align 8 dereferenceable(24) %[[VAL_4]], i8* noundef nonnull align 8 dereferenceable(24) %[[VAL_3]], i64 24, i1 false) // CHECK: ret void @@ -234,7 +243,7 @@ func.func @test_3(%1: !cc.ptr> {llvm.sret = !cc.array> {llvm.sret = !cc.struct // CHECK-SAME: %[[VAL_0:.*]], i8* nocapture readnone %[[VAL_1:.*]]) {{.*}}{ // CHECK: %[[VAL_2:.*]] = alloca { i64, double }, align 8 // CHECK: %[[VAL_3:.*]] = bitcast { i64, double }* %[[VAL_2]] to i8* -// CHECK: call void @altLaunchKernel(i8* nonnull getelementptr inbounds ([7 x i8], [7 x i8]* @test_4.kernelName, i64 0, i64 0), i8* nonnull bitcast ({ i8*, i64 } (i8*, i1)* @test_4.thunk to i8*), i8* nonnull %[[VAL_3]], i64 16, i64 0) +// CHECK: call { i8*, i64 } @altLaunchKernel(i8* nonnull getelementptr inbounds ([7 x i8], [7 x i8]* @test_4.kernelName, i64 0, i64 0), i8* nonnull bitcast ({ i8*, i64 } (i8*, i1)* @test_4.thunk to i8*), i8* nonnull %[[VAL_3]], i64 16, i64 0) // CHECK: %[[VAL_4:.*]] = bitcast { i64, double }* %[[VAL_0]] to i8* // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* noundef nonnull align 8 dereferenceable(16) %[[VAL_4]], i8* noundef nonnull align 8 dereferenceable(16) %[[VAL_3]], i64 16, i1 false) // CHECK: ret void @@ -283,7 +292,7 @@ func.func @test_5(%sret: !cc.ptr> {llvm.sret = !cc.struct // CHECK-SAME: %[[VAL_0:.*]]) {{.*}}{ // CHECK: %[[VAL_1:.*]] = alloca { i64, double }, align 8 // CHECK: %[[VAL_2:.*]] = bitcast { i64, double }* %[[VAL_1]] to i8* -// CHECK: call void @altLaunchKernel(i8* nonnull getelementptr inbounds ([7 x i8], [7 x i8]* @test_5.kernelName, i64 0, i64 0), i8* nonnull bitcast ({ i8*, i64 } (i8*, i1)* @test_5.thunk to i8*), i8* nonnull %[[VAL_2]], i64 16, i64 0) +// CHECK: call { i8*, i64 } @altLaunchKernel(i8* nonnull getelementptr inbounds ([7 x i8], [7 x i8]* @test_5.kernelName, i64 0, i64 0), i8* nonnull bitcast ({ i8*, i64 } (i8*, i1)* @test_5.thunk to i8*), i8* nonnull %[[VAL_2]], i64 16, i64 0) // CHECK: %[[VAL_3:.*]] = bitcast { i64, double }* %[[VAL_0]] to i8* // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* noundef nonnull align 8 dereferenceable(16) %[[VAL_3]], i8* noundef nonnull align 8 dereferenceable(16) %[[VAL_2]], i64 16, i1 false) // CHECK: ret void diff --git a/test/Quake/kernel_exec-1.qke b/test/Quake/kernel_exec-1.qke index 751ba66a10..37ac7c7229 100644 --- a/test/Quake/kernel_exec-1.qke +++ b/test/Quake/kernel_exec-1.qke @@ -90,25 +90,38 @@ module attributes {quake.mangled_name_map = { // CHECK-DAG: %[[VAL_2:.*]] = cc.undef !cc.struct<{i32, f64}> // CHECK-DAG: %[[VAL_3:.*]] = arith.constant 0 : i64 // CHECK: %[[VAL_4:.*]] = cc.insert_value %[[VAL_1]], %[[VAL_2]][0] : (!cc.struct<{i32, f64}>, i32) -> !cc.struct<{i32, f64}> -// CHECK: %[[VAL_7:.*]] = cc.sizeof !cc.struct<{i32, f64}> : i64 -// CHECK: %[[VAL_8:.*]] = arith.addi %[[VAL_7]], %[[VAL_3]] : i64 -// CHECK: %[[VAL_9:.*]] = cc.alloca i8[%[[VAL_8]] : i64] -// CHECK: %[[VAL_10:.*]] = cc.cast %[[VAL_9]] : (!cc.ptr>) -> !cc.ptr> -// CHECK: cc.store %[[VAL_4]], %[[VAL_10]] : !cc.ptr> -// CHECK: %[[VAL_11:.*]] = cc.cast %[[VAL_9]] : (!cc.ptr>) -> !cc.ptr x ?>> -// CHECK: %[[VAL_13:.*]] = constant @ghz.thunk : (!cc.ptr, i1) -> !cc.struct<{!cc.ptr, i64}> -// CHECK: %[[VAL_15:.*]] = cc.func_ptr %[[VAL_13]] : ((!cc.ptr, i1) -> !cc.struct<{!cc.ptr, i64}>) 
-> !cc.ptr -// CHECK: %[[VAL_16:.*]] = cc.cast %[[VAL_11]] : (!cc.ptr x ?>>) -> !cc.ptr -// CHECK: %[[VAL_18:.*]] = cc.offsetof !cc.struct<{i32, f64}> [1] : i64 -// CHECK: %[[VAL_12:.*]] = llvm.mlir.addressof @ghz.kernelName : !llvm.ptr> -// CHECK: %[[VAL_14:.*]] = cc.cast %[[VAL_12]] : (!llvm.ptr>) -> !cc.ptr -// CHECK: call @altLaunchKernel(%[[VAL_14]], %[[VAL_15]], %[[VAL_16]], %[[VAL_8]], %[[VAL_18]]) : (!cc.ptr, !cc.ptr, !cc.ptr, i64, i64) -> () -// CHECK: %[[VAL_19:.*]] = cc.compute_ptr %[[VAL_11]][0, 1] : (!cc.ptr x ?>>) -> !cc.ptr -// CHECK: %[[VAL_20:.*]] = cc.load %[[VAL_19]] : !cc.ptr -// CHECK: return %[[VAL_20]] : f64 +// CHECK: %[[VAL_5:.*]] = cc.sizeof !cc.struct<{i32, f64}> : i64 +// CHECK: %[[VAL_6:.*]] = arith.addi %[[VAL_5]], %[[VAL_3]] : i64 +// CHECK: %[[VAL_7:.*]] = cc.alloca i8{{\[}}%[[VAL_6]] : i64] +// CHECK: %[[VAL_8:.*]] = cc.cast %[[VAL_7]] : (!cc.ptr>) -> !cc.ptr> +// CHECK: cc.store %[[VAL_4]], %[[VAL_8]] : !cc.ptr> +// CHECK: %[[VAL_9:.*]] = cc.cast %[[VAL_7]] : (!cc.ptr>) -> !cc.ptr x ?>> +// CHECK: %[[VAL_10:.*]] = constant @ghz.thunk : (!cc.ptr, i1) -> !cc.struct<{!cc.ptr, i64}> +// CHECK: %[[VAL_11:.*]] = cc.func_ptr %[[VAL_10]] : ((!cc.ptr, i1) -> !cc.struct<{!cc.ptr, i64}>) -> !cc.ptr +// CHECK: %[[VAL_12:.*]] = cc.cast %[[VAL_9]] : (!cc.ptr x ?>>) -> !cc.ptr +// CHECK: %[[VAL_13:.*]] = cc.offsetof !cc.struct<{i32, f64}> [1] : i64 +// CHECK: %[[VAL_14:.*]] = llvm.mlir.addressof @ghz.kernelName : !llvm.ptr> +// CHECK: %[[VAL_15:.*]] = cc.cast %[[VAL_14]] : (!llvm.ptr>) -> !cc.ptr +// CHECK: %[[VAL_16:.*]] = call @altLaunchKernel(%[[VAL_15]], %[[VAL_11]], %[[VAL_12]], %[[VAL_6]], %[[VAL_13]]) : (!cc.ptr, !cc.ptr, !cc.ptr, i64, i64) -> !cc.struct<{!cc.ptr, i64}> +// CHECK: %[[VAL_17:.*]] = cc.extract_value %[[VAL_16]][0] : (!cc.struct<{!cc.ptr, i64}>) -> !cc.ptr +// CHECK: %[[VAL_18:.*]] = cc.cast %[[VAL_17]] : (!cc.ptr) -> i64 +// CHECK: %[[VAL_19:.*]] = arith.constant 0 : i64 +// CHECK: %[[VAL_20:.*]] = arith.cmpi ne, %[[VAL_18]], %[[VAL_19]] : i64 +// CHECK: cf.cond_br %[[VAL_20]], ^bb1, ^bb2 +// CHECK: ^bb1: +// CHECK: %[[VAL_21:.*]] = cc.cast %[[VAL_17]] : (!cc.ptr) -> !cc.ptr> +// CHECK: %[[VAL_22:.*]] = cc.compute_ptr %[[VAL_21]][1] : (!cc.ptr>) -> !cc.ptr +// CHECK: cf.br ^bb3(%[[VAL_22]] : !cc.ptr) +// CHECK: ^bb2: +// CHECK: %[[VAL_23:.*]] = cc.compute_ptr %[[VAL_9]][0, 1] : (!cc.ptr x ?>>) -> !cc.ptr +// CHECK: cf.br ^bb3(%[[VAL_23]] : !cc.ptr) +// CHECK: ^bb3(%[[VAL_24:.*]]: !cc.ptr): +// CHECK: %[[VAL_25:.*]] = cc.compute_ptr %[[VAL_9]][0, 1] : (!cc.ptr x ?>>) -> !cc.ptr +// CHECK: %[[VAL_26:.*]] = cc.load %[[VAL_25]] : !cc.ptr +// CHECK: return %[[VAL_26]] : f64 // CHECK: } -// CHECK: func.func private @altLaunchKernel(!cc.ptr, !cc.ptr, !cc.ptr, i64, i64) +// CHECK: func.func private @altLaunchKernel(!cc.ptr, !cc.ptr, !cc.ptr, i64, i64) -> !cc.struct<{!cc.ptr, i64}> // CHECK: func.func private @cudaqRegisterKernelName(!cc.ptr) // CHECK: llvm.mlir.global external constant @ghz.kernelName("ghz\00") {addr_space = 0 : i32} @@ -192,39 +205,52 @@ module attributes {quake.mangled_name_map = { // HYBRID: %[[VAL_2:.*]] = cc.undef !cc.struct<{i32, f64}> // HYBRID: %[[VAL_3:.*]] = arith.constant 0 : i64 // HYBRID: %[[VAL_4:.*]] = cc.insert_value %[[VAL_1]], %[[VAL_2]][0] : (!cc.struct<{i32, f64}>, i32) -> !cc.struct<{i32, f64}> -// HYBRID: %[[VAL_6:.*]] = cc.sizeof !cc.struct<{i32, f64}> : i64 -// HYBRID: %[[VAL_7:.*]] = arith.addi %[[VAL_6]], %[[VAL_3]] : i64 -// HYBRID: %[[VAL_8:.*]] = cc.alloca i8{{\[}}%[[VAL_7]] : i64] -// HYBRID: %[[VAL_9:.*]] = 
cc.cast %[[VAL_8]] : (!cc.ptr>) -> !cc.ptr>
-// HYBRID: cc.store %[[VAL_4]], %[[VAL_9]] : !cc.ptr>
-// HYBRID: %[[VAL_10:.*]] = cc.cast %[[VAL_8]] : (!cc.ptr>) -> !cc.ptr x ?>>
-// HYBRID: %[[VAL_11:.*]] = constant @ghz.thunk : (!cc.ptr, i1) -> !cc.struct<{!cc.ptr, i64}>
-// HYBRID: %[[VAL_12:.*]] = cc.func_ptr %[[VAL_11]] : ((!cc.ptr, i1) -> !cc.struct<{!cc.ptr, i64}>) -> !cc.ptr
-// HYBRID: %[[VAL_13:.*]] = cc.cast %[[VAL_10]] : (!cc.ptr x ?>>) -> !cc.ptr
-// HYBRID: %[[VAL_15:.*]] = cc.offsetof !cc.struct<{i32, f64}> [1] : i64
-// HYBRID: %[[VAL_16:.*]] = cc.alloca !cc.struct<{!cc.ptr>, !cc.ptr>, !cc.ptr>}>
-// HYBRID: %[[VAL_17:.*]] = cc.alloca !cc.array x 1>
-// HYBRID: %[[VAL_18:.*]] = cc.sizeof !cc.array x 1> : i64
-// HYBRID: %[[VAL_19:.*]] = cc.cast %[[VAL_17]] : (!cc.ptr x 1>>) -> !cc.ptr>
-// HYBRID: %[[VAL_20:.*]] = cc.cast %[[VAL_16]] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr>>
-// HYBRID: cc.store %[[VAL_19]], %[[VAL_20]] : !cc.ptr>>
-// HYBRID: %[[VAL_21:.*]] = cc.cast %[[VAL_17]] : (!cc.ptr x 1>>) -> i64
-// HYBRID: %[[VAL_22:.*]] = arith.addi %[[VAL_21]], %[[VAL_18]] : i64
-// HYBRID: %[[VAL_23:.*]] = cc.cast %[[VAL_22]] : (i64) -> !cc.ptr>
-// HYBRID: %[[VAL_24:.*]] = cc.compute_ptr %[[VAL_16]][1] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr>>
-// HYBRID: cc.store %[[VAL_23]], %[[VAL_24]] : !cc.ptr>>
-// HYBRID: %[[VAL_25:.*]] = cc.compute_ptr %[[VAL_16]][2] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr>>
-// HYBRID: cc.store %[[VAL_23]], %[[VAL_25]] : !cc.ptr>>
-// HYBRID: %[[VAL_26:.*]] = cc.compute_ptr %[[VAL_17]][0] : (!cc.ptr x 1>>) -> !cc.ptr>
-// HYBRID: %[[VAL_27:.*]] = cc.alloca i32
-// HYBRID: cc.store %[[VAL_1]], %[[VAL_27]] : !cc.ptr
-// HYBRID: %[[VAL_28:.*]] = cc.cast %[[VAL_27]] : (!cc.ptr) -> !cc.ptr
-// HYBRID: cc.store %[[VAL_28]], %[[VAL_26]] : !cc.ptr>
-// HYBRID: %[[VAL_29:.*]] = cc.cast %[[VAL_16]] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr
-// HYBRID: %[[VAL_30:.*]] = llvm.mlir.addressof @ghz.kernelName : !llvm.ptr>
-// HYBRID: %[[VAL_31:.*]] = cc.cast %[[VAL_30]] : (!llvm.ptr>) -> !cc.ptr
-// HYBRID: call @hybridLaunchKernel(%[[VAL_31]], %[[VAL_12]], %[[VAL_13]], %[[VAL_7]], %[[VAL_15]], %[[VAL_29]]) : (!cc.ptr, !cc.ptr, !cc.ptr, i64, i64, !cc.ptr) -> ()
-// HYBRID: %[[VAL_32:.*]] = cc.compute_ptr %[[VAL_10]][0, 1] : (!cc.ptr x ?>>) -> !cc.ptr
-// HYBRID: %[[VAL_33:.*]] = cc.load %[[VAL_32]] : !cc.ptr
-// HYBRID: return %[[VAL_33]] : f64
+// HYBRID: %[[VAL_5:.*]] = cc.sizeof !cc.struct<{i32, f64}> : i64
+// HYBRID: %[[VAL_6:.*]] = arith.addi %[[VAL_5]], %[[VAL_3]] : i64
+// HYBRID: %[[VAL_7:.*]] = cc.alloca i8{{\[}}%[[VAL_6]] : i64]
+// HYBRID: %[[VAL_8:.*]] = cc.cast %[[VAL_7]] : (!cc.ptr>) -> !cc.ptr>
+// HYBRID: cc.store %[[VAL_4]], %[[VAL_8]] : !cc.ptr>
+// HYBRID: %[[VAL_9:.*]] = cc.cast %[[VAL_7]] : (!cc.ptr>) -> !cc.ptr x ?>>
+// HYBRID: %[[VAL_10:.*]] = constant @ghz.thunk : (!cc.ptr, i1) -> !cc.struct<{!cc.ptr, i64}>
+// HYBRID: %[[VAL_11:.*]] = cc.func_ptr %[[VAL_10]] : ((!cc.ptr, i1) -> !cc.struct<{!cc.ptr, i64}>) -> !cc.ptr
+// HYBRID: %[[VAL_12:.*]] = cc.cast %[[VAL_9]] : (!cc.ptr x ?>>) -> !cc.ptr
+// HYBRID: %[[VAL_13:.*]] = cc.offsetof !cc.struct<{i32, f64}> [1] : i64
+// HYBRID: %[[VAL_14:.*]] = cc.alloca !cc.struct<{!cc.ptr>, !cc.ptr>, !cc.ptr>}>
+// HYBRID: %[[VAL_15:.*]] = cc.alloca !cc.array x 1>
+// HYBRID: %[[VAL_16:.*]] = cc.sizeof !cc.array x 1> : i64
+// HYBRID: %[[VAL_17:.*]] = cc.cast %[[VAL_15]] : (!cc.ptr x 1>>) -> !cc.ptr>
+// HYBRID: %[[VAL_18:.*]] = cc.cast %[[VAL_14]] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr>>
+// HYBRID: cc.store %[[VAL_17]], %[[VAL_18]] : !cc.ptr>>
+// HYBRID: %[[VAL_19:.*]] = cc.cast %[[VAL_15]] : (!cc.ptr x 1>>) -> i64
+// HYBRID: %[[VAL_20:.*]] = arith.addi %[[VAL_19]], %[[VAL_16]] : i64
+// HYBRID: %[[VAL_21:.*]] = cc.cast %[[VAL_20]] : (i64) -> !cc.ptr>
+// HYBRID: %[[VAL_22:.*]] = cc.compute_ptr %[[VAL_14]][1] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr>>
+// HYBRID: cc.store %[[VAL_21]], %[[VAL_22]] : !cc.ptr>>
+// HYBRID: %[[VAL_23:.*]] = cc.compute_ptr %[[VAL_14]][2] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr>>
+// HYBRID: cc.store %[[VAL_21]], %[[VAL_23]] : !cc.ptr>>
+// HYBRID: %[[VAL_24:.*]] = cc.compute_ptr %[[VAL_15]][0] : (!cc.ptr x 1>>) -> !cc.ptr>
+// HYBRID: %[[VAL_25:.*]] = cc.alloca i32
+// HYBRID: cc.store %[[VAL_1]], %[[VAL_25]] : !cc.ptr
+// HYBRID: %[[VAL_26:.*]] = cc.cast %[[VAL_25]] : (!cc.ptr) -> !cc.ptr
+// HYBRID: cc.store %[[VAL_26]], %[[VAL_24]] : !cc.ptr>
+// HYBRID: %[[VAL_27:.*]] = cc.cast %[[VAL_14]] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr
+// HYBRID: %[[VAL_28:.*]] = llvm.mlir.addressof @ghz.kernelName : !llvm.ptr>
+// HYBRID: %[[VAL_29:.*]] = cc.cast %[[VAL_28]] : (!llvm.ptr>) -> !cc.ptr
+// HYBRID: %[[VAL_30:.*]] = call @hybridLaunchKernel(%[[VAL_29]], %[[VAL_11]], %[[VAL_12]], %[[VAL_6]], %[[VAL_13]], %[[VAL_27]]) : (!cc.ptr, !cc.ptr, !cc.ptr, i64, i64, !cc.ptr) -> !cc.struct<{!cc.ptr, i64}>
+// HYBRID: %[[VAL_31:.*]] = cc.extract_value %[[VAL_30]][0] : (!cc.struct<{!cc.ptr, i64}>) -> !cc.ptr
+// HYBRID: %[[VAL_32:.*]] = cc.cast %[[VAL_31]] : (!cc.ptr) -> i64
+// HYBRID: %[[VAL_33:.*]] = arith.constant 0 : i64
+// HYBRID: %[[VAL_34:.*]] = arith.cmpi ne, %[[VAL_32]], %[[VAL_33]] : i64
+// HYBRID: cf.cond_br %[[VAL_34]], ^bb1, ^bb2
+// HYBRID: ^bb1:
+// HYBRID: %[[VAL_35:.*]] = cc.cast %[[VAL_31]] : (!cc.ptr) -> !cc.ptr>
+// HYBRID: %[[VAL_36:.*]] = cc.compute_ptr %[[VAL_35]][1] : (!cc.ptr>) -> !cc.ptr
+// HYBRID: cf.br ^bb3(%[[VAL_36]] : !cc.ptr)
+// HYBRID: ^bb2:
+// HYBRID: %[[VAL_37:.*]] = cc.compute_ptr %[[VAL_9]][0, 1] : (!cc.ptr x ?>>) -> !cc.ptr
+// HYBRID: cf.br ^bb3(%[[VAL_37]] : !cc.ptr)
+// HYBRID: ^bb3(%[[VAL_38:.*]]: !cc.ptr):
+// HYBRID: %[[VAL_39:.*]] = cc.compute_ptr %[[VAL_9]][0, 1] : (!cc.ptr x ?>>) -> !cc.ptr
+// HYBRID: %[[VAL_40:.*]] = cc.load %[[VAL_39]] : !cc.ptr
+// HYBRID: return %[[VAL_40]] : f64
// HYBRID: }
diff --git a/test/Quake/kernel_exec-2.qke b/test/Quake/kernel_exec-2.qke
index 4e53513774..a9b04b8449 100644
--- a/test/Quake/kernel_exec-2.qke
+++ b/test/Quake/kernel_exec-2.qke
@@ -71,11 +71,11 @@ __nvqpp__mlirgen__function_cargo = "pants"}} {
// CHECK: %[[VAL_33:.*]] = arith.constant 2147483647 : i64
// CHECK: %[[VAL_28:.*]] = llvm.mlir.addressof @function_hawaiian.kernelName : !llvm.ptr>
// CHECK: %[[VAL_30:.*]] = cc.cast %[[VAL_28]] : (!llvm.ptr>) -> !cc.ptr
-// CHECK: call @altLaunchKernel(%[[VAL_30]], %[[VAL_31]], %[[VAL_32]], %[[VAL_17]], %[[VAL_33]]) : (!cc.ptr, !cc.ptr, !cc.ptr, i64, i64) -> ()
+// CHECK: call @altLaunchKernel(%[[VAL_30]], %[[VAL_31]], %[[VAL_32]], %[[VAL_17]], %[[VAL_33]]) : (!cc.ptr, !cc.ptr, !cc.ptr, i64, i64) -> !cc.struct<{!cc.ptr, i64}>
// CHECK: return
// CHECK: }
-// CHECK-DAG: func.func private @altLaunchKernel(!cc.ptr, !cc.ptr, !cc.ptr, i64, i64)
+// CHECK-DAG: func.func private @altLaunchKernel(!cc.ptr, !cc.ptr, !cc.ptr, i64, i64) -> !cc.struct<{!cc.ptr, i64}>
// CHECK-DAG: func.func private @cudaqRegisterKernelName(!cc.ptr)
// CHECK-DAG: func.func private @cudaqRegisterArgsCreator(!cc.ptr, !cc.ptr)
// CHECK-DAG: func.func private @malloc(i64) -> !cc.ptr
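For context before the next test file: the kernel_exec-2.qke expectations above track the launcher signature change, with `altLaunchKernel` now returning a `{ptr, i64}` span instead of `()`. Below is a minimal C++ sketch of how calling code might consume such a span; `dynamic_result` and `my_alt_launch` are hypothetical stand-ins for illustration only, not the CUDA-Q runtime API.

    // A hedged sketch, not the CUDA-Q runtime API: `dynamic_result` and
    // `my_alt_launch` are hypothetical stand-ins that mirror the MLIR
    // signature checked above:
    //   (!cc.ptr, !cc.ptr, !cc.ptr, i64, i64) -> !cc.struct<{!cc.ptr, i64}>
    #include <cstddef>
    #include <cstdlib>

    struct dynamic_result {
      void *data;       // non-null only when a dynamic result buffer exists
      std::size_t size; // size of that buffer in bytes
    };

    dynamic_result my_alt_launch(const char * /*kernelName*/,
                                 void (* /*thunk*/)(), void * /*args*/,
                                 std::size_t /*argsSize*/,
                                 std::size_t /*resultOffset*/) {
      return {nullptr, 0}; // stub: pretend no dynamic result was produced
    }

    void launch_and_release(const char *name, void (*thunk)(), void *args,
                            std::size_t size, std::size_t offset) {
      dynamic_result res = my_alt_launch(name, thunk, args, size, offset);
      if (res.data) {
        // ...copy the payload out of res.data before releasing it...
        std::free(res.data); // calling code owns any dynamic buffer
      }
      // Otherwise any (static) result still lives in the original buffer.
    }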
diff --git a/test/Quake/return_vector.qke b/test/Quake/return_vector.qke
index a13d0b6abe..90ccc90610 100644
--- a/test/Quake/return_vector.qke
+++ b/test/Quake/return_vector.qke
@@ -6,8 +6,8 @@
// the terms of the Apache License 2.0 which accompanies this distribution. //
// ========================================================================== //
-// RUN: cudaq-opt --add-dealloc --kernel-execution=codegen=1 --canonicalize %s | \
-// RUN: FileCheck %s
+// RUN: cudaq-opt --add-dealloc --kernel-execution=codegen=1 --canonicalize %s \
+// RUN: | FileCheck %s
// NB: the mangled name map is required for the kernel-execution pass.
module attributes{ quake.mangled_name_map = {
@@ -42,37 +42,48 @@ func.func @test_0(%0: !cc.ptr, !cc.ptr, !cc.ptr, %[[VAL_2:.*]]: i32) {
// CHECK: %[[VAL_3:.*]] = arith.constant 4 : i64
// CHECK: %[[VAL_4:.*]] = constant @test_0.thunk : (!cc.ptr, i1) -> !cc.struct<{!cc.ptr, i64}>
+// CHECK: %[[VAL_5:.*]] = arith.constant 0 : i64
// CHECK: %[[VAL_6:.*]] = cc.undef !cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}>
// CHECK: %[[VAL_7:.*]] = cc.insert_value %[[VAL_2]], %[[VAL_6]][0] : (!cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}>, i32) -> !cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}>
-// CHECK: %[[VAL_9:.*]] = cc.sizeof !cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}> : i64
-// CHECK: %[[VAL_10:.*]] = cc.alloca i8{{\[}}%[[VAL_9]] : i64]
-// CHECK: %[[VAL_11:.*]] = cc.cast %[[VAL_10]] : (!cc.ptr>) -> !cc.ptr, i64}>}>>
-// CHECK: cc.store %[[VAL_7]], %[[VAL_11]] : !cc.ptr, i64}>}>>
-// CHECK: %[[VAL_14:.*]] = cc.func_ptr %[[VAL_4]] : ((!cc.ptr, i1) -> !cc.struct<{!cc.ptr, i64}>) -> !cc.ptr
-// CHECK: %[[VAL_15:.*]] = cc.cast %[[VAL_10]] : (!cc.ptr>) -> !cc.ptr
-// CHECK: %[[VAL_17:.*]] = cc.offsetof !cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}> [1] : i64
-// CHECK: %[[VAL_12:.*]] = llvm.mlir.addressof @test_0.kernelName : !llvm.ptr>
-// CHECK: %[[VAL_13:.*]] = cc.cast %[[VAL_12]] : (!llvm.ptr>) -> !cc.ptr
-// CHECK: call @altLaunchKernel(%[[VAL_13]], %[[VAL_14]], %[[VAL_15]], %[[VAL_9]], %[[VAL_17]]) : (!cc.ptr, !cc.ptr, !cc.ptr, i64, i64) -> ()
-// CHECK: %[[VAL_18:.*]] = cc.cast %[[VAL_10]] : (!cc.ptr>) -> !cc.ptr, i64}>}>>
-// CHECK: %[[VAL_19:.*]] = cc.compute_ptr %[[VAL_18]][1, 0] : (!cc.ptr, i64}>}>>) -> !cc.ptr>
-// CHECK: %[[VAL_20:.*]] = cc.load %[[VAL_19]] : !cc.ptr>
-// CHECK: %[[VAL_21:.*]] = cc.cast %[[VAL_10]] : (!cc.ptr>) -> !cc.ptr, i64}>}>>
-// CHECK: %[[VAL_22:.*]] = cc.compute_ptr %[[VAL_21]][1, 1] : (!cc.ptr, i64}>}>>) -> !cc.ptr
-// CHECK: %[[VAL_23:.*]] = cc.load %[[VAL_22]] : !cc.ptr
-// CHECK: %[[VAL_24:.*]] = cc.cast %[[VAL_0]] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr, !cc.ptr, !cc.ptr}>>
-// CHECK: %[[VAL_25:.*]] = cc.cast %[[VAL_0]] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr>
-// CHECK: %[[VAL_26:.*]] = cc.load %[[VAL_25]] : !cc.ptr>
-// CHECK: call @free(%[[VAL_26]]) : (!cc.ptr) -> ()
-// CHECK: %[[VAL_27:.*]] = cc.cast %[[VAL_20]] : (!cc.ptr) -> !cc.ptr
-// CHECK: cc.store %[[VAL_27]], %[[VAL_25]] : !cc.ptr>
-// CHECK: %[[VAL_28:.*]] = cc.compute_ptr %[[VAL_24]][1] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr>
-// CHECK: %[[VAL_29:.*]] = arith.muli %[[VAL_23]], %[[VAL_3]] : i64
-// CHECK: %[[VAL_30:.*]] = cc.cast %[[VAL_20]] : (!cc.ptr) -> !cc.ptr>
-// CHECK: %[[VAL_31:.*]] = cc.compute_ptr %[[VAL_30]]{{\[}}%[[VAL_29]]] : (!cc.ptr>, i64) -> !cc.ptr
-// CHECK: cc.store %[[VAL_31]], %[[VAL_28]] : !cc.ptr>
-// CHECK: %[[VAL_32:.*]] = cc.compute_ptr %[[VAL_24]][2] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr>
-// CHECK: cc.store %[[VAL_31]], %[[VAL_32]] : !cc.ptr>
+// CHECK: %[[VAL_8:.*]] = cc.sizeof !cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}> : i64
+// CHECK: %[[VAL_9:.*]] = cc.alloca i8{{\[}}%[[VAL_8]] : i64]
+// CHECK: %[[VAL_10:.*]] = cc.cast %[[VAL_9]] : (!cc.ptr>) -> !cc.ptr, i64}>}>>
+// CHECK: cc.store %[[VAL_7]], %[[VAL_10]] : !cc.ptr, i64}>}>>
+// CHECK: %[[VAL_11:.*]] = cc.func_ptr %[[VAL_4]] : ((!cc.ptr, i1) -> !cc.struct<{!cc.ptr, i64}>) -> !cc.ptr
+// CHECK: %[[VAL_12:.*]] = cc.cast %[[VAL_9]] : (!cc.ptr>) -> !cc.ptr
+// CHECK: %[[VAL_13:.*]] = cc.offsetof !cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}> [1] : i64
+// CHECK: %[[VAL_14:.*]] = llvm.mlir.addressof @test_0.kernelName : !llvm.ptr>
+// CHECK: %[[VAL_15:.*]] = cc.cast %[[VAL_14]] : (!llvm.ptr>) -> !cc.ptr
+// CHECK: %[[VAL_16:.*]] = call @altLaunchKernel(%[[VAL_15]], %[[VAL_11]], %[[VAL_12]], %[[VAL_8]], %[[VAL_13]]) : (!cc.ptr, !cc.ptr, !cc.ptr, i64, i64) -> !cc.struct<{!cc.ptr, i64}>
+// CHECK: %[[VAL_17:.*]] = cc.extract_value %[[VAL_16]][0] : (!cc.struct<{!cc.ptr, i64}>) -> !cc.ptr
+// CHECK: %[[VAL_18:.*]] = cc.cast %[[VAL_17]] : (!cc.ptr) -> i64
+// CHECK: %[[VAL_19:.*]] = arith.cmpi ne, %[[VAL_18]], %[[VAL_5]] : i64
+// CHECK: cf.cond_br %[[VAL_19]], ^bb1, ^bb2
+// CHECK: ^bb1:
+// CHECK: %[[VAL_20:.*]] = cc.cast %[[VAL_17]] : (!cc.ptr) -> !cc.ptr, i64}>}>>
+// CHECK: %[[VAL_21:.*]] = cc.compute_ptr %[[VAL_20]][1] : (!cc.ptr, i64}>}>>) -> !cc.ptr, i64}>>
+// CHECK: cf.br ^bb3(%[[VAL_21]] : !cc.ptr, i64}>>)
+// CHECK: ^bb2:
+// CHECK: %[[VAL_22:.*]] = cc.cast %[[VAL_9]] : (!cc.ptr>) -> !cc.ptr, i64}>}>>
+// CHECK: %[[VAL_23:.*]] = cc.compute_ptr %[[VAL_22]][1] : (!cc.ptr, i64}>}>>) -> !cc.ptr, i64}>>
+// CHECK: cf.br ^bb3(%[[VAL_23]] : !cc.ptr, i64}>>)
+// CHECK: ^bb3(%[[VAL_24:.*]]: !cc.ptr, i64}>>):
+// CHECK: %[[VAL_25:.*]] = cc.cast %[[VAL_24]] : (!cc.ptr, i64}>>) -> !cc.ptr>
+// CHECK: %[[VAL_26:.*]] = cc.load %[[VAL_25]] : !cc.ptr>
+// CHECK: %[[VAL_27:.*]] = cc.compute_ptr %[[VAL_24]][1] : (!cc.ptr, i64}>>) -> !cc.ptr
+// CHECK: %[[VAL_28:.*]] = cc.load %[[VAL_27]] : !cc.ptr
+// CHECK: %[[VAL_29:.*]] = cc.cast %[[VAL_0]] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr, !cc.ptr, !cc.ptr}>>
+// CHECK: %[[VAL_30:.*]] = cc.cast %[[VAL_0]] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr>
+// CHECK: %[[VAL_31:.*]] = cc.cast %[[VAL_26]] : (!cc.ptr) -> !cc.ptr
+// CHECK: cc.store %[[VAL_31]], %[[VAL_30]] : !cc.ptr>
+// CHECK: %[[VAL_32:.*]] = cc.compute_ptr %[[VAL_29]][1] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr>
+// CHECK: %[[VAL_33:.*]] = arith.muli %[[VAL_28]], %[[VAL_3]] : i64
+// CHECK: %[[VAL_34:.*]] = cc.cast %[[VAL_26]] : (!cc.ptr) -> !cc.ptr>
+// CHECK: %[[VAL_35:.*]] = cc.compute_ptr %[[VAL_34]]{{\[}}%[[VAL_33]]] : (!cc.ptr>, i64) -> !cc.ptr
+// CHECK: cc.store %[[VAL_35]], %[[VAL_32]] : !cc.ptr>
+// CHECK: %[[VAL_36:.*]] = cc.compute_ptr %[[VAL_29]][2] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr>
+// CHECK: cc.store %[[VAL_35]], %[[VAL_36]] : !cc.ptr>
+// CHECK: call @free(%[[VAL_17]]) : (!cc.ptr) -> ()
// CHECK: return
// CHECK: }
@@ -102,37 +113,48 @@ func.func @test_1(%0: !cc.ptr, !cc.ptr, !cc.ptr, %[[VAL_2:.*]]: i32) {
// CHECK: %[[VAL_3:.*]] = arith.constant 8 : i64
// CHECK: %[[VAL_4:.*]] = constant @test_1.thunk : (!cc.ptr, i1) -> !cc.struct<{!cc.ptr, i64}>
+// CHECK: %[[VAL_5:.*]] = arith.constant 0 : i64
// CHECK: %[[VAL_6:.*]] = cc.undef !cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}>
// CHECK: %[[VAL_7:.*]] = cc.insert_value %[[VAL_2]], %[[VAL_6]][0] : (!cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}>, i32) -> !cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}>
-// CHECK: %[[VAL_9:.*]] = cc.sizeof !cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}> : i64
-// CHECK: %[[VAL_10:.*]] = cc.alloca i8{{\[}}%[[VAL_9]] : i64]
-// CHECK: %[[VAL_11:.*]] = cc.cast %[[VAL_10]] : (!cc.ptr>) -> !cc.ptr, i64}>}>>
-// CHECK: cc.store %[[VAL_7]], %[[VAL_11]] : !cc.ptr, i64}>}>>
-// CHECK: %[[VAL_14:.*]] = cc.func_ptr %[[VAL_4]] : ((!cc.ptr, i1) -> !cc.struct<{!cc.ptr, i64}>) -> !cc.ptr
-// CHECK: %[[VAL_15:.*]] = cc.cast %[[VAL_10]] : (!cc.ptr>) -> !cc.ptr
-// CHECK: %[[VAL_17:.*]] = cc.offsetof !cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}> [1] : i64
-// CHECK: %[[VAL_12:.*]] = llvm.mlir.addressof @test_1.kernelName : !llvm.ptr>
-// CHECK: %[[VAL_13:.*]] = cc.cast %[[VAL_12]] : (!llvm.ptr>) -> !cc.ptr
-// CHECK: call @altLaunchKernel(%[[VAL_13]], %[[VAL_14]], %[[VAL_15]], %[[VAL_9]], %[[VAL_17]]) : (!cc.ptr, !cc.ptr, !cc.ptr, i64, i64) -> ()
-// CHECK: %[[VAL_18:.*]] = cc.cast %[[VAL_10]] : (!cc.ptr>) -> !cc.ptr, i64}>}>>
-// CHECK: %[[VAL_19:.*]] = cc.compute_ptr %[[VAL_18]][1, 0] : (!cc.ptr, i64}>}>>) -> !cc.ptr>
-// CHECK: %[[VAL_20:.*]] = cc.load %[[VAL_19]] : !cc.ptr>
-// CHECK: %[[VAL_21:.*]] = cc.cast %[[VAL_10]] : (!cc.ptr>) -> !cc.ptr, i64}>}>>
-// CHECK: %[[VAL_22:.*]] = cc.compute_ptr %[[VAL_21]][1, 1] : (!cc.ptr, i64}>}>>) -> !cc.ptr
-// CHECK: %[[VAL_23:.*]] = cc.load %[[VAL_22]] : !cc.ptr
-// CHECK: %[[VAL_24:.*]] = cc.cast %[[VAL_0]] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr, !cc.ptr, !cc.ptr}>>
-// CHECK: %[[VAL_25:.*]] = cc.cast %[[VAL_0]] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr>
-// CHECK: %[[VAL_26:.*]] = cc.load %[[VAL_25]] : !cc.ptr>
-// CHECK: call @free(%[[VAL_26]]) : (!cc.ptr) -> ()
-// CHECK: %[[VAL_27:.*]] = cc.cast %[[VAL_20]] : (!cc.ptr) -> !cc.ptr
-// CHECK: cc.store %[[VAL_27]], %[[VAL_25]] : !cc.ptr>
-// CHECK: %[[VAL_28:.*]] = cc.compute_ptr %[[VAL_24]][1] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr>
-// CHECK: %[[VAL_29:.*]] = arith.muli %[[VAL_23]], %[[VAL_3]] : i64
-// CHECK: %[[VAL_30:.*]] = cc.cast %[[VAL_20]] : (!cc.ptr) -> !cc.ptr>
-// CHECK: %[[VAL_31:.*]] = cc.compute_ptr %[[VAL_30]]{{\[}}%[[VAL_29]]] : (!cc.ptr>, i64) -> !cc.ptr
-// CHECK: cc.store %[[VAL_31]], %[[VAL_28]] : !cc.ptr>
-// CHECK: %[[VAL_32:.*]] = cc.compute_ptr %[[VAL_24]][2] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr>
-// CHECK: cc.store %[[VAL_31]], %[[VAL_32]] : !cc.ptr>
+// CHECK: %[[VAL_8:.*]] = cc.sizeof !cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}> : i64
+// CHECK: %[[VAL_9:.*]] = cc.alloca i8{{\[}}%[[VAL_8]] : i64]
+// CHECK: %[[VAL_10:.*]] = cc.cast %[[VAL_9]] : (!cc.ptr>) -> !cc.ptr, i64}>}>>
+// CHECK: cc.store %[[VAL_7]], %[[VAL_10]] : !cc.ptr, i64}>}>>
+// CHECK: %[[VAL_11:.*]] = cc.func_ptr %[[VAL_4]] : ((!cc.ptr, i1) -> !cc.struct<{!cc.ptr, i64}>) -> !cc.ptr
+// CHECK: %[[VAL_12:.*]] = cc.cast %[[VAL_9]] : (!cc.ptr>) -> !cc.ptr
+// CHECK: %[[VAL_13:.*]] = cc.offsetof !cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}> [1] : i64
+// CHECK: %[[VAL_14:.*]] = llvm.mlir.addressof @test_1.kernelName : !llvm.ptr>
+// CHECK: %[[VAL_15:.*]] = cc.cast %[[VAL_14]] : (!llvm.ptr>) -> !cc.ptr
+// CHECK: %[[VAL_16:.*]] = call @altLaunchKernel(%[[VAL_15]], %[[VAL_11]], %[[VAL_12]], %[[VAL_8]], %[[VAL_13]]) : (!cc.ptr, !cc.ptr, !cc.ptr, i64, i64) -> !cc.struct<{!cc.ptr, i64}>
+// CHECK: %[[VAL_17:.*]] = cc.extract_value %[[VAL_16]][0] : (!cc.struct<{!cc.ptr, i64}>) -> !cc.ptr
+// CHECK: %[[VAL_18:.*]] = cc.cast %[[VAL_17]] : (!cc.ptr) -> i64
+// CHECK: %[[VAL_19:.*]] = arith.cmpi ne, %[[VAL_18]], %[[VAL_5]] : i64
+// CHECK: cf.cond_br %[[VAL_19]], ^bb1, ^bb2
+// CHECK: ^bb1:
+// CHECK: %[[VAL_20:.*]] = cc.cast %[[VAL_17]] : (!cc.ptr) -> !cc.ptr, i64}>}>>
+// CHECK: %[[VAL_21:.*]] = cc.compute_ptr %[[VAL_20]][1] : (!cc.ptr, i64}>}>>) -> !cc.ptr, i64}>>
+// CHECK: cf.br ^bb3(%[[VAL_21]] : !cc.ptr, i64}>>)
+// CHECK: ^bb2:
+// CHECK: %[[VAL_22:.*]] = cc.cast %[[VAL_9]] : (!cc.ptr>) -> !cc.ptr, i64}>}>>
+// CHECK: %[[VAL_23:.*]] = cc.compute_ptr %[[VAL_22]][1] : (!cc.ptr, i64}>}>>) -> !cc.ptr, i64}>>
+// CHECK: cf.br ^bb3(%[[VAL_23]] : !cc.ptr, i64}>>)
+// CHECK: ^bb3(%[[VAL_24:.*]]: !cc.ptr, i64}>>):
+// CHECK: %[[VAL_25:.*]] = cc.cast %[[VAL_24]] : (!cc.ptr, i64}>>) -> !cc.ptr>
+// CHECK: %[[VAL_26:.*]] = cc.load %[[VAL_25]] : !cc.ptr>
+// CHECK: %[[VAL_27:.*]] = cc.compute_ptr %[[VAL_24]][1] : (!cc.ptr, i64}>>) -> !cc.ptr
+// CHECK: %[[VAL_28:.*]] = cc.load %[[VAL_27]] : !cc.ptr
+// CHECK: %[[VAL_29:.*]] = cc.cast %[[VAL_0]] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr, !cc.ptr, !cc.ptr}>>
+// CHECK: %[[VAL_30:.*]] = cc.cast %[[VAL_0]] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr>
+// CHECK: %[[VAL_31:.*]] = cc.cast %[[VAL_26]] : (!cc.ptr) -> !cc.ptr
+// CHECK: cc.store %[[VAL_31]], %[[VAL_30]] : !cc.ptr>
+// CHECK: %[[VAL_32:.*]] = cc.compute_ptr %[[VAL_29]][1] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr>
+// CHECK: %[[VAL_33:.*]] = arith.muli %[[VAL_28]], %[[VAL_3]] : i64
+// CHECK: %[[VAL_34:.*]] = cc.cast %[[VAL_26]] : (!cc.ptr) -> !cc.ptr>
+// CHECK: %[[VAL_35:.*]] = cc.compute_ptr %[[VAL_34]]{{\[}}%[[VAL_33]]] : (!cc.ptr>, i64) -> !cc.ptr
+// CHECK: cc.store %[[VAL_35]], %[[VAL_32]] : !cc.ptr>
+// CHECK: %[[VAL_36:.*]] = cc.compute_ptr %[[VAL_29]][2] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr>
+// CHECK: cc.store %[[VAL_35]], %[[VAL_36]] : !cc.ptr>
+// CHECK: call @free(%[[VAL_17]]) : (!cc.ptr) -> ()
// CHECK: return
// CHECK: }
@@ -140,40 +162,42 @@ func.func @test_1(%0: !cc.ptr, !cc.ptr, !cc.ptr, %[[VAL_1:.*]]: i1) -> !cc.struct<{!cc.ptr, i64}> {
-// CHECK: %[[VAL_3:.*]] = cc.cast %[[VAL_0]] : (!cc.ptr) -> !cc.ptr, i64}>}>>
-// CHECK: %[[VAL_4:.*]] = cc.load %[[VAL_3]] : !cc.ptr, i64}>}>>
-// CHECK: %[[VAL_7:.*]] = cc.sizeof !cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}> : i64
-// CHECK: %[[VAL_10:.*]] = cc.extract_value %[[VAL_4]][0] : (!cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}>) -> i32
-// CHECK: %[[VAL_15:.*]] = call @__nvqpp__mlirgen__test_0(%[[VAL_10]]) : (i32) -> !cc.stdvec
+// CHECK: %[[VAL_2:.*]] = cc.cast %[[VAL_0]] : (!cc.ptr) -> !cc.ptr, i64}>}>>
+// CHECK: %[[VAL_3:.*]] = cc.load %[[VAL_2]] : !cc.ptr, i64}>}>>
+// CHECK: %[[VAL_4:.*]] = cc.sizeof !cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}> : i64
+// CHECK: %[[VAL_5:.*]] = cc.extract_value %[[VAL_3]][0] : (!cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}>) -> i32
+// CHECK: %[[VAL_6:.*]] = call @__nvqpp__mlirgen__test_0(%[[VAL_5]]) : (i32) -> !cc.stdvec
+// CHECK: %[[VAL_7:.*]] = cc.compute_ptr %[[VAL_2]][1] : (!cc.ptr, i64}>}>>) -> !cc.ptr, i64}>>
+// CHECK: %[[VAL_8:.*]] = cc.cast %[[VAL_7]] : (!cc.ptr, i64}>>) -> !cc.ptr>
+// CHECK: cc.store %[[VAL_6]], %[[VAL_8]] : !cc.ptr>
// CHECK: cf.cond_br %[[VAL_1]], ^bb1, ^bb2
// CHECK: ^bb1:
-// CHECK: %[[VAL_11:.*]] = cc.compute_ptr %[[VAL_3]][1] : (!cc.ptr, i64}>}>>) -> !cc.ptr, i64}>
-// CHECK: %[[VAL_16:.*]] = cc.cast %[[VAL_11]] : (!cc.ptr, i64}>>) -> !cc.ptr>
-// CHECK: cc.store %[[VAL_15]], %[[VAL_16]] : !cc.ptr>
-// CHECK: %[[VAL_12:.*]] = cc.cast %[[VAL_11]] : (!cc.ptr, i64}>>) -> !cc.ptr, i64}>>
-// CHECK: %[[VAL_13:.*]] = call @__nvqpp_createDynamicResult(%[[VAL_0]], %[[VAL_7]], %[[VAL_12]]) : (!cc.ptr, i64, !cc.ptr, i64}>>) -> !cc.struct<{!cc.ptr, i64}>
-// CHECK: return %[[VAL_13]] : !cc.struct<{!cc.ptr, i64}>
+// CHECK: %[[VAL_9:.*]] = cc.cast %[[VAL_7]] : (!cc.ptr, i64}>>) -> !cc.ptr, i64}>>
+// CHECK: %[[VAL_10:.*]] = cc.offsetof !cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}> [1] : i64
+// CHECK: %[[VAL_11:.*]] = call @__nvqpp_createDynamicResult(%[[VAL_0]], %[[VAL_4]], %[[VAL_9]], %[[VAL_10]]) : (!cc.ptr, i64, !cc.ptr, i64}>>, i64) -> !cc.struct<{!cc.ptr, i64}>
+// CHECK: return %[[VAL_11]] : !cc.struct<{!cc.ptr, i64}>
// CHECK: ^bb2:
-// CHECK: %[[VAL_14:.*]] = call @__nvqpp_zeroDynamicResult() : () -> !cc.struct<{!cc.ptr, i64}>
-// CHECK: return %[[VAL_14]] : !cc.struct<{!cc.ptr, i64}>
+// CHECK: %[[VAL_12:.*]] = call @__nvqpp_zeroDynamicResult() : () -> !cc.struct<{!cc.ptr, i64}>
+// CHECK: return %[[VAL_12]] : !cc.struct<{!cc.ptr, i64}>
// CHECK: }
// CHECK-LABEL: func.func @test_1.thunk(
// CHECK-SAME: %[[VAL_0:.*]]: !cc.ptr, %[[VAL_1:.*]]: i1) -> !cc.struct<{!cc.ptr, i64}> {
-// CHECK: %[[VAL_3:.*]] = cc.cast %[[VAL_0]] : (!cc.ptr) -> !cc.ptr, i64}>}>>
-// CHECK: %[[VAL_4:.*]] = cc.load %[[VAL_3]] : !cc.ptr, i64}>}>>
-// CHECK: %[[VAL_7:.*]] = cc.sizeof !cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}> : i64
-// CHECK: %[[VAL_10:.*]] = cc.extract_value %[[VAL_4]][0] : (!cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}>) -> i32
-// CHECK: %[[VAL_15:.*]] = call @__nvqpp__mlirgen__test_1(%[[VAL_10]]) : (i32) -> !cc.stdvec
+// CHECK: %[[VAL_2:.*]] = cc.cast %[[VAL_0]] : (!cc.ptr) -> !cc.ptr, i64}>}>>
+// CHECK: %[[VAL_3:.*]] = cc.load %[[VAL_2]] : !cc.ptr, i64}>}>>
+// CHECK: %[[VAL_4:.*]] = cc.sizeof !cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}> : i64
+// CHECK: %[[VAL_5:.*]] = cc.extract_value %[[VAL_3]][0] : (!cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}>) -> i32
+// CHECK: %[[VAL_6:.*]] = call @__nvqpp__mlirgen__test_1(%[[VAL_5]]) : (i32) -> !cc.stdvec
+// CHECK: %[[VAL_7:.*]] = cc.compute_ptr %[[VAL_2]][1] : (!cc.ptr, i64}>}>>) -> !cc.ptr, i64}>>
+// CHECK: %[[VAL_8:.*]] = cc.cast %[[VAL_7]] : (!cc.ptr, i64}>>) -> !cc.ptr>
+// CHECK: cc.store %[[VAL_6]], %[[VAL_8]] : !cc.ptr>
// CHECK: cf.cond_br %[[VAL_1]], ^bb1, ^bb2
// CHECK: ^bb1:
-// CHECK: %[[VAL_11:.*]] = cc.compute_ptr %[[VAL_3]][1] : (!cc.ptr, i64}>}>>) -> !cc.ptr, i64}>>
-// CHECK: %[[VAL_16:.*]] = cc.cast %[[VAL_11]] : (!cc.ptr, i64}>>) -> !cc.ptr>
-// CHECK: cc.store %[[VAL_15]], %[[VAL_16]] : !cc.ptr>
-// CHECK: %[[VAL_12:.*]] = cc.cast %[[VAL_11]] : (!cc.ptr, i64}>>) -> !cc.ptr, i64}>>
-// CHECK: %[[VAL_13:.*]] = call @__nvqpp_createDynamicResult(%[[VAL_0]], %[[VAL_7]], %[[VAL_12]]) : (!cc.ptr, i64, !cc.ptr, i64}>>) -> !cc.struct<{!cc.ptr, i64}>
-// CHECK: return %[[VAL_13]] : !cc.struct<{!cc.ptr, i64}>
+// CHECK: %[[VAL_9:.*]] = cc.cast %[[VAL_7]] : (!cc.ptr, i64}>>) -> !cc.ptr, i64}>>
+// CHECK: %[[VAL_10:.*]] = cc.offsetof !cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}> [1] : i64
+// CHECK: %[[VAL_11:.*]] = call @__nvqpp_createDynamicResult(%[[VAL_0]], %[[VAL_4]], %[[VAL_9]], %[[VAL_10]]) : (!cc.ptr, i64, !cc.ptr, i64}>>, i64) -> !cc.struct<{!cc.ptr, i64}>
+// CHECK: return %[[VAL_11]] : !cc.struct<{!cc.ptr, i64}>
// CHECK: ^bb2:
-// CHECK: %[[VAL_14:.*]] = call @__nvqpp_zeroDynamicResult() : () -> !cc.struct<{!cc.ptr, i64}>
-// CHECK: return %[[VAL_14]] : !cc.struct<{!cc.ptr, i64}>
+// CHECK: %[[VAL_12:.*]] = call @__nvqpp_zeroDynamicResult() : () -> !cc.struct<{!cc.ptr, i64}>
+// CHECK: return %[[VAL_12]] : !cc.struct<{!cc.ptr, i64}>
// CHECK: }
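Taken together, the new test_0/test_1 host-side expectations spell out an unpacking recipe: read the {ptr, i64} pair from the selected result slot, rewrite the sret vector's three pointers, and free whatever buffer the launcher handed back. A hedged C++ rendering of that recipe follows; the struct and function names are illustrative only, not runtime types.

    // Hedged sketch of the host-side unpacking the CHECK lines encode.
    #include <cstddef>
    #include <cstdlib>

    struct vec_span {     // the {ptr, i64} pair held in the result slot
      char *data;
      std::size_t count;  // element count; bytes = count * element size
    };

    struct vector_triple { // a std::vector<T> viewed as {begin, end, cap}
      char *begin;
      char *end;
      char *cap;
    };

    // `slot` is the selected result slot (the ^bb3 block argument above):
    // it points into the launcher's buffer when one was returned, else into
    // the original message buffer. `eleSize` is 4 (i32) for test_0 and
    // 8 (f64) for test_1.
    void unpack(vector_triple *sret, const vec_span *slot, std::size_t eleSize,
                void *launcherBuffer) {
      sret->begin = slot->data;
      sret->end = slot->data + slot->count * eleSize; // the arith.muli step
      sret->cap = sret->end;                          // end == capacity here
      // Unconditional release, mirroring the trailing `call @free(...)`;
      // on the purely local path the pointer is null and free is a no-op.
      std::free(launcherBuffer);
    }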