diff --git a/docker/build/assets.Dockerfile b/docker/build/assets.Dockerfile index f7d34e9268..73e20627ad 100644 --- a/docker/build/assets.Dockerfile +++ b/docker/build/assets.Dockerfile @@ -286,6 +286,7 @@ RUN cd /cuda-quantum && source scripts/configure_build.sh && \ # The tests is marked correctly as requiring nvcc, but since nvcc # is available during the build we need to filter it manually. filtered=" --filter-out MixedLanguage/cuda-1"; \ + filtered+="|AST-Quake/calling_convention"; \ fi && \ "$LLVM_INSTALL_PREFIX/bin/llvm-lit" -v build/test \ --param nvqpp_site_config=build/test/lit.site.cfg.py ${filtered} && \ diff --git a/include/cudaq/Optimizer/Builder/Factory.h b/include/cudaq/Optimizer/Builder/Factory.h index 868cf4c861..24e933117a 100644 --- a/include/cudaq/Optimizer/Builder/Factory.h +++ b/include/cudaq/Optimizer/Builder/Factory.h @@ -236,6 +236,10 @@ createMonotonicLoop(mlir::OpBuilder &builder, mlir::Location loc, bool hasHiddenSRet(mlir::FunctionType funcTy); +/// Check a function to see if argument 0 has the `sret` attribute. Typically, +/// one may find this on a host-side entry point function. +bool hasSRet(mlir::func::FuncOp funcOp); + /// Convert the function type \p funcTy to a signature compatible with the code /// on the host side. This will add hidden arguments, such as the `this` /// pointer, convert some results to `sret` pointers, etc. @@ -251,7 +255,8 @@ bool isX86_64(mlir::ModuleOp); bool isAArch64(mlir::ModuleOp); /// A small structure may be passed as two arguments on the host side. (e.g., on -/// the X86-64 ABI.) If \p ty is not a `struct`, this returns `false`. +/// the X86-64 ABI.) If \p ty is not a `struct`, this returns `false`. Note +/// also, some small structs may be packed into a single register. bool structUsesTwoArguments(mlir::Type ty); std::optional getIntIfConstant(mlir::Value value); diff --git a/lib/Optimizer/Builder/Factory.cpp b/lib/Optimizer/Builder/Factory.cpp index 1c14ec349a..73b66cdac3 100644 --- a/lib/Optimizer/Builder/Factory.cpp +++ b/lib/Optimizer/Builder/Factory.cpp @@ -18,6 +18,9 @@ using namespace mlir; namespace cudaq::opt { +// The common small struct limit for architectures cudaq is supporting. +static constexpr unsigned CommonSmallStructSize = 128; + bool factory::isX86_64(ModuleOp module) { std::string triple; if (auto ta = module->getAttr(targetTripleAttrName)) @@ -302,33 +305,6 @@ cc::LoopOp factory::createMonotonicLoop( return loop; } -// FIXME: some ABIs may return a small struct in registers rather than via an -// sret pointer. -// -// On x86_64, -// pair of: argument return value packed from msb to lsb -// i32 : i64 i64 (second, first) -// i64 : i64, i64 { i64, i64 } -// f32 : <2 x float> <2 x float> -// f64 : double, double { double, double } -// -// On aarch64, -// pair of: argument return value packed from msb to lsb -// i32 : i64 i64 (second, first) -// i64 : [2 x i64] [2 x i64] -// f32 : [2 x float] { float, float } -// f64 : [2 x double] { double, double } -bool factory::hasHiddenSRet(FunctionType funcTy) { - // If a function has more than 1 result, the results are promoted to a - // structured return argument. Otherwise, if there is 1 result and it is an - // aggregate type, then it is promoted to a structured return argument. 
- auto numResults = funcTy.getNumResults(); - return numResults > 1 || - (numResults == 1 && funcTy.getResult(0) - .isa()); -} - cc::StructType factory::stlStringType(MLIRContext *ctx) { auto i8Ty = IntegerType::get(ctx, 8); auto ptrI8Ty = cc::PointerType::get(i8Ty); @@ -361,8 +337,8 @@ Type factory::getSRetElementType(FunctionType funcTy) { auto *ctx = funcTy.getContext(); if (funcTy.getNumResults() > 1) return cc::StructType::get(ctx, funcTy.getResults()); - if (isa(funcTy.getResult(0))) - return getDynamicBufferType(ctx); + if (auto spanTy = dyn_cast(funcTy.getResult(0))) + return stlVectorType(spanTy.getElementType()); return funcTy.getResult(0); } @@ -403,33 +379,49 @@ static Type convertToHostSideType(Type ty) { // function tries to simulate GCC argument passing conventions. classify() also // has a number of FIXME comments, where it diverges from the referenced ABI. // Empirical evidence show that on x86_64, integers and floats are packed in -// integers of size 32 or 64 together, unless the float member fits by itself. +// integers of size 8, 16, 24, 32 or 64 together, unless the float member fits +// by itself. static bool shouldExpand(SmallVectorImpl &packedTys, cc::StructType structTy) { if (structTy.isEmpty()) return false; auto *ctx = structTy.getContext(); unsigned bits = 0; + auto scaleBits = [&](unsigned size) { + if (size < 32) + size = (size + 7) & ~7u; + if (size > 32 && size <= 64) + size = 64; + return size; + }; // First split the members into a "lo" set and a "hi" set. SmallVector set1; SmallVector set2; for (auto ty : structTy.getMembers()) { if (auto intTy = dyn_cast(ty)) { - bits += intTy.getWidth(); - if (bits <= 64) + auto addBits = scaleBits(intTy.getWidth()); + if (bits + addBits <= 64) { + bits += addBits; set1.push_back(ty); - else + } else { + bits = std::max(bits, 64u) + addBits; set2.push_back(ty); + } } else if (auto fltTy = dyn_cast(ty)) { - bits += fltTy.getWidth(); - if (bits <= 64) + auto addBits = fltTy.getWidth(); + if (bits + addBits <= 64) { + bits += addBits; set1.push_back(ty); - else + } else { + bits = std::max(bits, 64u) + addBits; set2.push_back(ty); + } } else { return false; } + if (bits > CommonSmallStructSize) + return false; } // Process the sets. If the set has anything integral, use integer. If the set @@ -441,12 +433,23 @@ static bool shouldExpand(SmallVectorImpl &packedTys, return true; return false; }; + auto intSetSize = [&](auto theSet) { + unsigned size = 0; + for (auto ty : theSet) + size += scaleBits(ty.getIntOrFloatBitWidth()); + return size; + }; auto processMembers = [&](auto theSet, unsigned packIdx) { if (useInt(theSet)) { - packedTys[packIdx] = IntegerType::get(ctx, bits > 32 ? 64 : 32); + auto size = intSetSize(theSet); + if (size <= 32) + packedTys[packIdx] = IntegerType::get(ctx, size); + else + packedTys[packIdx] = IntegerType::get(ctx, 64); } else if (theSet.size() == 1) { packedTys[packIdx] = theSet[0]; } else { + assert(theSet[0] == FloatType::getF32(ctx) && "must be float"); packedTys[packIdx] = VectorType::get(ArrayRef{2}, theSet[0]); } @@ -454,15 +457,59 @@ static bool shouldExpand(SmallVectorImpl &packedTys, assert(!set1.empty() && "struct must have members"); packedTys.resize(set2.empty() ? 
1 : 2); processMembers(set1, 0); - if (!set2.empty()) - processMembers(set2, 1); + if (set2.empty()) + return false; + processMembers(set2, 1); return true; } +bool factory::hasSRet(func::FuncOp funcOp) { + if (funcOp.getNumArguments() > 0) + if (auto dict = funcOp.getArgAttrDict(0)) + return dict.contains(LLVM::LLVMDialect::getStructRetAttrName()); + return false; +} + +// On x86_64, +// pair of: argument return value packed from msb to lsb +// i32 : i64 i64 (second, first) +// i64 : i64, i64 { i64, i64 } +// f32 : <2 x float> <2 x float> +// f64 : double, double { double, double } +// ptr : ptr, ptr { ptr, ptr } +// +// On aarch64, +// pair of: argument return value packed from msb to lsb +// i32 : i64 i64 (second, first) +// i64 : [2 x i64] [2 x i64] +// f32 : [2 x float] { float, float } +// f64 : [2 x double] { double, double } +// ptr : [2 x i64] [2 x i64] +bool factory::hasHiddenSRet(FunctionType funcTy) { + // If a function has more than 1 result, the results are promoted to a + // structured return argument. Otherwise, if there is 1 result and it is an + // aggregate type, then it is promoted to a structured return argument. + auto numResults = funcTy.getNumResults(); + if (numResults == 0) + return false; + if (numResults > 1) + return true; + auto resTy = funcTy.getResult(0); + if (resTy.isa()) + return true; + if (auto strTy = dyn_cast(resTy)) { + SmallVector packedTys; + bool inRegisters = shouldExpand(packedTys, strTy) || !packedTys.empty(); + return !inRegisters; + } + return false; +} + bool factory::structUsesTwoArguments(mlir::Type ty) { // Unchecked! This is only valid if target is X86-64. auto structTy = dyn_cast(ty); - if (!structTy || structTy.getBitSize() == 0 || structTy.getBitSize() > 128) + if (!structTy || structTy.getBitSize() == 0 || + structTy.getBitSize() > CommonSmallStructSize) return false; SmallVector unused; return shouldExpand(unused, structTy); @@ -486,14 +533,32 @@ FunctionType factory::toHostSideFuncType(FunctionType funcTy, bool addThisPtr, auto *ctx = funcTy.getContext(); SmallVector inputTys; bool hasSRet = false; - if (factory::hasHiddenSRet(funcTy)) { - // When the kernel is returning a std::vector result, the result is - // returned via a sret argument in the first position. When this argument - // is added, the this pointer becomes the second argument. Both are opaque - // pointers at this point. - auto eleTy = convertToHostSideType(getSRetElementType(funcTy)); - inputTys.push_back(cc::PointerType::get(eleTy)); - hasSRet = true; + Type resultTy; + if (funcTy.getNumResults() == 1) + if (auto strTy = dyn_cast(funcTy.getResult(0))) + if (strTy.getBitSize() != 0 && + strTy.getBitSize() <= CommonSmallStructSize) { + SmallVector packedTys; + if (shouldExpand(packedTys, strTy) || !packedTys.empty()) { + if (packedTys.size() == 1) + resultTy = packedTys[0]; + else + resultTy = cc::StructType::get(ctx, packedTys); + } + } + if (!resultTy && funcTy.getNumResults()) { + if (factory::hasHiddenSRet(funcTy)) { + // When the kernel is returning a std::vector result, the result is + // returned via a sret argument in the first position. When this argument + // is added, the this pointer becomes the second argument. Both are opaque + // pointers at this point. 
+ auto eleTy = convertToHostSideType(getSRetElementType(funcTy)); + inputTys.push_back(cc::PointerType::get(eleTy)); + hasSRet = true; + } else { + assert(funcTy.getNumResults() == 1); + resultTy = funcTy.getResult(0); + } } // If this kernel is a plain old function or a static member function, we // don't want to add a hidden `this` argument. @@ -509,20 +574,25 @@ FunctionType factory::toHostSideFuncType(FunctionType funcTy, bool addThisPtr, // On x86_64 and aarch64, a struct that is smaller than 128 bits may be // passed in registers as separate arguments. See classifyArgumentType() // in CodeGen/TargetInfo.cpp. - if (strTy.getBitSize() != 0 && strTy.getBitSize() <= 128) { + if (strTy.getBitSize() != 0 && + strTy.getBitSize() <= CommonSmallStructSize) { if (isX86_64(module)) { SmallVector packedTys; if (shouldExpand(packedTys, strTy)) { for (auto ty : packedTys) inputTys.push_back(ty); continue; + } else if (!packedTys.empty()) { + for (auto ty : packedTys) + inputTys.push_back(ty); + continue; } } else { assert(isAArch64(module) && "aarch64 expected"); if (onlyArithmeticMembers(strTy)) { // Empirical evidence shows that on aarch64, arguments are packed - // into a single i64 or a [2 x i64] typed value based on the size of - // the struct. This is regardless of whether the value(s) are + // into a single i64 or a [2 x i64] typed value based on the size + // of the struct. This is regardless of whether the value(s) are // floating-point or not. if (strTy.getBitSize() > 64) inputTys.push_back(cc::ArrayType::get(ctx, i64Ty, 2)); @@ -542,8 +612,8 @@ FunctionType factory::toHostSideFuncType(FunctionType funcTy, bool addThisPtr, // and it hasn't been converted to a hidden sret argument. if (funcTy.getNumResults() == 0 || hasSRet) return FunctionType::get(ctx, inputTys, {}); - assert(funcTy.getNumResults() == 1); - return FunctionType::get(ctx, inputTys, funcTy.getResults()); + assert(funcTy.getNumResults() == 1 && resultTy); + return FunctionType::get(ctx, inputTys, resultTy); } bool factory::isStdVecArg(Type type) { diff --git a/lib/Optimizer/Transforms/GenKernelExecution.cpp b/lib/Optimizer/Transforms/GenKernelExecution.cpp index 7d693921f1..a4667ce7b5 100644 --- a/lib/Optimizer/Transforms/GenKernelExecution.cpp +++ b/lib/Optimizer/Transforms/GenKernelExecution.cpp @@ -251,7 +251,7 @@ class GenerateKernelExecution builder, loc, cudaq::cc::PointerType::get(i8Ty), fromBuff); builder.create( loc, std::nullopt, cudaq::llvmMemCopyIntrinsic, - SmallVector{outputBuffer, vecFromBuff, bytes, notVolatile}); + ValueRange{outputBuffer, vecFromBuff, bytes, notVolatile}); auto i8ArrTy = cudaq::cc::PointerType::get(cudaq::cc::ArrayType::get(i8Ty)); auto buf1 = cudaq::opt::factory::createCast(builder, loc, i8ArrTy, outputBuffer); @@ -538,80 +538,6 @@ class GenerateKernelExecution return argsCreatorFunc; } - /// If the kernel has an sret argument, then we rewrite the kernel's signature - /// on the target. Note that this requires that the target has the ability to - /// pass stack pointers as function arguments. These stack pointers will - /// obviously only necessarily be valid to the target executing the kernel. - void updateQPUKernelAsSRet(OpBuilder &builder, func::FuncOp funcOp, - FunctionType newFuncTy) { - auto funcTy = funcOp.getFunctionType(); - // We add exactly 1 sret argument regardless of how many fields are folded - // into it. 
- assert(newFuncTy.getNumInputs() == funcTy.getNumInputs() + 1 && - "sret should be a single argument"); - auto *ctx = funcOp.getContext(); - auto eleTy = cudaq::opt::factory::getSRetElementType(funcTy); - NamedAttrList attrs; - attrs.set(LLVM::LLVMDialect::getStructRetAttrName(), TypeAttr::get(eleTy)); - funcOp.insertArgument(0, newFuncTy.getInput(0), attrs.getDictionary(ctx), - funcOp.getLoc()); - auto elePtrTy = cudaq::cc::PointerType::get(eleTy); - OpBuilder::InsertionGuard guard(builder); - SmallVector returnsToErase; - // Update all func.return to store values to the sret block. - funcOp->walk([&](func::ReturnOp retOp) { - auto loc = retOp.getLoc(); - builder.setInsertionPoint(retOp); - auto cast = builder.create(loc, elePtrTy, - funcOp.getArgument(0)); - if (funcOp.getNumResults() > 1) { - for (int i = 0, end = funcOp.getNumResults(); i != end; ++i) { - auto mem = builder.create( - loc, cudaq::cc::PointerType::get(funcTy.getResult(i)), cast, - SmallVector{i}); - builder.create(loc, retOp.getOperands()[i], mem); - } - } else if (auto stdvecTy = - dyn_cast(funcTy.getResult(0))) { - auto stdvec = retOp.getOperands()[0]; - auto eleTy = [&]() -> Type { - // TODO: Fold this conversion into the StdvecDataOp builder. We will - // never get a data buffer which is not byte addressable and where - // the width is less than 8. - if (auto intTy = dyn_cast(stdvecTy.getElementType())) - if (intTy.getWidth() < 8) - return builder.getI8Type(); - return stdvecTy.getElementType(); - }(); - auto i8Ty = cudaq::cc::PointerType::get(builder.getI8Type()); - auto ptrTy = cudaq::cc::PointerType::get(eleTy); - auto data = builder.create(loc, ptrTy, stdvec); - auto mem0 = builder.create( - loc, cudaq::cc::PointerType::get(i8Ty), cast, - SmallVector{0}); - auto mem1 = builder.create( - loc, cudaq::cc::PointerType::get(ptrTy), mem0); - builder.create(loc, data, mem1); - auto i64Ty = builder.getI64Type(); - auto size = builder.create(loc, i64Ty, stdvec); - auto mem2 = builder.create( - loc, cudaq::cc::PointerType::get(i64Ty), cast, - SmallVector{1}); - builder.create(loc, size, mem2); - } else { - builder.create(loc, retOp.getOperands()[0], cast); - } - builder.create(loc); - returnsToErase.push_back(retOp); - }); - for (auto *op : returnsToErase) - op->erase(); - for (std::size_t i = 0, end = funcOp.getNumResults(); i != end; ++i) - funcOp.eraseResult(0); - modifiedDevKernels.insert( - std::pair{funcOp.getName(), newFuncTy.getInput(0)}); - } - /// In the thunk, we need to unpack any `std::vector` objects encoded in the /// packet. Since these have dynamic size, they are encoded as trailing bytes /// by offset and size. The offset is implicit from the values of the @@ -821,58 +747,23 @@ class GenerateKernelExecution // Unpack the arguments in the struct and build the argument list for // the call to the kernel code. SmallVector args; - const bool hiddenSRet = cudaq::opt::factory::hasHiddenSRet(funcTy); - FunctionType newFuncTy = [&]() { - if (hiddenSRet) { - auto sretPtrTy = cudaq::cc::PointerType::get( - cudaq::opt::factory::getSRetElementType(funcTy)); - SmallVector inputTys = {sretPtrTy}; - inputTys.append(funcTy.getInputs().begin(), funcTy.getInputs().end()); - return FunctionType::get(ctx, inputTys, {}); - } - return funcTy; - }(); - int offset = funcTy.getNumInputs(); - if (hiddenSRet) { - // Use the end of the argument block for the return values. 
- auto eleTy = structTy.getMember(offset); - auto mem = builder.create( - loc, cudaq::cc::PointerType::get(eleTy), castOp, - SmallVector{offset}); - auto sretPtrTy = cudaq::cc::PointerType::get( - cudaq::opt::factory::getSRetElementType(funcTy)); - auto sretMem = builder.create(loc, sretPtrTy, mem); - args.push_back(sretMem); - - // Rewrite the original kernel's signature and return op(s). - updateQPUKernelAsSRet(builder, funcOp, newFuncTy); - } + const std::int32_t offset = funcTy.getNumInputs(); for (auto inp : llvm::enumerate(funcTy.getInputs())) { auto [a, t] = processInputValue(loc, builder, trailingData, val, inp.value(), inp.index(), structTy); trailingData = t; args.push_back(a); } - auto call = builder.create(loc, newFuncTy.getResults(), + auto call = builder.create(loc, funcTy.getResults(), funcOp.getName(), args); - // If and only if the kernel returns non-sret results, then take those - // values and store them in the results section of the struct. They will - // eventually be returned to the original caller. - if (!hiddenSRet && funcTy.getNumResults() == 1) { - auto eleTy = structTy.getMember(offset); - auto mem = builder.create( - loc, cudaq::cc::PointerType::get(eleTy), castOp, - SmallVector{offset}); - builder.create(loc, call.getResult(0), mem); - } - - // If the original result was a std::vector, then depending on whether - // this is client-server or not, the thunk function packs the dynamic return - // data into a message buffer or just returns a pointer to the shared heap - // allocation, resp. - bool hasVectorResult = funcTy.getNumResults() == 1 && - isa(funcTy.getResult(0)); + const bool hasVectorResult = + funcTy.getNumResults() == 1 && + isa(funcTy.getResult(0)); if (hasVectorResult) { + // If the original result was a std::vector, then depending on whether + // this is client-server or not, the thunk function packs the dynamic + // return data into a message buffer or just returns a pointer to the + // shared heap allocation, resp. auto *currentBlock = builder.getBlock(); auto *reg = currentBlock->getParent(); auto *thenBlock = builder.createBlock(reg); @@ -881,23 +772,53 @@ class GenerateKernelExecution builder.create(loc, isClientServer, thenBlock, elseBlock); builder.setInsertionPointToEnd(thenBlock); - int offset = funcTy.getNumInputs(); auto gepRes = builder.create( loc, cudaq::cc::PointerType::get(structTy.getMember(offset)), castOp, - SmallVector{offset}); - auto gepRes2 = builder.create( + ArrayRef{offset}); + auto resAsVec = builder.create( + loc, cudaq::cc::PointerType::get(funcTy.getResult(0)), gepRes); + builder.create(loc, call.getResult(0), resAsVec); + auto resAsArg = builder.create( loc, cudaq::cc::PointerType::get(thunkTy.getResults()[0]), gepRes); // createDynamicResult packs the input values and the dynamic results // into a single buffer to pass back as a message. 
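        // A rough sketch of what comes back, inferred from the QIR checks in
        // test/Quake-QIR/return_values.qke (the runtime owns the real
        // implementation of __nvqpp_createDynamicResult):
        //   newSize = structSize + <bytes of span data>
        //   buf     = malloc(newSize);
        //   memcpy(buf, msgBuffer, structSize);   // static args + result slots
        //   ...the span data is appended and {buf, newSize} is handed back as {i8*, i64}.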
auto res = builder.create( loc, thunkTy.getResults()[0], "__nvqpp_createDynamicResult", - ValueRange{thunkEntry->getArgument(0), structSize, gepRes2}); + ValueRange{thunkEntry->getArgument(0), structSize, resAsArg}); builder.create(loc, res.getResult(0)); builder.setInsertionPointToEnd(elseBlock); + auto eleTy = structTy.getMember(offset); + auto memTy = cudaq::cc::PointerType::get(eleTy); + auto mem = builder.create( + loc, memTy, castOp, SmallVector{offset}); + auto resPtrTy = cudaq::cc::PointerType::get(call.getResult(0).getType()); + auto castMem = builder.create(loc, resPtrTy, mem); + builder.create(loc, call.getResult(0), castMem); + } else { + // FIXME: Should check for recursive vector case. + // If the kernel returns non-dynamic results (no spans), then take those + // values and store them in the results section of the struct. They will + // eventually be returned to the original caller. + if (funcTy.getNumResults()) { + for (std::int32_t o = 0; + o < static_cast(funcTy.getNumResults()); ++o) { + auto eleTy = structTy.getMember(offset + o); + auto memTy = cudaq::cc::PointerType::get(eleTy); + auto mem = builder.create( + loc, memTy, castOp, + SmallVector{offset + o}); + auto resTy = call.getResult(o).getType(); + auto resPtrTy = cudaq::cc::PointerType::get(resTy); + Value castMem = mem; + if (resPtrTy != mem.getType()) + castMem = builder.create(loc, resPtrTy, mem); + builder.create(loc, call.getResult(o), castMem); + } + } } // zeroDynamicResult is used by models other than client-server. It assumes - // that no messages need to be sent, the CPU and QPU code share a memory - // space, and therefore skips making any copies. + // that no messages need to be sent and that the CPU and QPU code share a + // memory space. Therefore, making any copies can be skipped. auto zeroRes = builder.create(loc, thunkTy.getResults()[0], "__nvqpp_zeroDynamicResult", ValueRange{}); @@ -1125,11 +1046,10 @@ class GenerateKernelExecution func::FuncOp thunkFunc) { auto *ctx = builder.getContext(); auto i64Ty = builder.getI64Type(); - auto offset = devFuncTy.getNumInputs(); + std::int32_t offset = devFuncTy.getNumInputs(); auto thunkTy = getThunkType(ctx); auto structPtrTy = cudaq::cc::PointerType::get(structTy); Block *hostFuncEntryBlock = hostFunc.addEntryBlock(); - const bool hiddenSRet = cudaq::opt::factory::hasHiddenSRet(devFuncTy); OpBuilder::InsertionGuard guard(builder); builder.setInsertionPointToStart(hostFuncEntryBlock); @@ -1170,7 +1090,7 @@ class GenerateKernelExecution // launch kernel. 
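      // Callable arguments are not passed as raw pointers; the host side asks the
      // runtime for a linkable kernel key (an i64) and stores that key in the
      // message buffer instead, as the next few lines do.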
if (isa(quakeTy)) { auto kernKey = builder.create( - loc, builder.getI64Type(), cudaq::runtime::getLinkableKernelKey, + loc, i64Ty, cudaq::runtime::getLinkableKernelKey, ValueRange{arg}); stVal = builder.create( loc, stVal.getType(), stVal, kernKey.getResult(0), idx); @@ -1308,8 +1228,8 @@ class GenerateKernelExecution std::int32_t idx = inp.index(); Type quakeTy = devFuncTy.getInput(idx); if (auto stdvecTy = dyn_cast(quakeTy)) { - auto bytes = builder.create( - loc, builder.getI64Type(), stVal, idx); + auto bytes = builder.create(loc, i64Ty, + stVal, idx); assert(stdvecTy == devFuncTy.getInput(idx)); auto ptrInTy = cast(inTy); vecToBuffer = encodeVectorData(loc, builder, bytes, stdvecTy, arg, @@ -1351,7 +1271,6 @@ class GenerateKernelExecution loc, cudaq::opt::factory::stlVectorType(ptrI8Ty)); auto arrPtrTy = cudaq::cc::ArrayType::get(ctx, ptrI8Ty, count); Value buffer = builder.create(loc, arrPtrTy); - auto i64Ty = builder.getI64Type(); auto buffSize = builder.create(loc, i64Ty, arrPtrTy); auto ptrPtrTy = cudaq::cc::PointerType::get(ptrI8Ty); auto cast1 = builder.create(loc, ptrPtrTy, buffer); @@ -1458,61 +1377,76 @@ class GenerateKernelExecution // result value(s) from the struct returned by `launchKernel` and return // them to our caller. SmallVector results; - const bool multiResult = devFuncTy.getResults().size() > 1; - for (auto res : llvm::enumerate(devFuncTy.getResults())) { - int off = res.index() + offset; - if (auto vecTy = dyn_cast(res.value())) { - auto eleTy = vecTy.getElementType(); - auto ptrTy = cudaq::cc::PointerType::get(eleTy); - auto gep0 = builder.create( - loc, cudaq::cc::PointerType::get(ptrTy), temp, - SmallVector{0, off, 0}); - auto dataPtr = builder.create(loc, gep0); - auto lenPtrTy = cudaq::cc::PointerType::get(builder.getI64Type()); - auto gep1 = builder.create( - loc, lenPtrTy, temp, - SmallVector{0, off, 1}); - auto vecLen = builder.create(loc, gep1); - if (vecTy.getElementType() == builder.getI1Type()) { - genStdvecBoolFromInitList(loc, builder, - hostFuncEntryBlock->getArguments().front(), - dataPtr, vecLen); - } else { - cudaq::IRBuilder irBuilder(builder); - Value tSize = irBuilder.getByteSizeOfType(loc, eleTy); - if (!tSize) { - TODO_loc(loc, "unhandled vector element type"); - return; - } - genStdvecTFromInitList(loc, builder, - hostFuncEntryBlock->getArguments().front(), - dataPtr, tSize, vecLen); - } - offset++; + auto hostFuncTy = hostFunc.getFunctionType(); + assert((hostFuncTy.getResults().empty() || + (hostFuncTy.getNumResults() == 1)) && + "C++ function expected to have 0 or 1 return value"); + const bool resultVal = !hostFuncTy.getResults().empty(); + if (resultVal || cudaq::opt::factory::hasSRet(hostFunc)) { + // Host function returns a value. Either returning by value or via an sret + // reference. 
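    // In C-like terms, the two shapes handled below are roughly (a sketch, not
    // literal output):
    //   T    wrapper(args...);            // small result comes back by value in registers
    //   void wrapper(T *sret, args...);   // result is written through the sret pointer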
+ if (resultVal) { + Type res0Ty = structTy.getMember(offset); + auto ptrResTy = cudaq::cc::PointerType::get(res0Ty); + auto resPtr = builder.create( + loc, ptrResTy, temp, ArrayRef{0, offset}); + Type castToTy = cudaq::cc::PointerType::get(hostFuncTy.getResult(0)); + auto castResPtr = [&]() -> Value { + if (castToTy == ptrResTy) + return resPtr; + return builder.create(loc, castToTy, resPtr); + }(); + results.push_back(builder.create(loc, castResPtr)); } else { - auto gep0 = builder.create( - loc, cudaq::cc::PointerType::get(structTy.getMember(off)), temp, - SmallVector{0, off}); - auto gep = cudaq::opt::factory::createCast( - builder, loc, cudaq::cc::PointerType::get(res.value()), gep0); - Value loadVal = builder.create(loc, gep); - if (hiddenSRet) { - auto sretPtr = [&]() -> Value { - if (multiResult) - return builder.create( - loc, cudaq::cc::PointerType::get(res.value()), - hostFuncEntryBlock->getArguments().front(), - SmallVector{off}); - return builder.create( - loc, cudaq::cc::PointerType::get(res.value()), - hostFuncEntryBlock->getArguments().front()); - }(); - builder.create(loc, loadVal, sretPtr); + // Check if device is returning a span. If it is, then we will need to + // convert it to a std::vector here. The vector is constructed in-place + // on the sret memory block. + Value arg0 = hostFuncEntryBlock->getArguments().front(); + if (auto spanTy = + dyn_cast(devFuncTy.getResult(0))) { + auto eleTy = spanTy.getElementType(); + auto ptrTy = cudaq::cc::PointerType::get(eleTy); + auto gep0 = builder.create( + loc, cudaq::cc::PointerType::get(ptrTy), temp, + SmallVector{0, offset, 0}); + auto dataPtr = builder.create(loc, gep0); + auto lenPtrTy = cudaq::cc::PointerType::get(i64Ty); + auto gep1 = builder.create( + loc, lenPtrTy, temp, + SmallVector{0, offset, 1}); + auto vecLen = builder.create(loc, gep1); + if (spanTy.getElementType() == builder.getI1Type()) { + genStdvecBoolFromInitList(loc, builder, arg0, dataPtr, vecLen); + } else { + Value tSize = + builder.create(loc, i64Ty, eleTy); + genStdvecTFromInitList(loc, builder, arg0, dataPtr, tSize, vecLen); + } } else { - results.push_back(loadVal); + // Otherwise, we can just copy the aggregate into the sret memory + // block. Uses the size of the host function's sret pointer element + // type for the memcpy, so the device should return an (aggregate) + // value of suitable size. + Type res0Ty = structTy.getMember(offset); + auto ptrResTy = cudaq::cc::PointerType::get(res0Ty); + auto resPtr = builder.create( + loc, ptrResTy, temp, + ArrayRef{0, offset}); + auto castMsgBuff = + builder.create(loc, ptrI8Ty, resPtr); + Type eleTy = + cast(arg0.getType()).getElementType(); + Value bytes = builder.create(loc, i64Ty, eleTy); + auto notVolatile = builder.create(loc, 0, 1); + auto castArg0 = builder.create(loc, ptrI8Ty, arg0); + builder.create( + loc, std::nullopt, cudaq::llvmMemCopyIntrinsic, + ValueRange{castArg0, castMsgBuff, bytes, notVolatile}); } } } + + // Return the result (if any). 
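  // Putting the pieces together for the sret case, the generated host wrapper is
  // roughly the following in C terms (a sketch; the exact IR is checked in
  // test/Quake-QIR/return_values.qke):
  //   void test_N(Agg *sret, void *thisPtr) {
  //     struct { /* args..., */ Agg result; } buffer;
  //     altLaunchKernel(kernelName, test_N_thunk, &buffer, sizeof(buffer), 0);
  //     memcpy(sret, &buffer.result, sizeof(Agg));
  //   }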
builder.create(loc, results); } @@ -1779,45 +1713,11 @@ class GenerateKernelExecution cudaq::opt::factory::createGlobalCtorCall( module, FlatSymbolRefAttr::get(ctx, initFun.getName())); - SmallVector deadCalls; - module.walk([&](func::CallOp call) { - if (!call.getResults().empty()) { - auto callee = call.getCallee(); - auto iter = modifiedDevKernels.find(callee); - if (iter != modifiedDevKernels.end()) { - OpBuilder builder(call); - Type ty = call.getResult(0).getType(); - auto loc = call.getLoc(); - auto strTy = cast( - cast(iter->second).getElementType()); - auto buff = builder.create(loc, strTy); - SmallVector args = {buff}; - args.append(call.getOperands().begin(), call.getOperands().end()); - builder.create(loc, TypeRange{}, callee, args); - auto buffPtrPtr = builder.create( - loc, cudaq::cc::PointerType::get(strTy.getMember(0)), buff, - ArrayRef{0}); - auto buffPtr = builder.create(loc, buffPtrPtr); - auto buffSizePtr = builder.create( - loc, cudaq::cc::PointerType::get(strTy.getMember(1)), buff, - ArrayRef{1}); - auto buffSize = builder.create(loc, buffSizePtr); - auto sv = builder.create(loc, ty, buffPtr, - buffSize); - call.getResult(0).replaceAllUsesWith(sv); - deadCalls.push_back(call); - } - } - }); - for (auto *op : deadCalls) - op->erase(); - LLVM_DEBUG(llvm::dbgs() << "final module:\n" << module << '\n'); } out.keep(); } const DataLayout *dataLayout = nullptr; - DenseMap modifiedDevKernels; }; } // namespace diff --git a/lib/Optimizer/Transforms/LiftArrayAlloc.cpp b/lib/Optimizer/Transforms/LiftArrayAlloc.cpp index ba4a87c29e..9328b78896 100644 --- a/lib/Optimizer/Transforms/LiftArrayAlloc.cpp +++ b/lib/Optimizer/Transforms/LiftArrayAlloc.cpp @@ -191,8 +191,10 @@ class AllocaPattern : public OpRewritePattern { toErase.push_back(user); } if (toGlobal) { - rewriter.setInsertionPointAfter(alloc); - rewriter.replaceOp(alloc, conGlobal); + if (conGlobal) { + rewriter.setInsertionPointAfter(alloc); + rewriter.replaceOp(alloc, conGlobal); + } } else { toErase.push_back(alloc); } diff --git a/targettests/execution/auto_kernel-cpp17.cpp b/targettests/execution/auto_kernel-cpp17.cpp index f3b2f3dc65..04b0353113 100644 --- a/targettests/execution/auto_kernel-cpp17.cpp +++ b/targettests/execution/auto_kernel-cpp17.cpp @@ -7,7 +7,7 @@ ******************************************************************************/ // REQUIRES: c++17 -// RUN: nvq++ %cpp_std --enable-mlir %s -o %t && %t | FileCheck %s +// RUN: nvq++ -std=c++17 --enable-mlir %s -o %t && %t | FileCheck %s #include diff --git a/targettests/execution/auto_kernel.cpp b/targettests/execution/auto_kernel.cpp index f52b13a7f0..1aec262e2a 100644 --- a/targettests/execution/auto_kernel.cpp +++ b/targettests/execution/auto_kernel.cpp @@ -7,7 +7,7 @@ ******************************************************************************/ // REQUIRES: c++20 -// RUN: nvq++ %cpp_std --enable-mlir %s -o %t && %t | FileCheck %s +// RUN: nvq++ --enable-mlir %s -o %t && %t | FileCheck %s #include diff --git a/test/AST-Quake/calling_convention.cpp b/test/AST-Quake/calling_convention.cpp new file mode 100644 index 0000000000..3d2c6e2e4a --- /dev/null +++ b/test/AST-Quake/calling_convention.cpp @@ -0,0 +1,335 @@ +/******************************************************************************* + * Copyright (c) 2022 - 2024 NVIDIA Corporation & Affiliates. * + * All rights reserved. * + * * + * This source code and the accompanying materials are made available under * + * the terms of the Apache License 2.0 which accompanies this distribution. 
* + ******************************************************************************/ + +// This test is only valid for x86_64. +// RUN: if [ `uname -m` = "x86_64" ] ; then \ +// RUN: cudaq-quake %cpp_std %s | cudaq-opt | FileCheck %s ; fi + +#include +#include +#include + +// Tests the host-side signatures of various spec supported kernel arguments and +// results. This file tests the x86_64 calling convention. Other architectures +// differ in their calling conventions. + +//===----------------------------------------------------------------------===// +// test all the basic arithmetic types to deny any regressions. + +struct T0 { + void operator()() __qpu__ {} +}; + +struct T1 { + void operator()(double arg) __qpu__ {} +}; + +struct T2 { + void operator()(float arg) __qpu__ {} +}; + +struct T3 { + void operator()(long long arg) __qpu__ {} +}; + +struct T4 { + void operator()(long arg) __qpu__ {} +}; + +struct T5 { + void operator()(int arg) __qpu__ {} +}; + +struct T6 { + void operator()(short arg) __qpu__ {} +}; + +struct T7 { + void operator()(char arg) __qpu__ {} +}; + +struct T8 { + void operator()(bool arg) __qpu__ {} +}; + +// CHECK-LABEL: func.func @_ZN2T0clEv( +// CHECK-SAME: %[[VAL_0:.*]]: !cc.ptr) { +// CHECK-LABEL: func.func @_ZN2T1clEd( +// CHECK-SAME: %[[VAL_0:.*]]: !cc.ptr, %[[VAL_1:.*]]: f64) { +// CHECK-LABEL: func.func @_ZN2T2clEf( +// CHECK-SAME: %[[VAL_0:.*]]: !cc.ptr, %[[VAL_1:.*]]: f32) { +// CHECK-LABEL: func.func @_ZN2T3clEx( +// CHECK-SAME: %[[VAL_0:.*]]: !cc.ptr, %[[VAL_1:.*]]: i64) { +// CHECK-LABEL: func.func @_ZN2T4clEl( +// CHECK-SAME: %[[VAL_0:.*]]: !cc.ptr, %[[VAL_1:.*]]: i64) { +// CHECK-LABEL: func.func @_ZN2T5clEi( +// CHECK-SAME: %[[VAL_0:.*]]: !cc.ptr, %[[VAL_1:.*]]: i32) { +// CHECK-LABEL: func.func @_ZN2T6clEs( +// CHECK-SAME: %[[VAL_0:.*]]: !cc.ptr, %[[VAL_1:.*]]: i16) { +// CHECK-LABEL: func.func @_ZN2T7clEc( +// CHECK-SAME: %[[VAL_0:.*]]: !cc.ptr, %[[VAL_1:.*]]: i8) { +// CHECK-LABEL: func.func @_ZN2T8clEb( +// CHECK-SAME: %[[VAL_0:.*]]: !cc.ptr, %[[VAL_1:.*]]: i1) { + +struct R0 { + void operator()() __qpu__ {} +}; + +struct R1 { + double operator()() __qpu__ { return {}; } +}; + +struct R2 { + float operator()() __qpu__ { return {}; } +}; + +struct R3 { + long long operator()() __qpu__ { return {}; } +}; + +struct R4 { + long operator()() __qpu__ { return {}; } +}; + +struct R5 { + int operator()() __qpu__ { return {}; } +}; + +struct R6 { + short operator()() __qpu__ { return {}; } +}; + +struct R7 { + char operator()() __qpu__ { return {}; } +}; + +struct R8 { + bool operator()() __qpu__ { return {}; } +}; + +// CHECK-LABEL: func.func @_ZN2R0clEv(%arg0: !cc.ptr) { +// CHECK-LABEL: func.func @_ZN2R1clEv(%arg0: !cc.ptr) -> f64 { +// CHECK-LABEL: func.func @_ZN2R2clEv(%arg0: !cc.ptr) -> f32 { +// CHECK-LABEL: func.func @_ZN2R3clEv(%arg0: !cc.ptr) -> i64 { +// CHECK-LABEL: func.func @_ZN2R4clEv(%arg0: !cc.ptr) -> i64 { +// CHECK-LABEL: func.func @_ZN2R5clEv(%arg0: !cc.ptr) -> i32 { +// CHECK-LABEL: func.func @_ZN2R6clEv(%arg0: !cc.ptr) -> i16 { +// CHECK-LABEL: func.func @_ZN2R7clEv(%arg0: !cc.ptr) -> i8 { +// CHECK-LABEL: func.func @_ZN2R8clEv(%arg0: !cc.ptr) -> i1 { + +//===----------------------------------------------------------------------===// +// structs that are less than 128 bits. +// arguments may be merged into 1 register or passed in pair of registers. +// results are returned in registers. 
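// For intuition, the same SysV x86_64 behavior can be observed with plain C++
// (a sketch, not part of the test):
//   struct P { int a; int b; };        // 64 bits: one GPR for both argument and result
//   struct Q { double a; double b; };  // 128 bits: two SSE registers
//   P id_p(P p) { return p; }          // compiles to a single 64-bit register move
//   Q id_q(Q q) { return q; }          // passed and returned in xmm0/xmm1
// Anything wider than 128 bits falls back to memory (byval / sret), as exercised
// further below.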
+ +struct G0 { + std::pair operator()(std::pair) __qpu__ { + return {}; + } +}; + +struct G1 { + std::pair operator()(std::pair) __qpu__ { + return {}; + } +}; + +struct G2 { + std::pair operator()(std::pair, + std::pair) __qpu__ { + return {}; + } +}; + +struct G3 { + std::pair operator()(std::pair) __qpu__ { + return {}; + } +}; + +struct BB { + bool _1; + bool _2; + bool _3; +}; + +BB glue0(); + +struct G4 { + std::pair operator()(BB) __qpu__ { return {}; } +}; + +struct II { + int _1; + int _2; + int _3; +}; + +II glue1(); + +struct G5 { + std::pair operator()(II) __qpu__ { return {}; } +}; + +struct CC { + char _1; + unsigned char _2; + signed char _3; +}; + +CC glue2(); + +struct G6 { + std::pair operator()(CC) __qpu__ { return {}; } +}; + +struct G7 { + BB operator()(BB, II, CC) __qpu__ { return glue0(); } +}; + +struct G8 { + II operator()(II, CC, BB) __qpu__ { return glue1(); } +}; + +struct G9 { + CC operator()(CC, BB, II) __qpu__ { return glue2(); } +}; + +// clang-format off +// CHECK-LABEL: func.func @_ZN2G0clESt4pairIddE( +// CHECK-SAME: %[[VAL_0:.*]]: !cc.ptr, %[[VAL_1:.*]]: f64, +// CHECK-SAME: %[[VAL_2:.*]]: f64) -> i16 +// CHECK-LABEL: func.func @_ZN2G1clESt4pairIffE( +// CHECK-SAME: %[[VAL_0:.*]]: !cc.ptr, %[[VAL_1:.*]]: vector<2xf32>) +// CHECK-SAME: -> i16 +// CHECK-LABEL: func.func @_ZN2G2clESt4pairIllES0_IidE( +// CHECK-SAME: %[[VAL_0:.*]]: !cc.ptr, +// CHECK-SAME: %[[VAL_1:.*]]: i64, %[[VAL_2:.*]]: i64, %[[VAL_3:.*]]: i32, +// CHECK-SAME: %[[VAL_4:.*]]: f64) -> i24 +// CHECK-LABEL: func.func @_ZN2G3clESt4pairIdbE( +// CHECK-SAME: %[[VAL_1:.*]]: !cc.ptr, %[[VAL_2:.*]]: f64, +// CHECK-SAME: %[[VAL_3:.*]]: i8) -> i32 +// CHECK-LABEL: func.func @_ZN2G4clE2BB( +// CHECK-SAME: %[[VAL_1:.*]]: !cc.ptr, %[[VAL_2:.*]]: i24) -> i64 +// CHECK-LABEL: func.func @_ZN2G5clE2II( +// CHECK-SAME: %[[VAL_1:.*]]: !cc.ptr, %[[VAL_2:.*]]: i64, %[[VAL_3:.*]]: i32) -> !cc.struct<{i64, f32}> +// CHECK-LABEL: func.func @_ZN2G6clE2CC( +// CHECK-SAME: %[[VAL_1:.*]]: !cc.ptr, %[[VAL_2:.*]]: i24) -> !cc.struct<{i64, i64}> +// CHECK-LABEL: func.func @_ZN2G7clE2BB2II2CC( +// CHECK-SAME: %[[VAL_1:.*]]: !cc.ptr, %[[VAL_2:.*]]: i24, +// CHECK-SAME: %[[VAL_3:.*]]: i64, %[[VAL_4:.*]]: i32, +// CHECK-SAME: %[[VAL_5:.*]]: i24) -> i24 +// CHECK-LABEL: func.func @_ZN2G8clE2II2CC2BB( +// CHECK-SAME: %[[VAL_1:.*]]: !cc.ptr, %[[VAL_2:.*]]: i64, +// CHECK-SAME: %[[VAL_3:.*]]: i32, %[[VAL_4:.*]]: i24, +// CHECK-SAME: %[[VAL_5:.*]]: i24) -> !cc.struct<{i64, i32}> +// CHECK-LABEL: func.func @_ZN2G9clE2CC2BB2II( +// CHECK-SAME: %[[VAL_0:.*]]: !cc.ptr, +// CHECK-SAME: %[[VAL_1:.*]]: i24, %[[VAL_2:.*]]: i24, +// CHECK-SAME: %[[VAL_3:.*]]: i64, %[[VAL_4:.*]]: i32) -> i24 +// clang-format on + +//===----------------------------------------------------------------------===// +// std::vector - these get converted to sret and byval ptrs on host side. 
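// Conceptually, a kernel declared as `std::vector<double> f()` is seen on the
// host side roughly as (a sketch of the convention, not generated code):
//   struct VecRaw { void *begin; void *end; void *cap; };  // the {ptr, ptr, ptr} triple
//   void f(VecRaw *ret /* llvm.sret */, void *thisPtr);
// which is what the llvm.sret / llvm.byval annotations in the checks below encode.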
+ +std::vector make_believe(); + +struct V0 { + std::vector operator()() __qpu__ { return make_believe(); } +}; + +std::vector make_coffee(); + +struct V1 { + std::vector operator()(std::vector) __qpu__ { + return make_coffee(); + } +}; + +std::vector> make_crazy(); + +struct V2 { + std::vector> operator()(std::vector, + std::vector) __qpu__ { + return make_crazy(); + } +}; + +struct V3 { + void operator()(std::vector, std::vector) __qpu__ {} +}; + +// clang-format off +// CHECK-LABEL: func.func @_ZN2V0clEv( +// CHECK-SAME: %[[VAL_0:.*]]: !cc.ptr, !cc.ptr, !cc.ptr}>> {llvm.sret = !cc.struct<{!cc.ptr, !cc.ptr, !cc.ptr}>}, +// CHECK-SAME: %[[VAL_1:.*]]: !cc.ptr) +// CHECK-LABEL: func.func @_ZN2V1clESt6vectorIdSaIdEE( +// CHECK-SAME: %[[VAL_0:.*]]: !cc.ptr, !cc.ptr, !cc.ptr}>> {llvm.sret = !cc.struct<{!cc.ptr, !cc.ptr, !cc.ptr}>}, +// CHECK-SAME: %[[VAL_1:.*]]: !cc.ptr, +// CHECK-SAME: %[[VAL_2:.*]]: !cc.ptr, !cc.ptr, !cc.ptr}>>) +// CHECK-LABEL: func.func @_ZN2V2clESt6vectorIfSaIfEES0_IsSaIsEE( +// CHECK-SAME: %[[VAL_0:.*]]: !cc.ptr>, !cc.ptr>, !cc.ptr>}>> {llvm.sret = !cc.struct<{!cc.ptr>, !cc.ptr>, !cc.ptr>}>}, +// CHECK-SAME: %[[VAL_1:.*]]: !cc.ptr, +// CHECK-SAME: %[[VAL_2:.*]]: !cc.ptr, !cc.ptr, !cc.ptr}>>, +// CHECK-SAME: %[[VAL_3:.*]]: !cc.ptr, !cc.ptr, !cc.ptr}>>) +// CHECK-LABEL: func.func @_ZN2V3clESt6vectorIlSaIlEES0_IbSaIbEE( +// CHECK-SAME: %[[VAL_0:.*]]: !cc.ptr, +// CHECK-SAME: %[[VAL_1:.*]]: !cc.ptr, !cc.ptr, !cc.ptr}>>, +// CHECK-SAME: %[[VAL_2:.*]]: !cc.ptr, !cc.ptr, !cc.ptr}>>) +// clang-format on + +//===----------------------------------------------------------------------===// +// structs that are more than 128 bits. These get converted to sret or byval +// ptrs on the host side. + +struct B0 { + void operator()(std::tuple) __qpu__ {} +}; + +struct BG { + float _1[4]; + int _2[5]; +}; + +BG make_sausage(); + +struct B1 { + BG operator()() __qpu__ { return make_sausage(); } +}; + +std::tuple make_interesting(); + +struct B2 { + std::tuple operator()(BG) __qpu__ { + return make_interesting(); + } +}; + +struct BA { + bool _1[64]; +}; + +struct B3 { + BA operator()(BA arg) __qpu__ { return arg; } +}; + +// clang-format off +// CHECK-LABEL: func.func @_ZN2B0clESt5tupleIJdicfsEE( +// CHECK-SAME: %[[VAL_0:.*]]: !cc.ptr, %[[VAL_1:.*]]: !cc.ptr>) { +// CHECK-LABEL: func.func @_ZN2B1clEv( +// CHECK-SAME: %[[VAL_0:.*]]: !cc.ptr, !cc.array} [288,4]>> {llvm.sret = !cc.struct<"BG" {!cc.array, !cc.array} [288,4]>}, +// CHECK-SAME: %[[VAL_1:.*]]: !cc.ptr) +// CHECK-LABEL: func.func @_ZN2B2clE2BG( +// CHECK-SAME: %[[VAL_0:.*]]: !cc.ptr> {llvm.sret = !cc.struct<{f64, f64, i16, f32, i8, i32}>}, +// CHECK-SAME: %[[VAL_1:.*]]: !cc.ptr, +// CHECK-SAME: %[[VAL_2:.*]]: !cc.ptr, !cc.array} [288,4]>> {llvm.byval = !cc.struct<"BG" {!cc.array, !cc.array} [288,4]>}) +// CHECK-LABEL: func.func @_ZN2B3clE2BA( +// CHECK-SAME: %[[VAL_0:.*]]: !cc.ptr} [512,1]>> {llvm.sret = !cc.struct<"BA" {!cc.array} [512,1]>}, +// CHECK-SAME: %[[VAL_1:.*]]: !cc.ptr, +// CHECK-SAME: %[[VAL_2:.*]]: !cc.ptr} [512,1]>> {llvm.byval = !cc.struct<"BA" {!cc.array} [512,1]>}) +// clang-format on diff --git a/test/AST-Quake/vector_int-1.cpp b/test/AST-Quake/vector_int-1.cpp index 3bdfae634f..a5a989f6bf 100644 --- a/test/AST-Quake/vector_int-1.cpp +++ b/test/AST-Quake/vector_int-1.cpp @@ -22,8 +22,7 @@ __qpu__ void touringLondon() { return; } -// CHECK-LABEL: func.func @__nvqpp__mlirgen__function_doubleDeckerBus._Z15doubleDeckerBusv( -// CHECK-SAME: %[[VAL_0:.*]]: !cc.ptr, i64}>> {llvm.sret = !cc.struct<{!cc.ptr, i64}>}) 
attributes {"cudaq-entrypoint", "cudaq-kernel", no_this} { +// CHECK-LABEL: func.func @__nvqpp__mlirgen__function_doubleDeckerBus._Z15doubleDeckerBusv() -> !cc.stdvec attributes {"cudaq-entrypoint", "cudaq-kernel", no_this} { // CHECK: %[[VAL_1:.*]] = arith.constant 2 : i64 // CHECK: %[[VAL_2:.*]] = arith.constant 4 : i64 // CHECK: %[[VAL_3:.*]] = arith.constant 2 : i32 @@ -33,29 +32,15 @@ __qpu__ void touringLondon() { // CHECK: %[[VAL_6:.*]] = cc.cast %[[VAL_4]] : (!cc.ptr>) -> !cc.ptr // CHECK: %[[VAL_7:.*]] = call @__nvqpp_vectorCopyCtor(%[[VAL_6]], %[[VAL_1]], %[[VAL_2]]) : (!cc.ptr, i64, i64) -> !cc.ptr // CHECK: %[[VAL_8:.*]] = cc.stdvec_init %[[VAL_7]], %[[VAL_1]] : (!cc.ptr, i64) -> !cc.stdvec -// CHECK: %[[VAL_9:.*]] = cc.cast %[[VAL_0]] : (!cc.ptr, i64}>>) -> !cc.ptr, i64}>> -// CHECK: %[[VAL_10:.*]] = cc.stdvec_data %[[VAL_8]] : (!cc.stdvec) -> !cc.ptr -// CHECK: %[[VAL_11:.*]] = cc.compute_ptr %[[VAL_9]][0] : (!cc.ptr, i64}>>) -> !cc.ptr> -// CHECK: %[[VAL_12:.*]] = cc.cast %[[VAL_11]] : (!cc.ptr>) -> !cc.ptr> -// CHECK: cc.store %[[VAL_10]], %[[VAL_12]] : !cc.ptr> -// CHECK: %[[VAL_13:.*]] = cc.stdvec_size %[[VAL_8]] : (!cc.stdvec) -> i64 -// CHECK: %[[VAL_14:.*]] = cc.compute_ptr %[[VAL_9]][1] : (!cc.ptr, i64}>>) -> !cc.ptr -// CHECK: cc.store %[[VAL_13]], %[[VAL_14]] : !cc.ptr -// CHECK: return +// CHECK: return %[[VAL_8]] : !cc.stdvec // CHECK: } // CHECK-LABEL: func.func @__nvqpp__mlirgen__function_touringLondon._Z13touringLondonv() attributes {"cudaq-entrypoint", "cudaq-kernel", no_this} { -// CHECK: %[[VAL_0:.*]] = cc.alloca !cc.struct<{!cc.ptr, i64}> -// CHECK: call @__nvqpp__mlirgen__function_doubleDeckerBus._Z15doubleDeckerBusv(%[[VAL_0]]) : (!cc.ptr, i64}>>) -> () -// CHECK: %[[VAL_10:.*]] = cc.compute_ptr %[[VAL_0]][0] : (!cc.ptr, i64}>>) -> !cc.ptr> -// CHECK: %[[VAL_1:.*]] = cc.load %[[VAL_10]] : !cc.ptr> -// CHECK: %[[VAL_2:.*]] = cc.compute_ptr %[[VAL_0]][1] : (!cc.ptr, i64}>>) -> !cc.ptr -// CHECK: %[[VAL_3:.*]] = cc.load %[[VAL_2]] : !cc.ptr -// CHECK: %[[VAL_4:.*]] = cc.stdvec_init %[[VAL_1]], %[[VAL_3]] : (!cc.ptr, i64) -> !cc.stdvec -// CHECK: %[[VAL_5:.*]] = cc.stdvec_data %[[VAL_4]] : (!cc.stdvec) -> !cc.ptr> +// CHECK: %[[VAL_0:.*]] = call @__nvqpp__mlirgen__function_doubleDeckerBus._Z15doubleDeckerBusv() : () -> !cc.stdvec +// CHECK: %[[VAL_5:.*]] = cc.stdvec_data %[[VAL_0]] : (!cc.stdvec) -> !cc.ptr> // CHECK: %[[VAL_6:.*]] = cc.cast %[[VAL_5]] : (!cc.ptr>) -> !cc.ptr // CHECK: %[[VAL_7:.*]] = cc.load %[[VAL_6]] : !cc.ptr // CHECK: %[[VAL_8:.*]] = cc.cast signed %[[VAL_7]] : (i32) -> i64 -// CHECK: %[[VAL_9:.*]] = quake.alloca !quake.veq{{\[}}%[[VAL_8]] : i64] +// CHECK: %[[VAL_9:.*]] = quake.alloca !quake.veq[%[[VAL_8]] : i64] // CHECK: return // CHECK: } diff --git a/test/Quake-QIR/return_values.qke b/test/Quake-QIR/return_values.qke index a4fbfa7477..085b9fec97 100644 --- a/test/Quake-QIR/return_values.qke +++ b/test/Quake-QIR/return_values.qke @@ -6,20 +6,22 @@ // the terms of the Apache License 2.0 which accompanies this distribution. // // ========================================================================== // -// RUN: cudaq-opt --add-dealloc --kernel-execution=codegen=1 --canonicalize %s | \ -// RUN: cudaq-translate --convert-to=qir | FileCheck %s +// RUN: cudaq-opt --add-dealloc --kernel-execution=codegen=1 --canonicalize %s | cudaq-translate --convert-to=qir | FileCheck %s // NB: the mangled name map is required for the kernel-execution pass. +// QIR codegen requires the target triple. 
module attributes{ quake.mangled_name_map = { __nvqpp__mlirgen__test_0 = "test_0", __nvqpp__mlirgen__test_1 = "test_1", __nvqpp__mlirgen__test_2 = "test_2", __nvqpp__mlirgen__test_3 = "test_3", __nvqpp__mlirgen__test_4 = "test_4", - __nvqpp__mlirgen__test_5 = "test_5" }} { + __nvqpp__mlirgen__test_5 = "test_5" }, + llvm.triple = "x86_64-unknown-linux-gnu"} { func.func private @__nvqpp_vectorCopyCtor(%arg0: !cc.ptr , %arg1: i64 , %arg2: i64 ) -> !cc.ptr +// vector -> struct ptr sret func.func @__nvqpp__mlirgen__test_0(%arg0: i32) -> !cc.stdvec { %c1_i64 = arith.constant 1 : i64 %c1 = arith.constant 1 : i64 @@ -56,8 +58,8 @@ func.func @test_0(%1: !cc.ptr, !cc.ptr, !cc.ptr} return } -// CHECK-LABEL: define void @__nvqpp__mlirgen__test_0({ i8*, i64 }* nocapture writeonly sret({ i8*, i64 }) -// CHECK-SAME: %[[VAL_0:.*]], i32 %[[VAL_1:.*]]) {{.*}}{ +// CHECK-LABEL: define { i1*, i64 } @__nvqpp__mlirgen__test_0( +// CHECK-SAME: i32 %[[VAL_1:.*]]) {{.*}}{ // CHECK: %[[VAL_2:.*]] = sext i32 %[[VAL_1]] to i64 // CHECK: %[[VAL_3:.*]] = tail call %[[VAL_4:.*]]* @__quantum__rt__qubit_allocate_array(i64 %[[VAL_2]]) // CHECK: %[[VAL_5:.*]] = tail call i64 @__quantum__rt__array_get_size_1d(%[[VAL_4]]* %[[VAL_3]]) @@ -95,12 +97,11 @@ func.func @test_0(%1: !cc.ptr, !cc.ptr, !cc.ptr} // CHECK: ._crit_edge5: ; preds = %[[VAL_21]], %[[VAL_8]], %[[VAL_19]] // CHECK: %[[VAL_34:.*]] = phi i8* [ %[[VAL_10]], %[[VAL_8]] ], [ %[[VAL_20]], %[[VAL_19]] ], [ %[[VAL_20]], %[[VAL_21]] ] // CHECK: %[[VAL_35:.*]] = call i8* @__nvqpp_vectorCopyCtor(i8* nonnull %[[VAL_34]], i64 %[[VAL_5]], i64 1) -// CHECK: call void @__quantum__rt__qubit_release_array(%[[VAL_4]]* %[[VAL_3]]) -// CHECK: %[[VAL_36:.*]] = getelementptr inbounds { i8*, i64 }, { i8*, i64 }* %[[VAL_0]], i64 0, i32 0 -// CHECK: store i8* %[[VAL_35]], i8** %[[VAL_36]], align 8 -// CHECK: %[[VAL_37:.*]] = getelementptr { i8*, i64 }, { i8*, i64 }* %[[VAL_0]], i64 0, i32 1 -// CHECK: store i64 %[[VAL_5]], i64* %[[VAL_37]], align 8 -// CHECK: ret void +// CHECK: %[[VAL_36:.*]] = bitcast i8* %[[VAL_35]] to i1* +// CHECK: %[[VAL_37:.*]] = insertvalue { i1*, i64 } undef, i1* %[[VAL_36]], 0 +// CHECK: %[[VAL_38:.*]] = insertvalue { i1*, i64 } %[[VAL_37]], i64 %[[VAL_5]], 1 +// CHECK: call void @__quantum__rt__qubit_release_array(%Array* %[[VAL_3]]) +// CHECK: ret { i1*, i64 } %[[VAL_38]] // CHECK: } // CHECK-LABEL: define void @test_0({ i8*, i8*, i8* }* sret({ i8*, i8*, i8* }) @@ -120,6 +121,7 @@ func.func @test_0(%1: !cc.ptr, !cc.ptr, !cc.ptr} // CHECK: ret void // CHECK: } +// struct{bool, bool} -> i16 func.func @__nvqpp__mlirgen__test_1() -> !cc.struct<{i1, i1}> { %qubits = quake.alloca !quake.veq<2> %q0 = quake.extract_ref %qubits[0] : (!quake.veq<2>) -> !quake.ref @@ -136,12 +138,12 @@ func.func @__nvqpp__mlirgen__test_1() -> !cc.struct<{i1, i1}> { return %rv2 : !cc.struct<{i1, i1}> } -func.func @test_1(%1: !cc.ptr> {llvm.sret = !cc.struct<{i1, i1}>}, %this: !cc.ptr) { - return +func.func @test_1(%this: !cc.ptr) -> i16 { + %0 = cc.undef i16 + return %0 : i16 } -// CHECK-LABEL: define void @__nvqpp__mlirgen__test_1({ i1, i1 }* nocapture writeonly sret({ i1, i1 }) -// CHECK-SAME: %[[VAL_0:.*]]) {{.*}}{ +// CHECK-LABEL: define { i1, i1 } @__nvqpp__mlirgen__test_1() // CHECK: %[[VAL_1:.*]] = tail call %[[VAL_2:.*]]* @__quantum__rt__qubit_allocate_array(i64 2) // CHECK: %[[VAL_3:.*]] = tail call i8* @__quantum__rt__array_get_element_ptr_1d(%[[VAL_2]]* %[[VAL_1]], i64 0) // CHECK: %[[VAL_4:.*]] = bitcast i8* %[[VAL_3]] to %[[VAL_5:.*]]** @@ -152,37 +154,27 @@ func.func 
@test_1(%1: !cc.ptr> {llvm.sret = !cc.struct<{i1, // CHECK: tail call void @__quantum__qis__h(%[[VAL_5]]* %[[VAL_6]]) // CHECK: tail call void (i64, void (%[[VAL_2]]*, %[[VAL_5]]*)*, ...) @invokeWithControlQubits(i64 1, void (%[[VAL_2]]*, %[[VAL_5]]*)* nonnull @__quantum__qis__x__ctl, %[[VAL_5]]* %[[VAL_6]], %[[VAL_5]]* %[[VAL_9]]) // CHECK: %[[VAL_10:.*]] = tail call %[[VAL_11:.*]]* @__quantum__qis__mz(%[[VAL_5]]* %[[VAL_6]]) -// CHECK: %[[VAL_12:.*]] = bitcast %[[VAL_11]]* %[[VAL_10]] to i1* +// CHECK: %[[VAL_12:.*]] = bitcast %Result* %[[VAL_10]] to i1* // CHECK: %[[VAL_13:.*]] = load i1, i1* %[[VAL_12]], align 1 // CHECK: %[[VAL_14:.*]] = tail call %[[VAL_11]]* @__quantum__qis__mz(%[[VAL_5]]* %[[VAL_9]]) -// CHECK: %[[VAL_15:.*]] = bitcast %[[VAL_11]]* %[[VAL_14]] to i1* +// CHECK: %[[VAL_15:.*]] = bitcast %Result* %[[VAL_14]] to i1* // CHECK: %[[VAL_16:.*]] = load i1, i1* %[[VAL_15]], align 1 -// CHECK: %[[VAL_17:.*]] = getelementptr inbounds { i1, i1 }, { i1, i1 }* %[[VAL_0]], i64 0, i32 0 -// CHECK: store i1 %[[VAL_13]], i1* %[[VAL_17]], align 1 -// CHECK: %[[VAL_18:.*]] = getelementptr inbounds { i1, i1 }, { i1, i1 }* %[[VAL_0]], i64 0, i32 1 -// CHECK: store i1 %[[VAL_16]], i1* %[[VAL_18]], align 1 +// CHECK: %[[VAL_20:.*]] = insertvalue { i1, i1 } undef, i1 %[[VAL_13]], 0 +// CHECK: %[[VAL_19:.*]] = insertvalue { i1, i1 } %[[VAL_20]], i1 %[[VAL_16]], 1 // CHECK: tail call void @__quantum__rt__qubit_release_array(%[[VAL_2]]* %[[VAL_1]]) -// CHECK: ret void +// CHECK: ret { i1, i1 } %[[VAL_19]] // CHECK: } -// CHECK-LABEL: define void @test_1({ i1, i1 }* nocapture writeonly sret({ i1, i1 }) -// CHECK-SAME: %[[VAL_0:.*]], i8* nocapture readnone %[[VAL_1:.*]]) {{.*}}{ -// CHECK: %[[VAL_2:.*]] = alloca [2 x i8], align 8 -// CHECK: %[[VAL_3:.*]] = getelementptr inbounds [2 x i8], [2 x i8]* %[[VAL_2]], i64 0, i64 0 +// CHECK-LABEL: define i16 @test_1(i8* nocapture readnone +// CHECK-SAME: %[[VAL_0:.*]]) {{.*}}{ +// CHECK-NEXT: %[[VAL_2:.*]] = alloca i16, align 8 +// CHECK: %[[VAL_3:.*]] = bitcast i16* %[[VAL_2]] to i8* // CHECK: call void @altLaunchKernel(i8* nonnull getelementptr inbounds ([7 x i8], [7 x i8]* @test_1.kernelName, i64 0, i64 0), i8* nonnull bitcast ({ i8*, i64 } (i8*, i1)* @test_1.thunk to i8*), i8* nonnull %[[VAL_3]], i64 2, i64 0) -// CHECK: %[[VAL_4:.*]] = bitcast [2 x i8]* %[[VAL_2]] to i1* -// CHECK: %[[VAL_5:.*]] = load i1, i1* %[[VAL_4]], align 8 -// CHECK: %[[VAL_6:.*]] = getelementptr inbounds [2 x i8], [2 x i8]* %[[VAL_2]], i64 0, i64 1 -// CHECK: %[[VAL_7:.*]] = bitcast i8* %[[VAL_6]] to i1* -// CHECK: %[[VAL_8:.*]] = load i1, i1* %[[VAL_7]], align 1 -// CHECK: %[[VAL_9:.*]] = getelementptr inbounds { i1, i1 }, { i1, i1 }* %[[VAL_0]], i64 0, i32 0 -// CHECK: store i1 %[[VAL_5]], i1* %[[VAL_9]], align 1 -// CHECK: %[[VAL_10:.*]] = getelementptr inbounds { i1, i1 }, { i1, i1 }* %[[VAL_0]], i64 0, i32 1 -// CHECK: store i1 %[[VAL_8]], i1* %[[VAL_10]], align 1 -// CHECK: ret void +// CHECK: %[[VAL_4:.*]] = load i16, i16* %[[VAL_2]], align 8 +// CHECK: ret i16 %[[VAL_4]] // CHECK: } - +// struct{i16, f32, f64, i64} -> sret ptr func.func @__nvqpp__mlirgen__test_2() -> !cc.struct<{i16, f32, f64, i64}> { %rv = cc.undef !cc.struct<{i16, f32, f64, i64}> %c1 = arith.constant 8 : i16 @@ -200,10 +192,8 @@ func.func @test_2(%1: !cc.ptr> {llvm.sret = !cc return } -// CHECK-LABEL: define void @__nvqpp__mlirgen__test_2({ i16, float, double, i64 }* nocapture writeonly sret({ i16, float, double, i64 }) -// CHECK-SAME: %[[VAL_0:.*]]) {{.*}}{ -// CHECK: store { i16, float, double, 
i64 } { i16 8, float 0x40159999A0000000, double 3.783000e+01, i64 1479 }, { i16, float, double, i64 }* %[[VAL_0]], align 8 -// CHECK: ret void +// CHECK-LABEL: define { i16, float, double, i64 } @__nvqpp__mlirgen__test_2() +// CHECK: ret { i16, float, double, i64 } { i16 8, float 0x40159999A0000000, double 3.783000e+01, i64 1479 } // CHECK: } // CHECK-LABEL: define void @test_2({ i16, float, double, i64 }* nocapture writeonly sret({ i16, float, double, i64 }) @@ -211,22 +201,12 @@ func.func @test_2(%1: !cc.ptr> {llvm.sret = !cc // CHECK: %[[VAL_2:.*]] = alloca { { i16, float, double, i64 } }, align 8 // CHECK: %[[VAL_3:.*]] = bitcast { { i16, float, double, i64 } }* %[[VAL_2]] to i8* // CHECK: call void @altLaunchKernel(i8* nonnull getelementptr inbounds ([7 x i8], [7 x i8]* @test_2.kernelName, i64 0, i64 0), i8* nonnull bitcast ({ i8*, i64 } (i8*, i1)* @test_2.thunk to i8*), i8* nonnull %[[VAL_3]], i64 24, i64 0) -// CHECK: %[[VAL_4:.*]] = getelementptr inbounds { { i16, float, double, i64 } }, { { i16, float, double, i64 } }* %[[VAL_2]], i64 0, i32 0, i32 0 -// CHECK: %[[VAL_5:.*]] = load i16, i16* %[[VAL_4]], align 8 -// CHECK: %[[VAL_6:.*]] = insertvalue { i16, float, double, i64 } poison, i16 %[[VAL_5]], 0 -// CHECK: %[[VAL_7:.*]] = getelementptr inbounds { { i16, float, double, i64 } }, { { i16, float, double, i64 } }* %[[VAL_2]], i64 0, i32 0, i32 1 -// CHECK: %[[VAL_8:.*]] = load float, float* %[[VAL_7]], align 4 -// CHECK: %[[VAL_9:.*]] = insertvalue { i16, float, double, i64 } %[[VAL_6]], float %[[VAL_8]], 1 -// CHECK: %[[VAL_10:.*]] = getelementptr inbounds { { i16, float, double, i64 } }, { { i16, float, double, i64 } }* %[[VAL_2]], i64 0, i32 0, i32 2 -// CHECK: %[[VAL_11:.*]] = load double, double* %[[VAL_10]], align 8 -// CHECK: %[[VAL_12:.*]] = insertvalue { i16, float, double, i64 } %[[VAL_9]], double %[[VAL_11]], 2 -// CHECK: %[[VAL_13:.*]] = getelementptr inbounds { { i16, float, double, i64 } }, { { i16, float, double, i64 } }* %[[VAL_2]], i64 0, i32 0, i32 3 -// CHECK: %[[VAL_14:.*]] = load i64, i64* %[[VAL_13]], align 8 -// CHECK: %[[VAL_15:.*]] = insertvalue { i16, float, double, i64 } %[[VAL_12]], i64 %[[VAL_14]], 3 -// CHECK: store { i16, float, double, i64 } %[[VAL_15]], { i16, float, double, i64 }* %[[VAL_0]], align 8 +// CHECK: %[[VAL_4:.*]] = bitcast { i16, float, double, i64 }* %[[VAL_0]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* noundef nonnull align 8 dereferenceable(24) %[[VAL_4]], i8* noundef nonnull align 8 dereferenceable(24) %[[VAL_3]], i64 24, i1 false) // CHECK: ret void // CHECK: } +// array -> sret ptr func.func @__nvqpp__mlirgen__test_3() -> !cc.array { %rv = cc.undef !cc.array %c1 = arith.constant 5 : i64 @@ -246,19 +226,8 @@ func.func @test_3(%1: !cc.ptr> {llvm.sret = !cc.array> {llvm.sret = !cc.array { i64, f64 } func.func @__nvqpp__mlirgen__test_4() -> (i64, f64) { %c1 = arith.constant 537892 : i64 %c2 = arith.constant 94.2134 : f64 return %c1, %c2 : i64, f64 } -func.func @test_4(%1: !cc.ptr> {llvm.sret = !cc.struct<{i64, f64}>}, %this: !cc.ptr) { +func.func @test_4(%sret: !cc.ptr> {llvm.sret = !cc.struct<{i64, f64}>}, %this: !cc.ptr) { return } -// CHECK-LABEL: define void @__nvqpp__mlirgen__test_4({ i64, double }* nocapture writeonly sret({ i64, double }) -// CHECK-SAME: %[[VAL_0:.*]]) {{.*}}{ -// CHECK: %[[VAL_1:.*]] = getelementptr inbounds { i64, double }, { i64, double }* %[[VAL_0]], i64 0, i32 0 -// CHECK: store i64 537892, i64* %[[VAL_1]], align 8 -// CHECK: %[[VAL_2:.*]] = getelementptr { i64, double }, { i64, double 
}* %[[VAL_0]], i64 0, i32 1
-// CHECK: store double 0x40578DA858793DD9, double* %[[VAL_2]], align 8
-// CHECK: ret void
+// CHECK-LABEL: define { i64, double } @__nvqpp__mlirgen__test_4() {{.*}}{
+// CHECK: ret { i64, double } { i64 537892, double 0x40578DA858793DD9 }
 // CHECK: }
 // CHECK-LABEL: define void @test_4({ i64, double }* nocapture writeonly sret({ i64, double })
@@ -313,14 +260,8 @@ func.func @test_4(%1: !cc.ptr<!cc.struct<{i64, f64}>> {llvm.sret = !cc.struct<{i
 // CHECK: %[[VAL_2:.*]] = alloca { i64, double }, align 8
 // CHECK: %[[VAL_3:.*]] = bitcast { i64, double }* %[[VAL_2]] to i8*
 // CHECK: call void @altLaunchKernel(i8* nonnull getelementptr inbounds ([7 x i8], [7 x i8]* @test_4.kernelName, i64 0, i64 0), i8* nonnull bitcast ({ i8*, i64 } (i8*, i1)* @test_4.thunk to i8*), i8* nonnull %[[VAL_3]], i64 16, i64 0)
-// CHECK: %[[VAL_4:.*]] = getelementptr inbounds { i64, double }, { i64, double }* %[[VAL_2]], i64 0, i32 0
-// CHECK: %[[VAL_5:.*]] = load i64, i64* %[[VAL_4]], align 8
-// CHECK: %[[VAL_6:.*]] = getelementptr inbounds { i64, double }, { i64, double }* %[[VAL_0]], i64 0, i32 0
-// CHECK: store i64 %[[VAL_5]], i64* %[[VAL_6]], align 8
-// CHECK: %[[VAL_7:.*]] = getelementptr inbounds { i64, double }, { i64, double }* %[[VAL_2]], i64 0, i32 1
-// CHECK: %[[VAL_8:.*]] = load double, double* %[[VAL_7]], align 8
-// CHECK: %[[VAL_9:.*]] = getelementptr { i64, double }, { i64, double }* %[[VAL_0]], i64 0, i32 1
-// CHECK: store double %[[VAL_8]], double* %[[VAL_9]], align 8
+// CHECK: %[[VAL_4:.*]] = bitcast { i64, double }* %[[VAL_0]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* noundef nonnull align 8 dereferenceable(16) %[[VAL_4]], i8* noundef nonnull align 8 dereferenceable(16) %[[VAL_3]], i64 16, i1 false)
 // CHECK: ret void
 // CHECK: }
@@ -330,17 +271,12 @@ func.func @__nvqpp__mlirgen__test_5() -> (i64, f64) attributes {no_this} {
 return %c1, %c2 : i64, f64
 }
-func.func @test_5(%0: !cc.ptr<!cc.struct<{i64, f64}>> {llvm.sret = !cc.struct<{i64, f64}>}) {
+func.func @test_5(%sret: !cc.ptr<!cc.struct<{i64, f64}>> {llvm.sret = !cc.struct<{i64, f64}>}) {
 return
 }
-// CHECK-LABEL: define void @__nvqpp__mlirgen__test_5({ i64, double }* nocapture writeonly sret({ i64, double })
-// CHECK-SAME: %[[VAL_0:.*]]) {{.*}}{
-// CHECK: %[[VAL_1:.*]] = getelementptr inbounds { i64, double }, { i64, double }* %[[VAL_0]], i64 0, i32 0
-// CHECK: store i64 537892, i64* %[[VAL_1]], align 8
-// CHECK: %[[VAL_2:.*]] = getelementptr { i64, double }, { i64, double }* %[[VAL_0]], i64 0, i32 1
-// CHECK: store double 0x40578DA858793DD9, double* %[[VAL_2]], align 8
-// CHECK: ret void
+// CHECK-LABEL: define { i64, double } @__nvqpp__mlirgen__test_5() {{.*}}{
+// CHECK: ret { i64, double } { i64 537892, double 0x40578DA858793DD9 }
 // CHECK: }
 // CHECK-LABEL: define void @test_5({ i64, double }* nocapture writeonly sret({ i64, double })
@@ -348,14 +284,8 @@ func.func @test_5(%0: !cc.ptr<!cc.struct<{i64, f64}>> {llvm.sret = !cc.struct<{i
 // CHECK: %[[VAL_1:.*]] = alloca { i64, double }, align 8
 // CHECK: %[[VAL_2:.*]] = bitcast { i64, double }* %[[VAL_1]] to i8*
 // CHECK: call void @altLaunchKernel(i8* nonnull getelementptr inbounds ([7 x i8], [7 x i8]* @test_5.kernelName, i64 0, i64 0), i8* nonnull bitcast ({ i8*, i64 } (i8*, i1)* @test_5.thunk to i8*), i8* nonnull %[[VAL_2]], i64 16, i64 0)
-// CHECK: %[[VAL_3:.*]] = getelementptr inbounds { i64, double }, { i64, double }* %[[VAL_1]], i64 0, i32 0
-// CHECK: %[[VAL_4:.*]] = load i64, i64* %[[VAL_3]], align 8
-// CHECK: %[[VAL_5:.*]] = getelementptr inbounds { i64, double }, { i64, double }* %[[VAL_0]], i64 0, i32 0
-// CHECK: store i64 %[[VAL_4]], i64* %[[VAL_5]], align 8
-// CHECK: %[[VAL_6:.*]] = getelementptr inbounds { i64, double }, { i64, double }* %[[VAL_1]], i64 0, i32 1
-// CHECK: %[[VAL_7:.*]] = load double, double* %[[VAL_6]], align 8
-// CHECK: %[[VAL_8:.*]] = getelementptr { i64, double }, { i64, double }* %[[VAL_0]], i64 0, i32 1
-// CHECK: store double %[[VAL_7]], double* %[[VAL_8]], align 8
+// CHECK: %[[VAL_3:.*]] = bitcast { i64, double }* %[[VAL_0]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* noundef nonnull align 8 dereferenceable(16) %[[VAL_3]], i8* noundef nonnull align 8 dereferenceable(16) %[[VAL_2]], i64 16, i1 false)
 // CHECK: ret void
 // CHECK: }
@@ -371,7 +301,6 @@ func.func @test_5(%0: !cc.ptr<!cc.struct<{i64, f64}>> {llvm.sret = !cc.struct<{i
 // CHECK-SAME: %[[VAL_0:.*]], i1 %[[VAL_1:.*]]) {
 // CHECK: %[[VAL_2:.*]] = bitcast i8* %[[VAL_0]] to i32*
 // CHECK: %[[VAL_3:.*]] = load i32, i32* %[[VAL_2]], align 4
-// CHECK: %[[VAL_4:.*]] = getelementptr i8, i8* %[[VAL_0]], i64 8
 // CHECK: %[[VAL_5:.*]] = sext i32 %[[VAL_3]] to i64
 // CHECK: %[[VAL_6:.*]] = tail call %[[VAL_7:.*]]* @__quantum__rt__qubit_allocate_array(i64 %[[VAL_5]])
 // CHECK: %[[VAL_8:.*]] = tail call i64 @__quantum__rt__array_get_size_1d(%[[VAL_7]]* %[[VAL_6]])
@@ -392,7 +321,7 @@ func.func @test_5(%0: !cc.ptr<!cc.struct<{i64, f64}>> {llvm.sret = !cc.struct<{i
 // CHECK: ._crit_edge: ; preds = %[[VAL_10]]
 // CHECK: %[[VAL_23:.*]] = alloca i8, i64 %[[VAL_8]], align 1
 // CHECK: br i1 %[[VAL_9]], label %[[VAL_24:.*]], label %[[VAL_14]]
-// CHECK: .lr.ph4: ; preds = %[[VAL_22]], %[[VAL_24]]
+// CHECK: [[VAL_24]]: ; preds = %[[VAL_22]], %[[VAL_24]]
 // CHECK: %[[VAL_25:.*]] = phi i64 [ %[[VAL_26:.*]], %[[VAL_24]] ], [ 0, %[[VAL_22]] ]
 // CHECK: %[[VAL_27:.*]] = tail call i8* @__quantum__rt__array_get_element_ptr_1d(%[[VAL_7]]* %[[VAL_6]], i64 %[[VAL_25]])
 // CHECK: %[[VAL_28:.*]] = bitcast i8* %[[VAL_27]] to %[[VAL_19]]**
@@ -406,20 +335,21 @@ func.func @test_5(%0: !cc.ptr<!cc.struct<{i64, f64}>> {llvm.sret = !cc.struct<{i
 // CHECK: %[[VAL_26]] = add nuw nsw i64 %[[VAL_25]], 1
 // CHECK: %[[VAL_36:.*]] = icmp eq i64 %[[VAL_26]], %[[VAL_8]]
 // CHECK: br i1 %[[VAL_36]], label %[[VAL_14]], label %[[VAL_24]]
-// CHECK: ._crit_edge5: ; preds = %[[VAL_24]], %[[VAL_11]], %[[VAL_22]]
+// CHECK: [[VAL_14]]: ; preds = %[[VAL_24]], %[[VAL_11]], %[[VAL_22]]
 // CHECK: %[[VAL_37:.*]] = phi i8* [ %[[VAL_13]], %[[VAL_11]] ], [ %[[VAL_23]], %[[VAL_22]] ], [ %[[VAL_23]], %[[VAL_24]] ]
 // CHECK: %[[VAL_38:.*]] = call i8* @__nvqpp_vectorCopyCtor(i8* nonnull %[[VAL_37]], i64 %[[VAL_8]], i64 1)
 // CHECK: call void @__quantum__rt__qubit_release_array(%[[VAL_7]]* %[[VAL_6]])
-// CHECK: %[[VAL_39:.*]] = bitcast i8* %[[VAL_4]] to i8**
-// CHECK: store i8* %[[VAL_38]], i8** %[[VAL_39]], align 8
-// CHECK: %[[VAL_40:.*]] = getelementptr i8, i8* %[[VAL_0]], i64 16
-// CHECK: %[[VAL_41:.*]] = bitcast i8* %[[VAL_40]] to i64*
-// CHECK: store i64 %[[VAL_8]], i64* %[[VAL_41]], align 4
+// CHECK: %[[VAL_50:.*]] = getelementptr i8, i8* %[[VAL_0]], i64 8
+// CHECK: %[[VAL_51:.*]] = bitcast i8* %[[VAL_50]] to i8**
+// CHECK: store i8* %[[VAL_38]], i8** %[[VAL_51]], align 8
+// CHECK: %[[VAL_52:.*]] = getelementptr i8, i8* %[[VAL_0]], i64 16
+// CHECK: %[[VAL_53:.*]] = bitcast i8* %[[VAL_52]] to i64*
+// CHECK: store i64 %[[VAL_8]], i64* %[[VAL_53]], align 8
 // CHECK: br i1 %[[VAL_1]], label %[[VAL_42:.*]], label %[[VAL_43:.*]]
-// CHECK: common.ret: ; preds = %[[VAL_14]], %[[VAL_42]]
+// CHECK: [[VAL_43]]: ; preds = %[[VAL_14]], %[[VAL_42]]
 // CHECK: %[[VAL_44:.*]] = phi { i8*, i64 } [ %[[VAL_45:.*]], %[[VAL_42]] ], [ zeroinitializer, %[[VAL_14]] ]
 // CHECK: ret { i8*, i64 } %[[VAL_44]]
-// CHECK: 32: ; preds = %[[VAL_14]]
+// CHECK: [[VAL_42]]: ; preds = %[[VAL_14]]
 // CHECK: %[[VAL_46:.*]] = add i64 %[[VAL_8]], 24
 // CHECK: %[[VAL_47:.*]] = call i8* @malloc(i64 %[[VAL_46]])
 // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* noundef nonnull align 1 dereferenceable(24) %[[VAL_47]], i8* noundef nonnull align 1 dereferenceable(24) %[[VAL_0]], i64 24, i1 false)
diff --git a/test/Quake/return_vector.qke b/test/Quake/return_vector.qke
index 23a718bcc5..a13d0b6abe 100644
--- a/test/Quake/return_vector.qke
+++ b/test/Quake/return_vector.qke
@@ -29,16 +29,12 @@ func.func @test_0(%0: !cc.ptr, !cc.ptr, !cc.ptr, i64}>> {llvm.sret = !cc.struct<{!cc.ptr, i64}>}, %[[VAL_1:.*]]: i32) {
+// CHECK-SAME: %[[VAL_1:.*]]: i32) -> !cc.stdvec {
 // CHECK-DAG: %[[VAL_2:.*]] = arith.constant 8 : i64
 // CHECK-DAG: %[[VAL_3:.*]] = arith.constant 256 : i64
 // CHECK: %[[VAL_4:.*]] = call @malloc(%[[VAL_3]]) : (i64) -> !cc.ptr
-// CHECK: %[[VAL_5:.*]] = cc.cast %[[VAL_4]] : (!cc.ptr) -> !cc.ptr
-// CHECK: %[[VAL_9:.*]] = cc.cast %[[VAL_0]] : (!cc.ptr, i64}>>) -> !cc.ptr>
-// CHECK: cc.store %[[VAL_5]], %[[VAL_9]] : !cc.ptr>
-// CHECK: %[[VAL_7:.*]] = cc.compute_ptr %[[VAL_0]][1] : (!cc.ptr, i64}>>) -> !cc.ptr
-// CHECK: cc.store %[[VAL_2]], %[[VAL_7]] : !cc.ptr
-// CHECK: return
+// CHECK: %[[VAL_5:.*]] = cc.stdvec_init %[[VAL_4]], %[[VAL_2]] : (!cc.ptr, i64) -> !cc.stdvec
+// CHECK: return %[[VAL_5]] : !cc.stdvec
 // CHECK: }
 // CHECK-LABEL: func.func @test_0(
@@ -93,15 +89,11 @@ func.func @test_1(%0: !cc.ptr, !cc.ptr, !cc.ptr, i64}>> {llvm.sret = !cc.struct<{!cc.ptr, i64}>}, %[[VAL_1:.*]]: i32) {
+// CHECK-SAME: %[[VAL_1:.*]]: i32) -> !cc.stdvec {
 // CHECK: %[[VAL_2:.*]] = arith.constant 9 : i64
 // CHECK: %[[VAL_3:.*]] = arith.constant 520 : i64
 // CHECK: %[[VAL_4:.*]] = call @malloc(%[[VAL_3]]) : (i64) -> !cc.ptr
-// CHECK: %[[VAL_5:.*]] = cc.cast %[[VAL_4]] : (!cc.ptr) -> !cc.ptr
-// CHECK: %[[VAL_8:.*]] = cc.cast %[[VAL_0]] : (!cc.ptr, i64}>>) -> !cc.ptr>
-// CHECK: cc.store %[[VAL_5]], %[[VAL_8]] : !cc.ptr>
-// CHECK: %[[VAL_7:.*]] = cc.compute_ptr %[[VAL_0]][1] : (!cc.ptr, i64}>>) -> !cc.ptr
-// CHECK: cc.store %[[VAL_2]], %[[VAL_7]] : !cc.ptr
+// CHECK: %[[VAL_5:.*]] = cc.stdvec_init %[[VAL_4]], %[[VAL_2]] : (!cc.ptr, i64) -> !cc.stdvec
 // CHECK: return
 // CHECK: }
@@ -151,13 +143,13 @@ func.func @test_1(%0: !cc.ptr, !cc.ptr, !cc.ptr) -> !cc.ptr, i64}>}>>
 // CHECK: %[[VAL_4:.*]] = cc.load %[[VAL_3]] : !cc.ptr, i64}>}>>
 // CHECK: %[[VAL_7:.*]] = cc.sizeof !cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}> : i64
-// CHECK: %[[VAL_8:.*]] = cc.compute_ptr %[[VAL_3]][1] : (!cc.ptr, i64}>}>>) -> !cc.ptr, i64}>>
-// CHECK: %[[VAL_9:.*]] = cc.cast %[[VAL_8]] : (!cc.ptr, i64}>>) -> !cc.ptr, i64}>>
 // CHECK: %[[VAL_10:.*]] = cc.extract_value %[[VAL_4]][0] : (!cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}>) -> i32
-// CHECK: call @__nvqpp__mlirgen__test_0(%[[VAL_9]], %[[VAL_10]]) : (!cc.ptr, i64}>>, i32) -> ()
+// CHECK: %[[VAL_15:.*]] = call @__nvqpp__mlirgen__test_0(%[[VAL_10]]) : (i32) -> !cc.stdvec
 // CHECK: cf.cond_br %[[VAL_1]], ^bb1, ^bb2
 // CHECK: ^bb1:
 // CHECK: %[[VAL_11:.*]] = cc.compute_ptr %[[VAL_3]][1] : (!cc.ptr, i64}>}>>) -> !cc.ptr, i64}>
+// CHECK: %[[VAL_16:.*]] = cc.cast %[[VAL_11]] : (!cc.ptr, i64}>>) -> !cc.ptr>
+// CHECK: cc.store %[[VAL_15]], %[[VAL_16]] : !cc.ptr>
 // CHECK: %[[VAL_12:.*]] = cc.cast %[[VAL_11]] : (!cc.ptr, i64}>>) -> !cc.ptr, i64}>>
 // CHECK: %[[VAL_13:.*]] = call @__nvqpp_createDynamicResult(%[[VAL_0]], %[[VAL_7]], %[[VAL_12]]) : (!cc.ptr, i64, !cc.ptr, i64}>>) -> !cc.struct<{!cc.ptr, i64}>
 // CHECK: return %[[VAL_13]] : !cc.struct<{!cc.ptr, i64}>
@@ -171,13 +163,13 @@ func.func @test_1(%0: !cc.ptr, !cc.ptr, !cc.ptr) -> !cc.ptr, i64}>}>>
 // CHECK: %[[VAL_4:.*]] = cc.load %[[VAL_3]] : !cc.ptr, i64}>}>>
 // CHECK: %[[VAL_7:.*]] = cc.sizeof !cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}> : i64
-// CHECK: %[[VAL_8:.*]] = cc.compute_ptr %[[VAL_3]][1] : (!cc.ptr, i64}>}>>) -> !cc.ptr, i64}>>
-// CHECK: %[[VAL_9:.*]] = cc.cast %[[VAL_8]] : (!cc.ptr, i64}>>) -> !cc.ptr, i64}>>
 // CHECK: %[[VAL_10:.*]] = cc.extract_value %[[VAL_4]][0] : (!cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}>) -> i32
-// CHECK: call @__nvqpp__mlirgen__test_1(%[[VAL_9]], %[[VAL_10]]) : (!cc.ptr, i64}>>, i32) -> ()
+// CHECK: %[[VAL_15:.*]] = call @__nvqpp__mlirgen__test_1(%[[VAL_10]]) : (i32) -> !cc.stdvec
 // CHECK: cf.cond_br %[[VAL_1]], ^bb1, ^bb2
 // CHECK: ^bb1:
 // CHECK: %[[VAL_11:.*]] = cc.compute_ptr %[[VAL_3]][1] : (!cc.ptr, i64}>}>>) -> !cc.ptr, i64}>>
+// CHECK: %[[VAL_16:.*]] = cc.cast %[[VAL_11]] : (!cc.ptr, i64}>>) -> !cc.ptr>
+// CHECK: cc.store %[[VAL_15]], %[[VAL_16]] : !cc.ptr>
 // CHECK: %[[VAL_12:.*]] = cc.cast %[[VAL_11]] : (!cc.ptr, i64}>>) -> !cc.ptr, i64}>>
 // CHECK: %[[VAL_13:.*]] = call @__nvqpp_createDynamicResult(%[[VAL_0]], %[[VAL_7]], %[[VAL_12]]) : (!cc.ptr, i64, !cc.ptr, i64}>>) -> !cc.struct<{!cc.ptr, i64}>
 // CHECK: return %[[VAL_13]] : !cc.struct<{!cc.ptr, i64}>