From e8ebb387facc67e5cf8903458eeb44c2d44d7159 Mon Sep 17 00:00:00 2001
From: Eric Schweitz <eschweitz@nvidia.com>
Date: Tue, 15 Oct 2024 11:35:00 -0700
Subject: [PATCH 1/2] Start changing core code over to use SmallVector (and be
 more compatible with MLIR) instead of std::vector.  Refactoring lifting of
 array values.  Implement globalize array values. Update new pass and pass
 names across all the config and yaml files.

Signed-off-by: Eric Schweitz <eschweitz@nvidia.com>
---
 include/cudaq/Optimizer/Builder/Intrinsics.h  |  59 ++--
 include/cudaq/Optimizer/Transforms/Passes.td  |  28 +-
 lib/Optimizer/Builder/Intrinsics.cpp          |  66 +++--
 lib/Optimizer/CodeGen/Pipelines.cpp           |   3 +-
 lib/Optimizer/Transforms/CMakeLists.txt       |   1 +
 lib/Optimizer/Transforms/ConstPropComplex.cpp |  39 +--
 .../Transforms/GlobalizeArrayValues.cpp       | 138 +++++++++
 lib/Optimizer/Transforms/LiftArrayAlloc.cpp   | 212 ++++----------
 lib/Optimizer/Transforms/QuakeSynthesizer.cpp |  78 +++--
 .../cudaq/platform/py_alt_launch_kernel.cpp   |  19 +-
 python/runtime/mlir/py_register_dialects.cpp  |  17 +-
 runtime/common/ArgumentConversion.cpp         |   2 +-
 runtime/common/BaseRemoteRESTQPU.h            |   2 +-
 runtime/cudaq/platform/default/opt-test.yml   |   6 +-
 .../default/rest/helpers/anyon/anyon.yml      |   2 +-
 .../default/rest/helpers/ionq/ionq.yml        |   2 +-
 .../platform/default/rest/helpers/iqm/iqm.yml |   2 +-
 .../platform/default/rest/helpers/oqc/oqc.yml |   2 +-
 .../rest/helpers/quantinuum/quantinuum.yml    |   2 +-
 .../Remote-Sim/qvector_init_from_vector.cpp   | 274 ++++++++++--------
 .../RegressionValidation/anyon.config         |   4 +-
 .../RegressionValidation/ionq.config          |   2 +-
 .../RegressionValidation/iqm.config           |   2 +-
 .../RegressionValidation/oqc.config           |   2 +-
 .../RegressionValidation/quantinuum.config    |   2 +-
 test/AST-Quake/custom_op_concrete_matrix.cpp  |  10 +-
 test/Quake/lift_array.qke                     |  63 +++-
 tools/nvqpp/nvq++.in                          |   2 +-
 28 files changed, 586 insertions(+), 455 deletions(-)
 create mode 100644 lib/Optimizer/Transforms/GlobalizeArrayValues.cpp
diff --git a/include/cudaq/Optimizer/Builder/Intrinsics.h b/include/cudaq/Optimizer/Builder/Intrinsics.h
index 30ab0e696a..fa9ce53097 100644
--- a/include/cudaq/Optimizer/Builder/Intrinsics.h
+++ b/include/cudaq/Optimizer/Builder/Intrinsics.h
@@ -94,41 +94,48 @@ class IRBuilder : public mlir::OpBuilder {
     return genCStringLiteral(loc, module, buffer);
   }
 
+  cc::GlobalOp genVectorOfConstants(mlir::Location loc, mlir::ModuleOp module,
+                                    llvm::StringRef name,
+                                    mlir::DenseElementsAttr values,
+                                    mlir::Type elementType);
+  cc::GlobalOp genVectorOfConstants(
+      mlir::Location loc, mlir::ModuleOp module, llvm::StringRef name,
+      const llvm::SmallVectorImpl<std::complex<double>> &values);
+  cc::GlobalOp genVectorOfConstants(
+      mlir::Location loc, mlir::ModuleOp module, llvm::StringRef name,
+      const llvm::SmallVectorImpl<std::complex<float>> &values);
+
   cc::GlobalOp
   genVectorOfConstants(mlir::Location loc, mlir::ModuleOp module,
-                       mlir::StringRef name,
-                       const std::vector<std::complex<double>> &values);
-  cc::GlobalOp
-  genVectorOfConstants(mlir::Location loc, mlir::ModuleOp module,
-                       mlir::StringRef name,
-                       const std::vector<std::complex<float>> &values);
-
+                       llvm::StringRef name,
+                       const llvm::SmallVectorImpl<double> &values);
   cc::GlobalOp genVectorOfConstants(mlir::Location loc, mlir::ModuleOp module,
-                                    mlir::StringRef name,
-                                    const std::vector<double> &values);
-  cc::GlobalOp genVectorOfConstants(mlir::Location loc, mlir::ModuleOp module,
-                                    mlir::StringRef name,
-                                    const std::vector<float> &values);
+                                    llvm::StringRef name,
+                                    const llvm::SmallVectorImpl<float> &values);
 
-  cc::GlobalOp genVectorOfConstants(mlir::Location loc, mlir::ModuleOp module,
-                                    mlir::StringRef name,
-                                    const std::vector<std::int64_t> &values);
+  cc::GlobalOp
+  genVectorOfConstants(mlir::Location loc, mlir::ModuleOp module,
+                       llvm::StringRef name,
+                       const llvm::SmallVectorImpl<std::int64_t> &values);
 
-  cc::GlobalOp genVectorOfConstants(mlir::Location loc, mlir::ModuleOp module,
-                                    mlir::StringRef name,
-                                    const std::vector<std::int32_t> &values);
+  cc::GlobalOp
+  genVectorOfConstants(mlir::Location loc, mlir::ModuleOp module,
+                       llvm::StringRef name,
+                       const llvm::SmallVectorImpl<std::int32_t> &values);
 
-  cc::GlobalOp genVectorOfConstants(mlir::Location loc, mlir::ModuleOp module,
-                                    mlir::StringRef name,
-                                    const std::vector<std::int16_t> &values);
+  cc::GlobalOp
+  genVectorOfConstants(mlir::Location loc, mlir::ModuleOp module,
+                       llvm::StringRef name,
+                       const llvm::SmallVectorImpl<std::int16_t> &values);
 
-  cc::GlobalOp genVectorOfConstants(mlir::Location loc, mlir::ModuleOp module,
-                                    mlir::StringRef name,
-                                    const std::vector<std::int8_t> &values);
+  cc::GlobalOp
+  genVectorOfConstants(mlir::Location loc, mlir::ModuleOp module,
+                       llvm::StringRef name,
+                       const llvm::SmallVectorImpl<std::int8_t> &values);
 
   cc::GlobalOp genVectorOfConstants(mlir::Location loc, mlir::ModuleOp module,
-                                    mlir::StringRef name,
-                                    const std::vector<bool> &values);
+                                    llvm::StringRef name,
+                                    const llvm::SmallVectorImpl<bool> &values);
 
   /// Load an intrinsic into \p module. The intrinsic to load has name \p name.
   /// This will automatically load any intrinsics that \p name depends upon.
diff --git a/include/cudaq/Optimizer/Transforms/Passes.td b/include/cudaq/Optimizer/Transforms/Passes.td
index 9ca3810f39..da6f3163b3 100644
--- a/include/cudaq/Optimizer/Transforms/Passes.td
+++ b/include/cudaq/Optimizer/Transforms/Passes.td
@@ -160,7 +160,7 @@ def CombineQuantumAllocations :
   let dependentDialects = ["cudaq::cc::CCDialect", "quake::QuakeDialect"];
 }
 
-def ConstPropComplex : Pass<"const-prop-complex", "mlir::ModuleOp"> {
+def ConstPropComplex : Pass<"const-prop-complex", "mlir::func::FuncOp"> {
   let summary = "Create and propagate complex constants.";
   let description = [{
     Rewrite the complex.CreateOp to complex.ConstantOp when possible.
@@ -383,11 +383,11 @@ def GenerateKernelExecution : Pass<"kernel-execution", "mlir::ModuleOp"> {
 
 def GetConcreteMatrix : Pass<"get-concrete-matrix", "mlir::func::FuncOp"> {
   let summary =
-    "Replace the unitary matrix generator function with concrete matrix.";
+    "Replace the unitary matrix generator function with a constant matrix.";
   let description = [{
     Given a custom operation whose generator attribute is another function 
     within the module, such that if `LiftArrayAlloc` pass has run, there will
-    be a global constant within the module which holds the concrete matrix 
+    be a global constant within the module which holds the constant matrix 
     representation for the custom operation. This pass will find that global
     variable and update the custom operation to directly point to it. 
 
@@ -424,6 +424,22 @@ def GetConcreteMatrix : Pass<"get-concrete-matrix", "mlir::func::FuncOp"> {
   }];
 }
 
+// GlobalizeArrayValues must be a module pass because it may promote array
+// constants from functions to global constants (changes their scope).
+def GlobalizeArrayValues : Pass<"globalize-array-values", "mlir::ModuleOp"> {
+  let summary = "Convert const_array ops to globals.";
+  let description = [{
+    Often a `const_array` op can be canonicalized into scalar constants that
+    are then constant propagated to their uses in the quake ops. When this
+    happens, the `const_array` may become unused and be eliminated.
+
+    However, there can also be cases where the `const_array` remains alive, such
+    as when it is used in a `state_init` op. In such cases, we may be able to go
+    ahead and replace the `const_array` with a global constant. This pass makes
+    such conversions.
+  }];
+}
+
 // LambdaLifting is a module pass because it may modify the ModuleOp and add
 // new FuncOps.
 def LambdaLifting : Pass<"lambda-lifting", "mlir::ModuleOp"> {
@@ -439,7 +455,7 @@ def LambdaLifting : Pass<"lambda-lifting", "mlir::ModuleOp"> {
   let constructor = "cudaq::opt::createLambdaLiftingPass()";
 }
 
-def LiftArrayAlloc : Pass<"lift-array-value", "mlir::ModuleOp"> {
+def LiftArrayAlloc : Pass<"lift-array-alloc", "mlir::func::FuncOp"> {
   let summary = "Convert constant arrays built on the stack to array values";
   let description = [{
     The bridge or other passes may generate inline code to build an array of
@@ -476,6 +492,10 @@ def LiftArrayAlloc : Pass<"lift-array-value", "mlir::ModuleOp"> {
     updated or escapes the function, it cannot be replaced by a value. If
     it is elements are accessed in a read-only way, it is a legal transform
     and will enable further constant folding in other passes.
+
+    See the globalize array values pass for converting `const_array` values
+    to global constants. Conversion to globals is intentionally deferred to
+    allow constant propagation to take place correctly.
   }];
 
   let dependentDialects = ["mlir::complex::ComplexDialect"];
diff --git a/lib/Optimizer/Builder/Intrinsics.cpp b/lib/Optimizer/Builder/Intrinsics.cpp
index a6cc0ae477..315743f057 100644
--- a/lib/Optimizer/Builder/Intrinsics.cpp
+++ b/lib/Optimizer/Builder/Intrinsics.cpp
@@ -450,16 +450,16 @@ LogicalResult IRBuilder::loadIntrinsic(ModuleOp module, StringRef intrinName) {
 }
 
 template <typename T>
-DenseElementsAttr createDenseElementsAttr(const std::vector<T> &values,
+DenseElementsAttr createDenseElementsAttr(const SmallVectorImpl<T> &values,
                                           Type eleTy) {
   auto newValues = ArrayRef<T>(values.data(), values.size());
   auto tensorTy = RankedTensorType::get(values.size(), eleTy);
   return DenseElementsAttr::get(tensorTy, newValues);
 }
 
-DenseElementsAttr createDenseElementsAttr(const std::vector<bool> &values,
-                                          Type eleTy) {
-  std::vector<std::byte> converted;
+static DenseElementsAttr
+createDenseElementsAttr(const SmallVectorImpl<bool> &values, Type eleTy) {
+  SmallVector<std::byte> converted;
   for (auto it = values.begin(); it != values.end(); it++) {
     bool value = *it;
     converted.push_back(std::byte(value));
@@ -470,83 +470,99 @@ DenseElementsAttr createDenseElementsAttr(const std::vector<bool> &values,
   return DenseElementsAttr::get(tensorTy, newValues);
 }
 
-template <typename A>
-cc::GlobalOp buildVectorOfConstantElements(Location loc, ModuleOp module,
-                                           StringRef name,
-                                           const std::vector<A> &values,
-                                           IRBuilder &builder, Type eleTy) {
+static cc::GlobalOp buildVectorOfConstantElements(Location loc, ModuleOp module,
+                                                  StringRef name,
+                                                  DenseElementsAttr &arrayAttr,
+                                                  IRBuilder &builder,
+                                                  Type eleTy) {
   if (auto glob = module.lookupSymbol<cc::GlobalOp>(name))
     return glob;
   auto *ctx = builder.getContext();
   OpBuilder::InsertionGuard guard(builder);
   builder.setInsertionPointToEnd(module.getBody());
-  auto globalTy = cc::ArrayType::get(ctx, eleTy, values.size());
-
-  auto arrayAttr = createDenseElementsAttr(values, eleTy);
+  auto globalTy = cc::ArrayType::get(ctx, eleTy, arrayAttr.size());
   return builder.create<cudaq::cc::GlobalOp>(loc, globalTy, name, arrayAttr,
                                              /*constant=*/true,
                                              /*external=*/false);
 }
 
+template <typename A>
+cc::GlobalOp buildVectorOfConstantElements(Location loc, ModuleOp module,
+                                           StringRef name,
+                                           const SmallVectorImpl<A> &values,
+                                           IRBuilder &builder, Type eleTy) {
+  auto arrayAttr = createDenseElementsAttr(values, eleTy);
+  return buildVectorOfConstantElements(loc, module, name, arrayAttr, builder,
+                                       eleTy);
+}
+
+cc::GlobalOp IRBuilder::genVectorOfConstants(Location loc, ModuleOp module,
+                                             StringRef name,
+                                             DenseElementsAttr values,
+                                             Type elementType) {
+  return buildVectorOfConstantElements(loc, module, name, values, *this,
+                                       elementType);
+}
+
 cc::GlobalOp IRBuilder::genVectorOfConstants(
     Location loc, ModuleOp module, StringRef name,
-    const std::vector<std::complex<double>> &values) {
+    const SmallVectorImpl<std::complex<double>> &values) {
   return buildVectorOfConstantElements(loc, module, name, values, *this,
                                        ComplexType::get(getF64Type()));
 }
 
 cc::GlobalOp IRBuilder::genVectorOfConstants(
     Location loc, ModuleOp module, StringRef name,
-    const std::vector<std::complex<float>> &values) {
+    const SmallVectorImpl<std::complex<float>> &values) {
   return buildVectorOfConstantElements(loc, module, name, values, *this,
                                        ComplexType::get(getF32Type()));
 }
 
 cc::GlobalOp
 IRBuilder::genVectorOfConstants(Location loc, ModuleOp module, StringRef name,
-                                const std::vector<double> &values) {
+                                const SmallVectorImpl<double> &values) {
   return buildVectorOfConstantElements(loc, module, name, values, *this,
                                        getF64Type());
 }
 
-cc::GlobalOp IRBuilder::genVectorOfConstants(Location loc, ModuleOp module,
-                                             StringRef name,
-                                             const std::vector<float> &values) {
+cc::GlobalOp
+IRBuilder::genVectorOfConstants(Location loc, ModuleOp module, StringRef name,
+                                const SmallVectorImpl<float> &values) {
   return buildVectorOfConstantElements(loc, module, name, values, *this,
                                        getF32Type());
 }
 
 cc::GlobalOp
 IRBuilder::genVectorOfConstants(Location loc, ModuleOp module, StringRef name,
-                                const std::vector<std::int64_t> &values) {
+                                const SmallVectorImpl<std::int64_t> &values) {
   return buildVectorOfConstantElements(loc, module, name, values, *this,
                                        getI64Type());
 }
 
 cc::GlobalOp
 IRBuilder::genVectorOfConstants(Location loc, ModuleOp module, StringRef name,
-                                const std::vector<std::int32_t> &values) {
+                                const SmallVectorImpl<std::int32_t> &values) {
   return buildVectorOfConstantElements(loc, module, name, values, *this,
                                        getI32Type());
 }
 
 cc::GlobalOp
 IRBuilder::genVectorOfConstants(Location loc, ModuleOp module, StringRef name,
-                                const std::vector<std::int16_t> &values) {
+                                const SmallVectorImpl<std::int16_t> &values) {
   return buildVectorOfConstantElements(loc, module, name, values, *this,
                                        getI16Type());
 }
 
 cc::GlobalOp
 IRBuilder::genVectorOfConstants(Location loc, ModuleOp module, StringRef name,
-                                const std::vector<std::int8_t> &values) {
+                                const SmallVectorImpl<std::int8_t> &values) {
   return buildVectorOfConstantElements(loc, module, name, values, *this,
                                        getI8Type());
 }
 
-cc::GlobalOp IRBuilder::genVectorOfConstants(Location loc, ModuleOp module,
-                                             StringRef name,
-                                             const std::vector<bool> &values) {
+cc::GlobalOp
+IRBuilder::genVectorOfConstants(Location loc, ModuleOp module, StringRef name,
+                                const SmallVectorImpl<bool> &values) {
   return buildVectorOfConstantElements(loc, module, name, values, *this,
                                        getI1Type());
 }
diff --git a/lib/Optimizer/CodeGen/Pipelines.cpp b/lib/Optimizer/CodeGen/Pipelines.cpp
index 247805fd2b..9a46b0c5b0 100644
--- a/lib/Optimizer/CodeGen/Pipelines.cpp
+++ b/lib/Optimizer/CodeGen/Pipelines.cpp
@@ -47,7 +47,8 @@ void cudaq::opt::addPipelineTranslateToOpenQASM(PassManager &pm) {
   pm.addNestedPass<func::FuncOp>(createClassicalMemToReg());
   pm.addPass(createLoopUnroll());
   pm.addPass(createCanonicalizerPass());
-  pm.addPass(createLiftArrayAlloc());
+  pm.addNestedPass<func::FuncOp>(createLiftArrayAlloc());
+  pm.addPass(createGlobalizeArrayValues());
   pm.addPass(createStatePreparation());
 }
 
diff --git a/lib/Optimizer/Transforms/CMakeLists.txt b/lib/Optimizer/Transforms/CMakeLists.txt
index a6b94d9a59..d906b749e6 100644
--- a/lib/Optimizer/Transforms/CMakeLists.txt
+++ b/lib/Optimizer/Transforms/CMakeLists.txt
@@ -30,6 +30,7 @@ add_cudaq_library(OptTransforms
   GenKernelExecution.cpp
   GenDeviceCodeLoader.cpp
   GetConcreteMatrix.cpp
+  GlobalizeArrayValues.cpp
   LambdaLifting.cpp
   LiftArrayAlloc.cpp
   LinearCtrlRelations.cpp
diff --git a/lib/Optimizer/Transforms/ConstPropComplex.cpp b/lib/Optimizer/Transforms/ConstPropComplex.cpp
index 939634bf83..3434fc5ff5 100644
--- a/lib/Optimizer/Transforms/ConstPropComplex.cpp
+++ b/lib/Optimizer/Transforms/ConstPropComplex.cpp
@@ -172,29 +172,22 @@ class ConstPropComplexPass
 
   void runOnOperation() override {
     auto *ctx = &getContext();
-    auto module = getOperation();
-    for (Operation &op : *module.getBody()) {
-      auto func = dyn_cast<func::FuncOp>(op);
-      if (!func)
-        continue;
-      DominanceInfo domInfo(func);
-      std::string funcName = func.getName().str();
-      RewritePatternSet patterns(ctx);
-      patterns
-          .insert<ComplexCreatePattern, FloatCastPattern, FloatExtendPattern,
-                  FloatTruncatePattern, ComplexRePattern, ComplexImPattern>(
-              ctx);
-
-      LLVM_DEBUG(llvm::dbgs()
-                 << "Before lifting constant array: " << func << '\n');
-
-      if (failed(applyPatternsAndFoldGreedily(func.getOperation(),
-                                              std::move(patterns))))
-        signalPassFailure();
-
-      LLVM_DEBUG(llvm::dbgs()
-                 << "After lifting constant array: " << func << '\n');
-    }
+    auto func = getOperation();
+    DominanceInfo domInfo(func);
+    RewritePatternSet patterns(ctx);
+    patterns.insert<ComplexCreatePattern, FloatCastPattern, FloatExtendPattern,
+                    FloatTruncatePattern, ComplexRePattern, ComplexImPattern>(
+        ctx);
+
+    LLVM_DEBUG(llvm::dbgs()
+               << "Before lifting constant array: " << func << '\n');
+
+    if (failed(applyPatternsAndFoldGreedily(func.getOperation(),
+                                            std::move(patterns))))
+      signalPassFailure();
+
+    LLVM_DEBUG(llvm::dbgs()
+               << "After lifting constant array: " << func << '\n');
   }
 };
 } // namespace
diff --git a/lib/Optimizer/Transforms/GlobalizeArrayValues.cpp b/lib/Optimizer/Transforms/GlobalizeArrayValues.cpp
new file mode 100644
index 0000000000..41fe47445d
--- /dev/null
+++ b/lib/Optimizer/Transforms/GlobalizeArrayValues.cpp
@@ -0,0 +1,138 @@
+/*******************************************************************************
+ * Copyright (c) 2022 - 2024 NVIDIA Corporation & Affiliates.                  *
+ * All rights reserved.                                                        *
+ *                                                                             *
+ * This source code and the accompanying materials are made available under    *
+ * the terms of the Apache License 2.0 which accompanies this distribution.    *
+ ******************************************************************************/
+
+#include "PassDetails.h"
+#include "cudaq/Optimizer/Builder/Intrinsics.h"
+#include "cudaq/Optimizer/Dialect/CC/CCOps.h"
+#include "cudaq/Optimizer/Dialect/Quake/QuakeOps.h"
+#include "cudaq/Optimizer/Transforms/Passes.h"
+#include "mlir/Dialect/Complex/IR/Complex.h"
+#include "mlir/IR/BuiltinOps.h"
+#include "mlir/IR/Dominance.h"
+#include "mlir/IR/PatternMatch.h"
+#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
+#include "mlir/Transforms/Passes.h"
+
+namespace cudaq::opt {
+#define GEN_PASS_DEF_GLOBALIZEARRAYVALUES
+#include "cudaq/Optimizer/Transforms/Passes.h.inc"
+} // namespace cudaq::opt
+
+#define DEBUG_TYPE "globalize-array-values"
+
+using namespace mlir;
+
+template <typename A, typename B>
+SmallVector<A> conversion(ArrayAttr seq) {
+  SmallVector<A> result;
+  for (auto v : seq) {
+    B c = cast<B>(v);
+    result.emplace_back(c.getValue());
+  }
+  return result;
+}
+template <>
+SmallVector<std::complex<APFloat>>
+conversion<std::complex<APFloat>, ArrayAttr>(ArrayAttr seq) {
+  SmallVector<std::complex<APFloat>> result;
+  for (auto v : seq) {
+    auto p = cast<ArrayAttr>(v);
+    result.emplace_back(cast<FloatAttr>(p[0]).getValue(),
+                        cast<FloatAttr>(p[1]).getValue());
+  }
+  return result;
+}
+
+static LogicalResult
+convertArrayAttrToGlobalConstant(MLIRContext *ctx, Location loc,
+                                 ArrayAttr arrAttr, ModuleOp module,
+                                 StringRef globalName, Type eleTy) {
+  cudaq::IRBuilder irBuilder(ctx);
+  auto tensorTy = RankedTensorType::get(arrAttr.size(), eleTy);
+  if (isa<ComplexType>(eleTy)) {
+    auto blockValues = conversion<std::complex<APFloat>, ArrayAttr>(arrAttr);
+    auto dense = DenseElementsAttr::get(tensorTy, blockValues);
+    irBuilder.genVectorOfConstants(loc, module, globalName, dense, eleTy);
+  } else if (isa<FloatType>(eleTy)) {
+    auto blockValues = conversion<APFloat, FloatAttr>(arrAttr);
+    auto dense = DenseElementsAttr::get(tensorTy, blockValues);
+    irBuilder.genVectorOfConstants(loc, module, globalName, dense, eleTy);
+  } else if (isa<IntegerType>(eleTy)) {
+    auto blockValues = conversion<APInt, IntegerAttr>(arrAttr);
+    auto dense = DenseElementsAttr::get(tensorTy, blockValues);
+    irBuilder.genVectorOfConstants(loc, module, globalName, dense, eleTy);
+  } else {
+    return failure();
+  }
+  return success();
+}
+
+namespace {
+struct ConstantArrayPattern
+    : public OpRewritePattern<cudaq::cc::ConstantArrayOp> {
+  explicit ConstantArrayPattern(MLIRContext *ctx, ModuleOp module,
+                                unsigned &counter)
+      : OpRewritePattern{ctx}, module{module}, counter{counter} {}
+
+  LogicalResult matchAndRewrite(cudaq::cc::ConstantArrayOp conarr,
+                                PatternRewriter &rewriter) const override {
+    if (!conarr->hasOneUse())
+      return failure();
+    auto store = dyn_cast<cudaq::cc::StoreOp>(*conarr->getUsers().begin());
+    if (!store)
+      return failure();
+    auto alloca = store.getPtrvalue().getDefiningOp<cudaq::cc::AllocaOp>();
+    if (!alloca)
+      return failure();
+    auto func = conarr->getParentOfType<func::FuncOp>();
+    if (!func)
+      return failure();
+    std::string globalName =
+        func.getName().str() + ".rodata_" + std::to_string(counter++);
+    auto *ctx = rewriter.getContext();
+    auto valueAttr = conarr.getConstantValues();
+    auto eleTy = cast<cudaq::cc::ArrayType>(conarr.getType()).getElementType();
+    if (failed(convertArrayAttrToGlobalConstant(ctx, conarr.getLoc(), valueAttr,
+                                                module, globalName, eleTy)))
+      return failure();
+    rewriter.replaceOpWithNewOp<cudaq::cc::AddressOfOp>(
+        alloca, alloca.getType(), globalName);
+    rewriter.eraseOp(store);
+    rewriter.eraseOp(conarr);
+    return success();
+  }
+
+  ModuleOp module;
+  unsigned &counter;
+};
+
+class GlobalizeArrayValuesPass
+    : public cudaq::opt::impl::GlobalizeArrayValuesBase<
+          GlobalizeArrayValuesPass> {
+public:
+  using GlobalizeArrayValuesBase::GlobalizeArrayValuesBase;
+
+  void runOnOperation() override {
+    auto *ctx = &getContext();
+    ModuleOp module = getOperation();
+
+    // Make the unchecked assumption that a ConstantArrayOp was added by the
+    // LiftArrayAlloc pass. Under that assumption, the backing store of the
+    // ConstantArrayOp has already been checked to never be written to.
+    RewritePatternSet patterns(ctx);
+    unsigned counter = 0;
+    patterns.insert<ConstantArrayPattern>(ctx, module, counter);
+    LLVM_DEBUG(llvm::dbgs() << "Before globalizing array values:\n"
+                            << module << '\n');
+    if (failed(applyPatternsAndFoldGreedily(module, std::move(patterns))))
+      signalPassFailure();
+    LLVM_DEBUG(llvm::dbgs() << "After globalizing array values:\n"
+                            << module << '\n');
+  }
+};
+} // namespace
diff --git a/lib/Optimizer/Transforms/LiftArrayAlloc.cpp b/lib/Optimizer/Transforms/LiftArrayAlloc.cpp
index 689be49998..81884a3fc3 100644
--- a/lib/Optimizer/Transforms/LiftArrayAlloc.cpp
+++ b/lib/Optimizer/Transforms/LiftArrayAlloc.cpp
@@ -27,84 +27,21 @@ namespace cudaq::opt {
 
 using namespace mlir;
 
-namespace {
-template <typename A>
-std::vector<A> readConstantValues(SmallVectorImpl<Attribute> &vec, Type eleTy) {
-  std::vector<A> result;
-  for (auto a : vec) {
-    if constexpr (std::is_same_v<A, std::complex<double>>) {
-      auto v = cast<ArrayAttr>(a);
-      result.emplace_back(cast<FloatAttr>(v[0]).getValue().convertToDouble(),
-                          cast<FloatAttr>(v[1]).getValue().convertToDouble());
-    } else if constexpr (std::is_same_v<A, std::complex<float>>) {
-      auto v = cast<ArrayAttr>(a);
-      result.emplace_back(cast<FloatAttr>(v[0]).getValue().convertToFloat(),
-                          cast<FloatAttr>(v[1]).getValue().convertToFloat());
-    } else if constexpr (std::is_same_v<A, double>) {
-      auto v = cast<FloatAttr>(a);
-      result.emplace_back(v.getValue().convertToDouble());
-    } else if constexpr (std::is_same_v<A, float>) {
-      auto v = cast<FloatAttr>(a);
-      result.emplace_back(v.getValue().convertToFloat());
-    }
-  }
-  return result;
-}
-
-LogicalResult genVectorOfConstantsFromAttributes(cudaq::IRBuilder irBuilder,
-                                                 Location loc, ModuleOp module,
-                                                 StringRef name,
-                                                 SmallVector<Attribute> &values,
-                                                 Type eleTy) {
-
-  if (auto cTy = dyn_cast<ComplexType>(eleTy)) {
-    auto floatTy = cTy.getElementType();
-    if (floatTy == irBuilder.getF64Type()) {
-      auto vals = readConstantValues<std::complex<double>>(values, cTy);
-      if (vals.size() == values.size()) {
-        irBuilder.genVectorOfConstants(loc, module, name, vals);
-        return success();
-      }
-    } else if (floatTy == irBuilder.getF32Type()) {
-      auto vals = readConstantValues<std::complex<float>>(values, cTy);
-      if (vals.size() == values.size()) {
-        irBuilder.genVectorOfConstants(loc, module, name, vals);
-        return success();
-      }
-    }
-  } else if (eleTy == irBuilder.getF64Type()) {
-    auto vals = readConstantValues<double>(values, eleTy);
-    if (vals.size() == values.size()) {
-      irBuilder.genVectorOfConstants(loc, module, name, vals);
-      return success();
-    }
-  } else if (eleTy == irBuilder.getF32Type()) {
-    auto vals = readConstantValues<float>(values, eleTy);
-    if (vals.size() == values.size()) {
-      irBuilder.genVectorOfConstants(loc, module, name, vals);
-      return success();
-    }
-  }
-  return failure();
-}
-} // namespace
-
 namespace {
 class AllocaPattern : public OpRewritePattern<cudaq::cc::AllocaOp> {
 public:
-  explicit AllocaPattern(MLIRContext *ctx, DominanceInfo &di,
-                         const std::string &fn, ModuleOp m)
-      : OpRewritePattern(ctx), dom(di), funcName(fn), module(m) {}
+  explicit AllocaPattern(MLIRContext *ctx, DominanceInfo &di, StringRef fn)
+      : OpRewritePattern(ctx), dom(di), funcName(fn) {}
 
   LogicalResult matchAndRewrite(cudaq::cc::AllocaOp alloc,
                                 PatternRewriter &rewriter) const override {
     SmallVector<Operation *> stores;
-    bool toGlobal = false;
-    if (!isGoodCandidate(alloc, stores, dom, toGlobal))
+    if (!isGoodCandidate(alloc, stores, dom))
       return failure();
 
     LLVM_DEBUG(llvm::dbgs() << "Candidate was found\n");
-    auto arrTy = cast<cudaq::cc::ArrayType>(alloc.getElementType());
+    auto eleTy = alloc.getElementType();
+    auto arrTy = cast<cudaq::cc::ArrayType>(eleTy);
     SmallVector<Attribute> values;
 
     // Every element of `stores` must be a cc::StoreOp with a ConstantOp as the
@@ -121,82 +58,60 @@ class AllocaPattern : public OpRewritePattern<cudaq::cc::AllocaOp> {
     }
 
     // Create the cc.const_array.
-    auto eleTy = arrTy.getElementType();
     auto valuesAttr = rewriter.getArrayAttr(values);
     auto loc = alloc.getLoc();
-    Value conArr;
-    Value conGlobal;
-    if (toGlobal) {
-      static unsigned counter = 0;
-      auto ptrTy = cudaq::cc::PointerType::get(arrTy);
-      // Build a new name based on the kernel name.
-      std::string name = funcName + ".rodata_" + std::to_string(counter++);
-      cudaq::IRBuilder irBuilder(rewriter.getContext());
-      if (succeeded(genVectorOfConstantsFromAttributes(irBuilder, loc, module,
-                                                       name, values, eleTy))) {
-        conGlobal = rewriter.create<cudaq::cc::AddressOfOp>(loc, ptrTy, name);
-        conArr = rewriter.create<cudaq::cc::LoadOp>(loc, arrTy, conGlobal);
-      } else {
-        conArr =
-            rewriter.create<cudaq::cc::ConstantArrayOp>(loc, arrTy, valuesAttr);
-      }
-    } else {
-      conArr =
-          rewriter.create<cudaq::cc::ConstantArrayOp>(loc, arrTy, valuesAttr);
-    }
+    Value conArr =
+        rewriter.create<cudaq::cc::ConstantArrayOp>(loc, arrTy, valuesAttr);
 
     assert(conArr && "must have created the constant array");
     LLVM_DEBUG(llvm::dbgs() << "constant array is:\n" << conArr << '\n');
     bool cannotEraseAlloc = false;
 
+    // Collect all the stores, casts, and compute_ptr to be erased safely and in
+    // topological order.
+    SmallVector<Operation *> opsToErase;
+    auto insertOpToErase = [&](Operation *op) {
+      auto iter = std::find(opsToErase.begin(), opsToErase.end(), op);
+      if (iter == opsToErase.end())
+        opsToErase.push_back(op);
+    };
+
     // Rewalk all the uses of alloc, u, which must be cc.cast or cc.compute_ptr.
-    // For each,u, remove a store and replace a load with a cc.extract_value.
-    for (auto &use : alloc->getUses()) {
-      auto *user = use.getOwner();
+    // For each u remove a store and replace a load with a cc.extract_value.
+    for (auto *user : alloc->getUsers()) {
+      if (!user)
+        continue;
       std::int32_t offset = 0;
       if (auto cptr = dyn_cast<cudaq::cc::ComputePtrOp>(user))
         offset = cptr.getRawConstantIndices()[0];
       bool isLive = false;
-      for (auto &useuse : user->getUses()) {
-        auto *useuser = useuse.getOwner();
-        if (auto ist = dyn_cast<quake::InitializeStateOp>(useuser)) {
-          rewriter.setInsertionPointAfter(useuser);
-          LLVM_DEBUG(llvm::dbgs() << "replaced init_state\n");
-          assert(conGlobal && "global must be defined");
-          rewriter.replaceOpWithNewOp<quake::InitializeStateOp>(
-              ist, ist.getType(), ist.getTargets(), conGlobal);
-          continue;
-        }
-        if (auto load = dyn_cast<cudaq::cc::LoadOp>(useuser)) {
-          rewriter.setInsertionPointAfter(useuser);
-          LLVM_DEBUG(llvm::dbgs() << "replaced load\n");
-          rewriter.replaceOpWithNewOp<cudaq::cc::ExtractValueOp>(
-              load, eleTy, conArr,
-              ArrayRef<cudaq::cc::ExtractValueArg>{offset});
-          continue;
-        }
-        if (isa<cudaq::cc::StoreOp>(useuser))
-          rewriter.eraseOp(useuser);
-        LLVM_DEBUG(llvm::dbgs() << "alloc is live\n");
+      if (!isa<cudaq::cc::CastOp, cudaq::cc::ComputePtrOp>(user)) {
         cannotEraseAlloc = isLive = true;
-      }
-      if (auto ist = dyn_cast<quake::InitializeStateOp>(user)) {
-        rewriter.setInsertionPointAfter(user);
-        LLVM_DEBUG(llvm::dbgs() << "replaced init_state\n");
-        assert(conGlobal && "global must be defined");
-        rewriter.replaceOpWithNewOp<quake::InitializeStateOp>(
-            ist, ist.getType(), ist.getTargets(), conGlobal);
-        continue;
+      } else { // loads are erased inside the loop, so iterate early-inc
+        for (auto *useuser : llvm::make_early_inc_range(user->getUsers())) {
+          if (auto load = dyn_cast<cudaq::cc::LoadOp>(useuser)) {
+            rewriter.setInsertionPointAfter(useuser);
+            LLVM_DEBUG(llvm::dbgs() << "replaced load\n");
+            rewriter.replaceOpWithNewOp<cudaq::cc::ExtractValueOp>(
+                load, eleTy, conArr,
+                ArrayRef<cudaq::cc::ExtractValueArg>{offset});
+            continue;
+          }
+          if (isa<cudaq::cc::StoreOp>(useuser)) {
+            insertOpToErase(useuser);
+            continue;
+          }
+          LLVM_DEBUG(llvm::dbgs() << "alloc is live\n");
+          cannotEraseAlloc = isLive = true;
+        }
       }
       if (!isLive)
-        rewriter.eraseOp(user);
+        insertOpToErase(user);
     }
 
-    if (toGlobal && conGlobal) {
-      rewriter.setInsertionPointAfter(alloc);
-      rewriter.replaceOp(alloc, conGlobal);
-      return success();
-    }
+    for (auto *e : opsToErase)
+      rewriter.eraseOp(e);
+
     if (cannotEraseAlloc) {
       rewriter.setInsertionPointAfter(alloc);
       rewriter.create<cudaq::cc::StoreOp>(loc, conArr, alloc);
@@ -210,14 +125,11 @@ class AllocaPattern : public OpRewritePattern<cudaq::cc::AllocaOp> {
   // array value. \p scoreboard is a vector of store operations. Each element of
   // the allocated array must be written to exactly 1 time, and the scoreboard
   // is used to track these stores. \p dom is the dominance info for this
-  // function (to ensure the stores happen before uses). \p toGlobal is returned
-  // as a result. If it is `true`, then the constant array shall be lowered to a
-  // global variable rather than an inline constant array.
+  // function (to ensure the stores happen before uses).
   static bool isGoodCandidate(cudaq::cc::AllocaOp alloc,
                               SmallVectorImpl<Operation *> &scoreboard,
-                              DominanceInfo &dom, bool &toGlobal) {
+                              DominanceInfo &dom) {
     LLVM_DEBUG(llvm::dbgs() << "checking candidate\n");
-    toGlobal = false;
     if (alloc.getSeqSize())
       return false;
     auto arrTy = dyn_cast<cudaq::cc::ArrayType>(alloc.getElementType());
@@ -262,7 +174,6 @@ class AllocaPattern : public OpRewritePattern<cudaq::cc::AllocaOp> {
         }
         if (isa<quake::InitializeStateOp>(u)) {
           toGlobalUses.push_back(u);
-          toGlobal = true;
           continue;
         }
         if (isa<cudaq::cc::LoadOp>(u)) {
@@ -311,7 +222,6 @@ class AllocaPattern : public OpRewritePattern<cudaq::cc::AllocaOp> {
             Operation *u = use.getOwner();
             if (isa_and_present<quake::InitializeStateOp>(u)) {
               toGlobalUses.push_back(op);
-              toGlobal = true;
               continue;
             }
           }
@@ -319,17 +229,14 @@ class AllocaPattern : public OpRewritePattern<cudaq::cc::AllocaOp> {
         }
         LLVM_DEBUG(llvm::dbgs() << "unexpected cast: " << *op << '\n');
         toGlobalUses.push_back(op);
-        toGlobal = true;
         continue;
       }
       if (isa<quake::InitializeStateOp>(op)) {
         toGlobalUses.push_back(op);
-        toGlobal = true;
         continue;
       }
       LLVM_DEBUG(llvm::dbgs() << "unexpected use: " << *op << '\n');
       toGlobalUses.push_back(op);
-      toGlobal = true;
     }
 
     bool ok = std::all_of(scoreboard.begin(), scoreboard.end(),
@@ -365,8 +272,7 @@ class AllocaPattern : public OpRewritePattern<cudaq::cc::AllocaOp> {
   }
 
   DominanceInfo &dom;
-  const std::string &funcName;
-  mutable ModuleOp module;
+  StringRef funcName;
 };
 
 class LiftArrayAllocPass
@@ -376,26 +282,20 @@ class LiftArrayAllocPass
 
   void runOnOperation() override {
     auto *ctx = &getContext();
-    auto module = getOperation();
-    for (Operation &op : *module.getBody()) {
-      auto func = dyn_cast<func::FuncOp>(op);
-      if (!func)
-        continue;
-      DominanceInfo domInfo(func);
-      std::string funcName = func.getName().str();
-      RewritePatternSet patterns(ctx);
-      patterns.insert<AllocaPattern>(ctx, domInfo, funcName, module);
+    auto func = getOperation();
+    DominanceInfo domInfo(func);
+    StringRef funcName = func.getName();
+    RewritePatternSet patterns(ctx);
+    patterns.insert<AllocaPattern>(ctx, domInfo, funcName);
 
-      LLVM_DEBUG(llvm::dbgs()
-                 << "Before lifting constant array: " << func << '\n');
+    LLVM_DEBUG(llvm::dbgs()
+               << "Before lifting constant array: " << func << '\n');
 
-      if (failed(applyPatternsAndFoldGreedily(func.getOperation(),
-                                              std::move(patterns))))
-        signalPassFailure();
+    if (failed(applyPatternsAndFoldGreedily(func, std::move(patterns))))
+      signalPassFailure();
 
-      LLVM_DEBUG(llvm::dbgs()
-                 << "After lifting constant array: " << func << '\n');
-    }
+    LLVM_DEBUG(llvm::dbgs()
+               << "After lifting constant array: " << func << '\n');
   }
 };
 } // namespace
diff --git a/lib/Optimizer/Transforms/QuakeSynthesizer.cpp b/lib/Optimizer/Transforms/QuakeSynthesizer.cpp
index adbe9df29e..166f558275 100644
--- a/lib/Optimizer/Transforms/QuakeSynthesizer.cpp
+++ b/lib/Optimizer/Transforms/QuakeSynthesizer.cpp
@@ -118,7 +118,7 @@ static bool hasInitStateUse(BlockArgument argument) {
 template <typename ELETY, typename T, typename ATTR, typename MAKER>
 LogicalResult
 synthesizeVectorArgument(OpBuilder &builder, ModuleOp module, unsigned &counter,
-                         BlockArgument argument, std::vector<T> &vec,
+                         BlockArgument argument, SmallVectorImpl<T> &vec,
                          ATTR arrayAttr, MAKER makeElementValue) {
   auto *ctx = builder.getContext();
   auto argTy = argument.getType();
@@ -273,62 +273,60 @@ synthesizeVectorArgument(OpBuilder &builder, ModuleOp module, unsigned &counter,
 }
 
 template <typename A>
-std::vector<std::int32_t> asI32(const std::vector<A> &v) {
-  std::vector<std::int32_t> result(v.size());
-  for (auto iter : llvm::enumerate(v))
-    result[iter.index()] = static_cast<std::int32_t>(iter.value());
-  return result;
+SmallVector<Attribute> asIntAttr(MLIRContext *ctx, unsigned bits,
+                                 const SmallVectorImpl<A> &vec) {
+  return llvm::to_vector<8>(llvm::map_range(vec, [=](A v) -> Attribute {
+    return IntegerAttr::get(IntegerType::get(ctx, bits), APInt(bits, v));
+  }));
 }
 
 // TODO: consider using DenseArrayAttr here instead. NB: such a change may alter
 // the output of the constant array op.
 static LogicalResult
 synthesizeVectorArgument(OpBuilder &builder, ModuleOp module, unsigned &counter,
-                         BlockArgument argument, std::vector<bool> &vec) {
-  auto arrayAttr = builder.getI32ArrayAttr(asI32(vec));
+                         BlockArgument argument, SmallVectorImpl<bool> &vec) {
+  auto arrayAttr = builder.getBoolArrayAttr(vec);
   return synthesizeVectorArgument<IntegerType>(builder, module, counter,
                                                argument, vec, arrayAttr,
                                                makeIntegerElement<bool>);
 }
 
-static LogicalResult synthesizeVectorArgument(OpBuilder &builder,
-                                              ModuleOp module,
-                                              unsigned &counter,
-                                              BlockArgument argument,
-                                              std::vector<std::int8_t> &vec) {
-  auto arrayAttr = builder.getI32ArrayAttr(asI32(vec));
+static LogicalResult
+synthesizeVectorArgument(OpBuilder &builder, ModuleOp module, unsigned &counter,
+                         BlockArgument argument,
+                         SmallVectorImpl<std::int8_t> &vec) {
+  auto arrayAttr =
+      builder.getArrayAttr(asIntAttr(builder.getContext(), 8, vec));
   return synthesizeVectorArgument<IntegerType>(builder, module, counter,
                                                argument, vec, arrayAttr,
                                                makeIntegerElement<std::int8_t>);
 }
 
-static LogicalResult synthesizeVectorArgument(OpBuilder &builder,
-                                              ModuleOp module,
-                                              unsigned &counter,
-                                              BlockArgument argument,
-                                              std::vector<std::int16_t> &vec) {
-  auto arrayAttr = builder.getI32ArrayAttr(asI32(vec));
+static LogicalResult
+synthesizeVectorArgument(OpBuilder &builder, ModuleOp module, unsigned &counter,
+                         BlockArgument argument,
+                         SmallVectorImpl<std::int16_t> &vec) {
+  auto arrayAttr =
+      builder.getArrayAttr(asIntAttr(builder.getContext(), 16, vec));
   return synthesizeVectorArgument<IntegerType>(
       builder, module, counter, argument, vec, arrayAttr,
       makeIntegerElement<std::int16_t>);
 }
 
-static LogicalResult synthesizeVectorArgument(OpBuilder &builder,
-                                              ModuleOp module,
-                                              unsigned &counter,
-                                              BlockArgument argument,
-                                              std::vector<std::int32_t> &vec) {
+static LogicalResult
+synthesizeVectorArgument(OpBuilder &builder, ModuleOp module, unsigned &counter,
+                         BlockArgument argument,
+                         SmallVectorImpl<std::int32_t> &vec) {
   auto arrayAttr = builder.getI32ArrayAttr(vec);
   return synthesizeVectorArgument<IntegerType>(
       builder, module, counter, argument, vec, arrayAttr,
       makeIntegerElement<std::int32_t>);
 }
 
-static LogicalResult synthesizeVectorArgument(OpBuilder &builder,
-                                              ModuleOp module,
-                                              unsigned &counter,
-                                              BlockArgument argument,
-                                              std::vector<std::int64_t> &vec) {
+static LogicalResult
+synthesizeVectorArgument(OpBuilder &builder, ModuleOp module, unsigned &counter,
+                         BlockArgument argument,
+                         SmallVectorImpl<std::int64_t> &vec) {
   auto arrayAttr = builder.getI64ArrayAttr(vec);
   return synthesizeVectorArgument<IntegerType>(
       builder, module, counter, argument, vec, arrayAttr,
@@ -337,7 +335,7 @@ static LogicalResult synthesizeVectorArgument(OpBuilder &builder,
 
 static LogicalResult
 synthesizeVectorArgument(OpBuilder &builder, ModuleOp module, unsigned &counter,
-                         BlockArgument argument, std::vector<float> &vec) {
+                         BlockArgument argument, SmallVectorImpl<float> &vec) {
   auto arrayAttr = builder.getF32ArrayAttr(vec);
   return synthesizeVectorArgument<FloatType>(builder, module, counter, argument,
                                              vec, arrayAttr,
@@ -346,7 +344,7 @@ synthesizeVectorArgument(OpBuilder &builder, ModuleOp module, unsigned &counter,
 
 static LogicalResult
 synthesizeVectorArgument(OpBuilder &builder, ModuleOp module, unsigned &counter,
-                         BlockArgument argument, std::vector<double> &vec) {
+                         BlockArgument argument, SmallVectorImpl<double> &vec) {
   auto arrayAttr = builder.getF64ArrayAttr(vec);
   return synthesizeVectorArgument<FloatType>(builder, module, counter, argument,
                                              vec, arrayAttr,
@@ -356,8 +354,8 @@ synthesizeVectorArgument(OpBuilder &builder, ModuleOp module, unsigned &counter,
 static LogicalResult
 synthesizeVectorArgument(OpBuilder &builder, ModuleOp module, unsigned &counter,
                          BlockArgument argument,
-                         std::vector<std::complex<float>> &vec) {
-  std::vector<float> vec2;
+                         SmallVectorImpl<std::complex<float>> &vec) {
+  SmallVector<float> vec2;
   for (auto c : vec) {
     vec2.push_back(c.real());
     vec2.push_back(c.imag());
@@ -371,8 +369,8 @@ synthesizeVectorArgument(OpBuilder &builder, ModuleOp module, unsigned &counter,
 static LogicalResult
 synthesizeVectorArgument(OpBuilder &builder, ModuleOp module, unsigned &counter,
                          BlockArgument argument,
-                         std::vector<std::complex<double>> &vec) {
-  std::vector<double> vec2;
+                         SmallVectorImpl<std::complex<double>> &vec) {
+  SmallVector<double> vec2;
   for (auto c : vec) {
     vec2.push_back(c.real());
     vec2.push_back(c.imag());
@@ -410,7 +408,7 @@ class QuakeSynthesizer
 
   mlir::ModuleOp getModule() { return getOperation(); }
 
-  std::pair<std::size_t, std::vector<std::size_t>>
+  std::pair<std::size_t, SmallVector<std::size_t>>
   getTargetLayout(FunctionType funcTy) {
     auto bufferTy =
         cudaq::opt::factory::buildInvokeStructType(funcTy, startingArgIdx);
@@ -429,7 +427,7 @@ class QuakeSynthesizer
         cast<llvm::StructType>(translator.translateType(llvmDialectTy));
     auto *layout = dataLayout.getStructLayout(llvmStructTy);
     auto strSize = layout->getSizeInBytes();
-    std::vector<std::size_t> fieldOffsets;
+    SmallVector<std::size_t> fieldOffsets;
     for (std::size_t i = 0, I = bufferTy.getMembers().size(); i != I; ++i)
       fieldOffsets.emplace_back(layout->getElementOffset(i));
     return {strSize, fieldOffsets};
@@ -462,7 +460,7 @@ class QuakeSynthesizer
     auto arguments = funcOp.getArguments();
     auto structLayout = getTargetLayout(funcOp.getFunctionType());
     // Keep track of the stdVec sizes.
-    std::vector<std::tuple<std::size_t, Type, std::uint64_t>> stdVecInfo;
+    SmallVector<std::tuple<std::size_t, Type, std::uint64_t>> stdVecInfo;
 
     for (std::size_t argNum = startingArgIdx, end = arguments.size();
          argNum < end; argNum++) {
@@ -656,7 +654,7 @@ class QuakeSynthesizer
       }
       auto doVector = [&]<typename T>(T) {
         auto *ptr = reinterpret_cast<const T *>(bufferAppendix);
-        std::vector<T> v(ptr, ptr + vecLength);
+        SmallVector<T> v(ptr, ptr + vecLength);
         if (failed(synthesizeVectorArgument(builder, module, counter,
                                             arguments[idx], v)))
           funcOp.emitOpError("synthesis failed for vector<T>");
diff --git a/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp b/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp
index 5a197f97a6..90ba42b617 100644
--- a/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp
+++ b/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp
@@ -534,23 +534,24 @@ MlirModule synthesizeKernel(const std::string &name, MlirModule module,
   PassManager pm(context);
   pm.addNestedPass<func::FuncOp>(
       cudaq::opt::createArgumentSynthesisPass(kernels, substs));
-  pm.addPass(createCanonicalizerPass());
+  pm.addNestedPass<func::FuncOp>(createCanonicalizerPass());
 
   // Run state preparation for quantum devices (or their emulation) only.
   // Simulators have direct implementation of state initialization
   // in their runtime.
   if (!isSimulator) {
-    pm.addPass(cudaq::opt::createConstPropComplex());
-    pm.addPass(cudaq::opt::createLiftArrayAlloc());
+    pm.addNestedPass<func::FuncOp>(cudaq::opt::createConstPropComplex());
+    pm.addNestedPass<func::FuncOp>(cudaq::opt::createLiftArrayAlloc());
+    pm.addPass(cudaq::opt::createGlobalizeArrayValues());
     pm.addPass(cudaq::opt::createStatePreparation());
   }
-  pm.addPass(createCanonicalizerPass());
-  pm.addPass(cudaq::opt::createExpandMeasurementsPass());
+  pm.addNestedPass<func::FuncOp>(createCanonicalizerPass());
+  pm.addNestedPass<func::FuncOp>(cudaq::opt::createExpandMeasurementsPass());
   pm.addNestedPass<func::FuncOp>(cudaq::opt::createClassicalMemToReg());
-  pm.addPass(createCanonicalizerPass());
-  pm.addPass(cudaq::opt::createLoopNormalize());
-  pm.addPass(cudaq::opt::createLoopUnroll());
-  pm.addPass(createCanonicalizerPass());
+  pm.addNestedPass<func::FuncOp>(createCanonicalizerPass());
+  pm.addNestedPass<func::FuncOp>(cudaq::opt::createLoopNormalize());
+  pm.addNestedPass<func::FuncOp>(cudaq::opt::createLoopUnroll());
+  pm.addNestedPass<func::FuncOp>(createCanonicalizerPass());
   DefaultTimingManager tm;
   tm.setEnabled(cudaq::isTimingTagEnabled(cudaq::TIMING_JIT_PASSES));
   auto timingScope = tm.getRootScope(); // starts the timer
diff --git a/python/runtime/mlir/py_register_dialects.cpp b/python/runtime/mlir/py_register_dialects.cpp
index 3dd5a66ff3..9c0c4f2985 100644
--- a/python/runtime/mlir/py_register_dialects.cpp
+++ b/python/runtime/mlir/py_register_dialects.cpp
@@ -318,12 +318,15 @@ void bindRegisterDialects(py::module &mod) {
     mlirContext->loadAllAvailableDialects();
   });
 
-  mod.def("gen_vector_of_complex_constant",
-          [](MlirLocation loc, MlirModule module, std::string name,
-             const std::vector<std::complex<double>> &values) {
-            ModuleOp modOp = unwrap(module);
-            cudaq::IRBuilder builder = IRBuilder::atBlockEnd(modOp.getBody());
-            builder.genVectorOfConstants(unwrap(loc), modOp, name, values);
-          });
+  mod.def("gen_vector_of_complex_constant", [](MlirLocation loc,
+                                               MlirModule module,
+                                               std::string name,
+                                               const std::vector<std::complex<
+                                                   double>> &values) {
+    ModuleOp modOp = unwrap(module);
+    cudaq::IRBuilder builder = IRBuilder::atBlockEnd(modOp.getBody());
+    SmallVector<std::complex<double>> newValues{values.begin(), values.end()};
+    builder.genVectorOfConstants(unwrap(loc), modOp, name, newValues);
+  });
 }
 } // namespace cudaq
diff --git a/runtime/common/ArgumentConversion.cpp b/runtime/common/ArgumentConversion.cpp
index 424cbd8873..0de2589752 100644
--- a/runtime/common/ArgumentConversion.cpp
+++ b/runtime/common/ArgumentConversion.cpp
@@ -123,7 +123,7 @@ static Value genConstant(OpBuilder &builder, const cudaq::state *v,
 
     cudaq::IRBuilder irBuilder(ctx);
     auto genConArray = [&]<typename T>() -> Value {
-      std::vector<std::complex<T>> vec(size);
+      SmallVector<std::complex<T>> vec(size);
       for (std::size_t i = 0; i < size; i++) {
         vec[i] = (*v)({i}, 0);
       }
diff --git a/runtime/common/BaseRemoteRESTQPU.h b/runtime/common/BaseRemoteRESTQPU.h
index c65e94c3b6..641b445747 100644
--- a/runtime/common/BaseRemoteRESTQPU.h
+++ b/runtime/common/BaseRemoteRESTQPU.h
@@ -399,7 +399,7 @@ class BaseRemoteRESTQPU : public cudaq::QPU {
     for (auto &op : m_module.getOps()) {
       // Add any global symbols, including global constant arrays.
       // Global constant arrays can be created during compilation,
-      // `lift-array-value`, `quake-synthesizer`, and `get-concrete-matrix`
+      // `globalize-array-values`, `quake-synthesizer`, or `get-concrete-matrix`
       // passes.
       if (auto globalOp = dyn_cast<cudaq::cc::GlobalOp>(op))
         moduleOp.push_back(globalOp.clone());
diff --git a/runtime/cudaq/platform/default/opt-test.yml b/runtime/cudaq/platform/default/opt-test.yml
index caa1532c53..eacf0375b2 100644
--- a/runtime/cudaq/platform/default/opt-test.yml
+++ b/runtime/cudaq/platform/default/opt-test.yml
@@ -22,19 +22,19 @@ configuration-matrix:
     config:
       nvqir-simulation-backend: cusvsim-fp32, custatevec-fp32
       preprocessor-defines: ["-D CUDAQ_SIMULATION_SCALAR_FP32"]
-      target-pass-pipeline: "func.func(unwind-lowering),canonicalize,lambda-lifting,func.func(memtoreg{quantum=0}),canonicalize,apply-op-specialization,kernel-execution,aggressive-early-inlining,func.func(quake-add-metadata),const-prop-complex,lift-array-value,func.func(get-concrete-matrix),device-code-loader{use-quake=1},canonicalize,cse,func.func(add-dealloc,combine-quantum-alloc,canonicalize,factor-quantum-alloc,memtoreg),canonicalize,cse,add-wireset,func.func(assign-wire-indices),dep-analysis,func.func(regtomem),symbol-dce"
+      target-pass-pipeline: "func.func(unwind-lowering),canonicalize,lambda-lifting,func.func(memtoreg{quantum=0}),canonicalize,apply-op-specialization,kernel-execution,aggressive-early-inlining,func.func(quake-add-metadata,const-prop-complex,lift-array-alloc),globalize-array-values,func.func(get-concrete-matrix),device-code-loader{use-quake=1},canonicalize,cse,func.func(add-dealloc,combine-quantum-alloc,canonicalize,factor-quantum-alloc,memtoreg),canonicalize,cse,add-wireset,func.func(assign-wire-indices),dep-analysis,func.func(regtomem),symbol-dce"
       library-mode: false
   - name: dep-analysis-fp64
     option-flags: [dep-analysis, fp64]
     config:
       nvqir-simulation-backend: cusvsim-fp64, custatevec-fp64
       preprocessor-defines: ["-D CUDAQ_SIMULATION_SCALAR_FP64"]
-      target-pass-pipeline: "func.func(unwind-lowering),canonicalize,lambda-lifting,func.func(memtoreg{quantum=0}),canonicalize,apply-op-specialization,kernel-execution,aggressive-early-inlining,func.func(quake-add-metadata),const-prop-complex,lift-array-value,func.func(get-concrete-matrix),device-code-loader{use-quake=1},canonicalize,cse,func.func(add-dealloc,combine-quantum-alloc,canonicalize,factor-quantum-alloc,memtoreg),canonicalize,cse,add-wireset,func.func(assign-wire-indices),dep-analysis,func.func(regtomem),symbol-dce"
+      target-pass-pipeline: "func.func(unwind-lowering),canonicalize,lambda-lifting,func.func(memtoreg{quantum=0}),canonicalize,apply-op-specialization,kernel-execution,aggressive-early-inlining,func.func(quake-add-metadata,const-prop-complex,lift-array-alloc),globalize-array-values,func.func(get-concrete-matrix),device-code-loader{use-quake=1},canonicalize,cse,func.func(add-dealloc,combine-quantum-alloc,canonicalize,factor-quantum-alloc,memtoreg),canonicalize,cse,add-wireset,func.func(assign-wire-indices),dep-analysis,func.func(regtomem),symbol-dce"
       library-mode: false
   - name: dep-analysis-qpp
     option-flags: [dep-analysis, qpp]
     config:
       nvqir-simulation-backend: qpp
       preprocessor-defines: ["-D CUDAQ_SIMULATION_SCALAR_FP64"]
-      target-pass-pipeline: "func.func(unwind-lowering),canonicalize,lambda-lifting,func.func(memtoreg{quantum=0}),canonicalize,apply-op-specialization,kernel-execution,aggressive-early-inlining,func.func(quake-add-metadata),const-prop-complex,lift-array-value,func.func(get-concrete-matrix),device-code-loader{use-quake=1},canonicalize,cse,func.func(add-dealloc,combine-quantum-alloc,canonicalize,factor-quantum-alloc,memtoreg),canonicalize,cse,add-wireset,func.func(assign-wire-indices),dep-analysis,func.func(regtomem),symbol-dce"
+      target-pass-pipeline: "func.func(unwind-lowering),canonicalize,lambda-lifting,func.func(memtoreg{quantum=0}),canonicalize,apply-op-specialization,kernel-execution,aggressive-early-inlining,func.func(quake-add-metadata,const-prop-complex,lift-array-alloc),globalize-array-values,func.func(get-concrete-matrix),device-code-loader{use-quake=1},canonicalize,cse,func.func(add-dealloc,combine-quantum-alloc,canonicalize,factor-quantum-alloc,memtoreg),canonicalize,cse,add-wireset,func.func(assign-wire-indices),dep-analysis,func.func(regtomem),symbol-dce"
       library-mode: false
diff --git a/runtime/cudaq/platform/default/rest/helpers/anyon/anyon.yml b/runtime/cudaq/platform/default/rest/helpers/anyon/anyon.yml
index 3ecb49f302..300bb038ee 100644
--- a/runtime/cudaq/platform/default/rest/helpers/anyon/anyon.yml
+++ b/runtime/cudaq/platform/default/rest/helpers/anyon/anyon.yml
@@ -16,7 +16,7 @@ config:
   # Add the rest-qpu library to the link list
   link-libs: ["-lcudaq-rest-qpu"]
   # Define the lowering pipeline
-  platform-lowering-config: "const-prop-complex,canonicalize,cse,lift-array-value,state-prep,unitary-synthesis,canonicalize,apply-op-specialization,aggressive-early-inlining,expand-measurements,unrolling-pipeline,decomposition{enable-patterns=U3ToRotations},func.func(lower-to-cfg),canonicalize,func.func(multicontrol-decomposition),anyon-%Q_GATE%-set-mapping,func.func(add-dealloc,combine-quantum-alloc,canonicalize,factor-quantum-alloc,memtoreg),add-wireset,func.func(assign-wire-indices),qubit-mapping{device=file(%QPU_ARCH%)},func.func(regtomem),symbol-dce"
+  platform-lowering-config: "func.func(const-prop-complex,canonicalize,cse,lift-array-alloc),globalize-array-values,state-prep,unitary-synthesis,canonicalize,apply-op-specialization,aggressive-early-inlining,expand-measurements,unrolling-pipeline,decomposition{enable-patterns=U3ToRotations},func.func(lower-to-cfg),canonicalize,func.func(multicontrol-decomposition),anyon-%Q_GATE%-set-mapping,func.func(add-dealloc,combine-quantum-alloc,canonicalize,factor-quantum-alloc,memtoreg),add-wireset,func.func(assign-wire-indices),qubit-mapping{device=file(%QPU_ARCH%)},func.func(regtomem),symbol-dce"
   # Tell the rest-qpu that we are generating Adaptive QIR.
   codegen-emission: qir-adaptive
   # Library mode is only for simulators, physical backends must turn this off
diff --git a/runtime/cudaq/platform/default/rest/helpers/ionq/ionq.yml b/runtime/cudaq/platform/default/rest/helpers/ionq/ionq.yml
index 238d4c3316..913c91a8e2 100644
--- a/runtime/cudaq/platform/default/rest/helpers/ionq/ionq.yml
+++ b/runtime/cudaq/platform/default/rest/helpers/ionq/ionq.yml
@@ -16,7 +16,7 @@ config:
   # Add the rest-qpu library to the link list
   link-libs: ["-lcudaq-rest-qpu"]
   # Define the lowering pipeline
-  platform-lowering-config: "const-prop-complex,canonicalize,cse,lift-array-value,state-prep,unitary-synthesis,canonicalize,apply-op-specialization,aggressive-early-inlining,expand-measurements,unrolling-pipeline,decomposition{enable-patterns=U3ToRotations},func.func(lower-to-cfg),canonicalize,func.func(multicontrol-decomposition),ionq-gate-set-mapping"
+  platform-lowering-config: "func.func(const-prop-complex,canonicalize,cse,lift-array-alloc),globalize-array-values,state-prep,unitary-synthesis,canonicalize,apply-op-specialization,aggressive-early-inlining,expand-measurements,unrolling-pipeline,decomposition{enable-patterns=U3ToRotations},func.func(lower-to-cfg),canonicalize,func.func(multicontrol-decomposition),ionq-gate-set-mapping"
   # Tell the rest-qpu that we are generating QIR.
   codegen-emission: qir-base
   # Additional passes to run after lowering to QIR
diff --git a/runtime/cudaq/platform/default/rest/helpers/iqm/iqm.yml b/runtime/cudaq/platform/default/rest/helpers/iqm/iqm.yml
index 0e90a1e2af..841d4ea366 100644
--- a/runtime/cudaq/platform/default/rest/helpers/iqm/iqm.yml
+++ b/runtime/cudaq/platform/default/rest/helpers/iqm/iqm.yml
@@ -16,7 +16,7 @@ config:
   # Add the rest-qpu library to the link list
   link-libs: ["-lcudaq-rest-qpu"]
   # Define the lowering pipeline
-  platform-lowering-config: "const-prop-complex,canonicalize,cse,lift-array-value,state-prep,unitary-synthesis,canonicalize,apply-op-specialization,aggressive-early-inlining,expand-measurements,unrolling-pipeline,decomposition{enable-patterns=U3ToRotations},func.func(lower-to-cfg),canonicalize,func.func(multicontrol-decomposition),iqm-gate-set-mapping,func.func(add-dealloc,combine-quantum-alloc,canonicalize,factor-quantum-alloc,memtoreg),add-wireset,func.func(assign-wire-indices),qubit-mapping{device=file(%QPU_ARCH%)},func.func(delay-measurements,regtomem),symbol-dce,iqm-gate-set-mapping"
+  platform-lowering-config: "func.func(const-prop-complex,canonicalize,cse,lift-array-alloc),globalize-array-values,state-prep,unitary-synthesis,canonicalize,apply-op-specialization,aggressive-early-inlining,expand-measurements,unrolling-pipeline,decomposition{enable-patterns=U3ToRotations},func.func(lower-to-cfg),canonicalize,func.func(multicontrol-decomposition),iqm-gate-set-mapping,func.func(add-dealloc,combine-quantum-alloc,canonicalize,factor-quantum-alloc,memtoreg),add-wireset,func.func(assign-wire-indices),qubit-mapping{device=file(%QPU_ARCH%)},func.func(delay-measurements,regtomem),symbol-dce,iqm-gate-set-mapping"
   # Tell the rest-qpu that we are generating IQM JSON.
   codegen-emission: iqm
   # Library mode is only for simulators, physical backends must turn this off
diff --git a/runtime/cudaq/platform/default/rest/helpers/oqc/oqc.yml b/runtime/cudaq/platform/default/rest/helpers/oqc/oqc.yml
index 6a8a46c066..fb02c57d96 100644
--- a/runtime/cudaq/platform/default/rest/helpers/oqc/oqc.yml
+++ b/runtime/cudaq/platform/default/rest/helpers/oqc/oqc.yml
@@ -16,7 +16,7 @@ config:
   # Add the rest-qpu library to the link list
   link-libs: ["-lcudaq-rest-qpu"]
   # Define the lowering pipeline
-  platform-lowering-config: "const-prop-complex,canonicalize,cse,lift-array-value,state-prep,unitary-synthesis,canonicalize,apply-op-specialization,aggressive-early-inlining,expand-measurements,unrolling-pipeline,decomposition{enable-patterns=U3ToRotations},func.func(lower-to-cfg),canonicalize,func.func(multicontrol-decomposition),oqc-gate-set-mapping,func.func(add-dealloc,combine-quantum-alloc,canonicalize,factor-quantum-alloc,memtoreg),add-wireset,func.func(assign-wire-indices),qubit-mapping{device=file(%QPU_ARCH%)},func.func(regtomem),symbol-dce"
+  platform-lowering-config: "func.func(const-prop-complex,canonicalize,cse,lift-array-alloc),globalize-array-values,state-prep,unitary-synthesis,canonicalize,apply-op-specialization,aggressive-early-inlining,expand-measurements,unrolling-pipeline,decomposition{enable-patterns=U3ToRotations},func.func(lower-to-cfg),canonicalize,func.func(multicontrol-decomposition),oqc-gate-set-mapping,func.func(add-dealloc,combine-quantum-alloc,canonicalize,factor-quantum-alloc,memtoreg),add-wireset,func.func(assign-wire-indices),qubit-mapping{device=file(%QPU_ARCH%)},func.func(regtomem),symbol-dce"
   # Tell the rest-qpu that we are generating QIR.
   codegen-emission: qir-base
   # Library mode is only for simulators, physical backends must turn this off
diff --git a/runtime/cudaq/platform/default/rest/helpers/quantinuum/quantinuum.yml b/runtime/cudaq/platform/default/rest/helpers/quantinuum/quantinuum.yml
index 21cc45be1e..efd4eafdf6 100644
--- a/runtime/cudaq/platform/default/rest/helpers/quantinuum/quantinuum.yml
+++ b/runtime/cudaq/platform/default/rest/helpers/quantinuum/quantinuum.yml
@@ -16,7 +16,7 @@ config:
   # Add the rest-qpu library to the link list
   link-libs: ["-lcudaq-rest-qpu"]
   # Define the lowering pipeline
-  platform-lowering-config: "const-prop-complex,canonicalize,cse,lift-array-value,state-prep,unitary-synthesis,canonicalize,apply-op-specialization,aggressive-early-inlining,expand-measurements,unrolling-pipeline,decomposition{enable-patterns=U3ToRotations},func.func(lower-to-cfg),canonicalize,func.func(multicontrol-decomposition),quantinuum-gate-set-mapping"
+  platform-lowering-config: "func.func(const-prop-complex,canonicalize,cse,lift-array-alloc),globalize-array-values,state-prep,unitary-synthesis,canonicalize,apply-op-specialization,aggressive-early-inlining,expand-measurements,unrolling-pipeline,decomposition{enable-patterns=U3ToRotations},func.func(lower-to-cfg),canonicalize,func.func(multicontrol-decomposition),quantinuum-gate-set-mapping"
   # Tell the rest-qpu that we are generating Adaptive QIR.
   codegen-emission: qir-adaptive
   # Library mode is only for simulators, physical backends must turn this off
diff --git a/targettests/Remote-Sim/qvector_init_from_vector.cpp b/targettests/Remote-Sim/qvector_init_from_vector.cpp
index c55ede9020..16db5bdbad 100644
--- a/targettests/Remote-Sim/qvector_init_from_vector.cpp
+++ b/targettests/Remote-Sim/qvector_init_from_vector.cpp
@@ -19,30 +19,28 @@
 
 __qpu__ void test_large_double_constant_array() {
   std::vector<double> vec(1ULL << 19);
-  vec[0]= M_SQRT1_2/vec.size();
-  vec[1]= M_SQRT1_2/vec.size();
+  vec[0] = M_SQRT1_2 / vec.size();
+  vec[1] = M_SQRT1_2 / vec.size();
   for (std::size_t i = 2; i < vec.size(); i++) {
-    vec[i]= 0;
+    vec[i] = 0;
   }
   cudaq::qvector v(vec);
 }
 
 __qpu__ void test_complex_constant_array() {
-   cudaq::qvector v(std::vector<cudaq::complex>({ M_SQRT1_2, M_SQRT1_2, 0., 0.}));
+  cudaq::qvector v(std::vector<cudaq::complex>({M_SQRT1_2, M_SQRT1_2, 0., 0.}));
 }
 
 __qpu__ void test_complex_constant_array2() {
-   cudaq::qvector v1(std::vector<cudaq::complex>({ M_SQRT1_2, M_SQRT1_2, 0., 0.}));
-   cudaq::qvector v2(std::vector<cudaq::complex>({ 0., 0., M_SQRT1_2, M_SQRT1_2}));
+  cudaq::qvector v1(
+      std::vector<cudaq::complex>({M_SQRT1_2, M_SQRT1_2, 0., 0.}));
+  cudaq::qvector v2(
+      std::vector<cudaq::complex>({0., 0., M_SQRT1_2, M_SQRT1_2}));
 }
 
 __qpu__ void test_complex_constant_array3() {
-   cudaq::qvector v({
-    cudaq::complex(M_SQRT1_2),
-    cudaq::complex(M_SQRT1_2),
-    cudaq::complex(0.0),
-    cudaq::complex(0.0)
-  });
+  cudaq::qvector v({cudaq::complex(M_SQRT1_2), cudaq::complex(M_SQRT1_2),
+                    cudaq::complex(0.0), cudaq::complex(0.0)});
 }
 
 __qpu__ void test_complex_array_param(std::vector<cudaq::complex> inState) {
@@ -50,7 +48,7 @@ __qpu__ void test_complex_array_param(std::vector<cudaq::complex> inState) {
 }
 
 __qpu__ void test_real_constant_array() {
-  cudaq::qvector v({ M_SQRT1_2, M_SQRT1_2, 0., 0.});
+  cudaq::qvector v({M_SQRT1_2, M_SQRT1_2, 0., 0.});
 }
 
 __qpu__ void test_real_array_param(std::vector<cudaq::real> inState) {
@@ -65,7 +63,7 @@ __qpu__ void test_float_array_param(std::vector<float> inState) {
   cudaq::qvector q = inState;
 }
 
-void printCounts(cudaq::sample_result& result) {
+void printCounts(cudaq::sample_result &result) {
   std::vector<std::string> values{};
   for (auto &&[bits, counts] : result) {
     values.push_back(bits);
@@ -78,155 +76,177 @@ void printCounts(cudaq::sample_result& result) {
 }
 
 int main() {
-    {
-      auto counts = cudaq::sample(test_large_double_constant_array);
-      printCounts(counts);
-    }
+  {
+    auto counts = cudaq::sample(test_large_double_constant_array);
+    std::cout << "Part 1\n";
+    printCounts(counts);
+  }
 
-// CHECK: 0000000000000000000
-// CHECK: 1000000000000000000
+  // CHECK-LABEL: Part 1{{$}}
+  // CHECK: 0000000000000000000
+  // CHECK: 1000000000000000000
 
-    {
-      auto counts = cudaq::sample(test_complex_constant_array);
-      printCounts(counts);
-    }
+  {
+    auto counts = cudaq::sample(test_complex_constant_array);
+    std::cout << "Part 2\n";
+    printCounts(counts);
+  }
 
-// CHECK: 00
-// CHECK: 10
+  // CHECK-LABEL: Part 2
+  // CHECK: 00
+  // CHECK: 10
 
-    {
-      auto counts = cudaq::sample(test_complex_constant_array2);
-      printCounts(counts);
-    }
+  {
+    auto counts = cudaq::sample(test_complex_constant_array2);
+    std::cout << "Part 3\n";
+    printCounts(counts);
+  }
 
-// CHECK: 0001
-// CHECK: 0011
-// CHECK: 1001
-// CHECK: 1011
+  // CHECK-LABEL: Part 3
+  // CHECK: 0001
+  // CHECK: 0011
+  // CHECK: 1001
+  // CHECK: 1011
 
-    {
-      auto counts = cudaq::sample(test_complex_constant_array3);
-      printCounts(counts);
-    }
+  {
+    auto counts = cudaq::sample(test_complex_constant_array3);
+    std::cout << "Part 4\n";
+    printCounts(counts);
+  }
 
-// CHECK: 00
-// CHECK: 10
+  // CHECK-LABEL: Part 4
+  // CHECK: 00
+  // CHECK: 10
 
-    {
-      auto counts = cudaq::sample(test_real_constant_array);
-      printCounts(counts);
-    }
+  {
+    auto counts = cudaq::sample(test_real_constant_array);
+    std::cout << "Part 5\n";
+    printCounts(counts);
+  }
 
-// CHECK: 00
-// CHECK: 10
+  // CHECK-LABEL: Part 5
+  // CHECK: 00
+  // CHECK: 10
 
+  {
+    std::vector<cudaq::complex> vec{M_SQRT1_2, M_SQRT1_2, 0., 0.};
+    std::vector<cudaq::complex> vec1{0., 0., M_SQRT1_2, M_SQRT1_2};
     {
-      std::vector<cudaq::complex> vec{M_SQRT1_2, M_SQRT1_2, 0., 0.};
-      std::vector<cudaq::complex> vec1{0., 0., M_SQRT1_2, M_SQRT1_2};
-      {
-          // Passing state data as argument (kernel mode)
-          auto counts = cudaq::sample(test_complex_array_param, vec);
-          printCounts(counts);
-
-          counts = cudaq::sample(test_complex_array_param, vec1);
-          printCounts(counts);
-      }
-
-// CHECK: 00
-// CHECK: 10
-
-// CHECK: 01
-// CHECK: 11
-
-      {
-          // Passing state data as argument (builder mode)
-          auto [kernel, v] = cudaq::make_kernel<std::vector<cudaq::complex>>();
-          auto qubits = kernel.qalloc(v);
-
-          auto counts = cudaq::sample(kernel, vec);
-          printCounts(counts);
-
-          counts = cudaq::sample(kernel, vec1);
-          printCounts(counts);
-      }
+      // Passing state data as argument (kernel mode)
+      auto counts = cudaq::sample(test_complex_array_param, vec);
+      std::cout << "Part 6\n";
+      printCounts(counts);
+
+      counts = cudaq::sample(test_complex_array_param, vec1);
+      printCounts(counts);
     }
 
-// CHECK: 00
-// CHECK: 10
+    // CHECK-LABEL: Part 6
+    // CHECK: 00
+    // CHECK: 10
 
-// CHECK: 01
-// CHECK: 11
+    // CHECK: 01
+    // CHECK: 11
 
     {
-      std::vector<cudaq::real> vec{M_SQRT1_2, M_SQRT1_2, 0., 0.};
-      std::vector<cudaq::real> vec1{0., 0., M_SQRT1_2, M_SQRT1_2};
-      {
-          // Passing state data as argument (kernel mode)
-          auto counts = cudaq::sample(test_real_array_param, vec);
-          printCounts(counts);
+      // Passing state data as argument (builder mode)
+      auto [kernel, v] = cudaq::make_kernel<std::vector<cudaq::complex>>();
+      auto qubits = kernel.qalloc(v);
 
-          counts = cudaq::sample(test_real_array_param, vec1);
-          printCounts(counts);
-      }
-
-// CHECK: 00
-// CHECK: 10
-
-// CHECK: 01
-// CHECK: 11
-
-      {
-          // Passing state data as argument (builder mode)
-          auto [kernel, v] = cudaq::make_kernel<std::vector<cudaq::real>>();
-          auto qubits = kernel.qalloc(v);
-
-          auto counts = cudaq::sample(kernel, vec);
-          printCounts(counts);
+      auto counts = cudaq::sample(kernel, vec);
+      std::cout << "Part 7\n";
+      printCounts(counts);
 
-          counts = cudaq::sample(kernel, vec1);
-          printCounts(counts);
-      }
+      counts = cudaq::sample(kernel, vec1);
+      printCounts(counts);
+    }
+  }
 
-// CHECK: 00
-// CHECK: 10
+  // CHECK-LABEL: Part 7
+  // CHECK: 00
+  // CHECK: 10
 
-// CHECK: 01
-// CHECK: 11
-    }
+  // CHECK: 01
+  // CHECK: 11
 
+  {
+    std::vector<cudaq::real> vec{M_SQRT1_2, M_SQRT1_2, 0., 0.};
+    std::vector<cudaq::real> vec1{0., 0., M_SQRT1_2, M_SQRT1_2};
     {
-      std::vector<double> vec{M_SQRT1_2, M_SQRT1_2, 0., 0.};
-      std::vector<double> vec1{0., 0., M_SQRT1_2, M_SQRT1_2};
-
       // Passing state data as argument (kernel mode)
-      auto counts = cudaq::sample(test_double_array_param, vec);
+      auto counts = cudaq::sample(test_real_array_param, vec);
+      std::cout << "Part 8\n";
       printCounts(counts);
 
-      counts = cudaq::sample(test_double_array_param, vec1);
+      counts = cudaq::sample(test_real_array_param, vec1);
       printCounts(counts);
     }
 
-// CHECK: 00
-// CHECK: 10
+    // CHECK-LABEL: Part 8
+    // CHECK: 00
+    // CHECK: 10
 
-// CHECK: 01
-// CHECK: 11
+    // CHECK: 01
+    // CHECK: 11
 
     {
-      std::vector<float> vec{M_SQRT1_2, M_SQRT1_2, 0., 0.};
-      std::vector<float> vec1{0., 0., M_SQRT1_2, M_SQRT1_2};
+      // Passing state data as argument (builder mode)
+      auto [kernel, v] = cudaq::make_kernel<std::vector<cudaq::real>>();
+      auto qubits = kernel.qalloc(v);
 
-      // Passing state data as argument (kernel mode)
-      auto counts = cudaq::sample(test_float_array_param, vec);
+      auto counts = cudaq::sample(kernel, vec);
+      std::cout << "Part 9\n";
       printCounts(counts);
 
-      counts = cudaq::sample(test_float_array_param, vec1);
+      counts = cudaq::sample(kernel, vec1);
       printCounts(counts);
     }
 
-// CHECK: 00
-// CHECK: 10
+    // CHECK-LABEL: Part 9
+    // CHECK: 00
+    // CHECK: 10
+
+    // CHECK: 01
+    // CHECK: 11
+  }
+
+  {
+    std::vector<double> vec{M_SQRT1_2, M_SQRT1_2, 0., 0.};
+    std::vector<double> vec1{0., 0., M_SQRT1_2, M_SQRT1_2};
+
+    // Passing state data as argument (kernel mode)
+    auto counts = cudaq::sample(test_double_array_param, vec);
+    std::cout << "Part 10\n";
+    printCounts(counts);
+
+    counts = cudaq::sample(test_double_array_param, vec1);
+    printCounts(counts);
+  }
+
+  // CHECK-LABEL: Part 10
+  // CHECK: 00
+  // CHECK: 10
+
+  // CHECK: 01
+  // CHECK: 11
+
+  {
+    std::vector<float> vec{M_SQRT1_2, M_SQRT1_2, 0., 0.};
+    std::vector<float> vec1{0., 0., M_SQRT1_2, M_SQRT1_2};
+
+    // Passing state data as argument (kernel mode)
+    auto counts = cudaq::sample(test_float_array_param, vec);
+    std::cout << "Part 11\n";
+    printCounts(counts);
+
+    counts = cudaq::sample(test_float_array_param, vec1);
+    printCounts(counts);
+  }
+
+  // CHECK-LABEL: Part 11
+  // CHECK: 00
+  // CHECK: 10
 
-// CHECK: 01
-// CHECK: 11
+  // CHECK: 01
+  // CHECK: 11
 }
diff --git a/targettests/TargetConfig/RegressionValidation/anyon.config b/targettests/TargetConfig/RegressionValidation/anyon.config
index a281c7a156..5c81c0c3e0 100644
--- a/targettests/TargetConfig/RegressionValidation/anyon.config
+++ b/targettests/TargetConfig/RegressionValidation/anyon.config
@@ -20,7 +20,7 @@
 # Define the lowering pipeline. telegraph-8q has an 8-qubit ring topology, so mapping
 # uses ring(8).
 # Berkeley-25q uses a bidiratctional connectivity lattice with 8 connectivity per qubit in the bulk.
-# CHECK-DAG: PLATFORM_LOWERING_CONFIG="const-prop-complex,canonicalize,cse,lift-array-value,state-prep,unitary-synthesis,canonicalize,apply-op-specialization,aggressive-early-inlining,expand-measurements,unrolling-pipeline,decomposition{enable-patterns=U3ToRotations},func.func(lower-to-cfg),canonicalize,func.func(multicontrol-decomposition),anyon-%Q_GATE%-set-mapping,func.func(add-dealloc,combine-quantum-alloc,canonicalize,factor-quantum-alloc,memtoreg),add-wireset,func.func(assign-wire-indices),qubit-mapping{device=file(%QPU_ARCH%)},func.func(regtomem),symbol-dce"
+# CHECK-DAG: PLATFORM_LOWERING_CONFIG="func.func(const-prop-complex,canonicalize,cse,lift-array-alloc),globalize-array-values,state-prep,unitary-synthesis,canonicalize,apply-op-specialization,aggressive-early-inlining,expand-measurements,unrolling-pipeline,decomposition{enable-patterns=U3ToRotations},func.func(lower-to-cfg),canonicalize,func.func(multicontrol-decomposition),anyon-%Q_GATE%-set-mapping,func.func(add-dealloc,combine-quantum-alloc,canonicalize,factor-quantum-alloc,memtoreg),add-wireset,func.func(assign-wire-indices),qubit-mapping{device=file(%QPU_ARCH%)},func.func(regtomem),symbol-dce"
 
 
 # Tell the rest-qpu that we are generating QIR.
@@ -49,4 +49,4 @@ while [ $# -gt 1 ]; do
 		;;
 	esac
 	shift 2
-done
\ No newline at end of file
+done
diff --git a/targettests/TargetConfig/RegressionValidation/ionq.config b/targettests/TargetConfig/RegressionValidation/ionq.config
index ca18d8286a..1d11dbd351 100644
--- a/targettests/TargetConfig/RegressionValidation/ionq.config
+++ b/targettests/TargetConfig/RegressionValidation/ionq.config
@@ -18,7 +18,7 @@
 # CHECK-DAG: LINKLIBS="${LINKLIBS} -lcudaq-rest-qpu"
 
 # Define the lowering pipeline
-# CHECK-DAG: PLATFORM_LOWERING_CONFIG="const-prop-complex,canonicalize,cse,lift-array-value,state-prep,unitary-synthesis,canonicalize,apply-op-specialization,aggressive-early-inlining,expand-measurements,unrolling-pipeline,decomposition{enable-patterns=U3ToRotations},func.func(lower-to-cfg),canonicalize,func.func(multicontrol-decomposition),ionq-gate-set-mapping"
+# CHECK-DAG: PLATFORM_LOWERING_CONFIG="func.func(const-prop-complex,canonicalize,cse,lift-array-alloc),globalize-array-values,state-prep,unitary-synthesis,canonicalize,apply-op-specialization,aggressive-early-inlining,expand-measurements,unrolling-pipeline,decomposition{enable-patterns=U3ToRotations},func.func(lower-to-cfg),canonicalize,func.func(multicontrol-decomposition),ionq-gate-set-mapping"
 
 # Tell the rest-qpu that we are generating QIR.
 # CHECK-DAG: CODEGEN_EMISSION=qir-base
diff --git a/targettests/TargetConfig/RegressionValidation/iqm.config b/targettests/TargetConfig/RegressionValidation/iqm.config
index 073e269408..4db04b874e 100644
--- a/targettests/TargetConfig/RegressionValidation/iqm.config
+++ b/targettests/TargetConfig/RegressionValidation/iqm.config
@@ -20,7 +20,7 @@
 # Define the lowering pipeline, here we lower to Base QIR
 # Note: the runtime will dynamically substitute %QPU_ARCH% based on
 # qpu-architecture
-# CHECK-DAG: PLATFORM_LOWERING_CONFIG="const-prop-complex,canonicalize,cse,lift-array-value,state-prep,unitary-synthesis,canonicalize,apply-op-specialization,aggressive-early-inlining,expand-measurements,unrolling-pipeline,decomposition{enable-patterns=U3ToRotations},func.func(lower-to-cfg),canonicalize,func.func(multicontrol-decomposition),iqm-gate-set-mapping,func.func(add-dealloc,combine-quantum-alloc,canonicalize,factor-quantum-alloc,memtoreg),add-wireset,func.func(assign-wire-indices),qubit-mapping{device=file(%QPU_ARCH%)},func.func(delay-measurements,regtomem),symbol-dce,iqm-gate-set-mapping"
+# CHECK-DAG: PLATFORM_LOWERING_CONFIG="func.func(const-prop-complex,canonicalize,cse,lift-array-alloc),globalize-array-values,state-prep,unitary-synthesis,canonicalize,apply-op-specialization,aggressive-early-inlining,expand-measurements,unrolling-pipeline,decomposition{enable-patterns=U3ToRotations},func.func(lower-to-cfg),canonicalize,func.func(multicontrol-decomposition),iqm-gate-set-mapping,func.func(add-dealloc,combine-quantum-alloc,canonicalize,factor-quantum-alloc,memtoreg),add-wireset,func.func(assign-wire-indices),qubit-mapping{device=file(%QPU_ARCH%)},func.func(delay-measurements,regtomem),symbol-dce,iqm-gate-set-mapping"
 
 # Tell the rest-qpu that we are generating IQM JSON.
 # CHECK-DAG: CODEGEN_EMISSION=iqm
diff --git a/targettests/TargetConfig/RegressionValidation/oqc.config b/targettests/TargetConfig/RegressionValidation/oqc.config
index adbceff012..bd81dfe903 100644
--- a/targettests/TargetConfig/RegressionValidation/oqc.config
+++ b/targettests/TargetConfig/RegressionValidation/oqc.config
@@ -20,7 +20,7 @@
 # Define the lowering pipeline. Lucy has an 8-qubit ring topology, so mapping
 # uses ring(8).
 # Toshiko uses a Kagome lattice with 2-3 connectivity per qubit
-# CHECK-DAG: PLATFORM_LOWERING_CONFIG="const-prop-complex,canonicalize,cse,lift-array-value,state-prep,unitary-synthesis,canonicalize,apply-op-specialization,aggressive-early-inlining,expand-measurements,unrolling-pipeline,decomposition{enable-patterns=U3ToRotations},func.func(lower-to-cfg),canonicalize,func.func(multicontrol-decomposition),oqc-gate-set-mapping,func.func(add-dealloc,combine-quantum-alloc,canonicalize,factor-quantum-alloc,memtoreg),add-wireset,func.func(assign-wire-indices),qubit-mapping{device=file(%QPU_ARCH%)},func.func(regtomem),symbol-dce"
+# CHECK-DAG: PLATFORM_LOWERING_CONFIG="func.func(const-prop-complex,canonicalize,cse,lift-array-alloc),globalize-array-values,state-prep,unitary-synthesis,canonicalize,apply-op-specialization,aggressive-early-inlining,expand-measurements,unrolling-pipeline,decomposition{enable-patterns=U3ToRotations},func.func(lower-to-cfg),canonicalize,func.func(multicontrol-decomposition),oqc-gate-set-mapping,func.func(add-dealloc,combine-quantum-alloc,canonicalize,factor-quantum-alloc,memtoreg),add-wireset,func.func(assign-wire-indices),qubit-mapping{device=file(%QPU_ARCH%)},func.func(regtomem),symbol-dce"
 
 
 # Tell the rest-qpu that we are generating QIR.
diff --git a/targettests/TargetConfig/RegressionValidation/quantinuum.config b/targettests/TargetConfig/RegressionValidation/quantinuum.config
index c899422b0f..d7f7c32d85 100644
--- a/targettests/TargetConfig/RegressionValidation/quantinuum.config
+++ b/targettests/TargetConfig/RegressionValidation/quantinuum.config
@@ -18,7 +18,7 @@
 # CHECK-DAG: LINKLIBS="${LINKLIBS} -lcudaq-rest-qpu"
 
 # Define the lowering pipeline, here we lower to Adaptive QIR
-# CHECK-DAG: PLATFORM_LOWERING_CONFIG="const-prop-complex,canonicalize,cse,lift-array-value,state-prep,unitary-synthesis,canonicalize,apply-op-specialization,aggressive-early-inlining,expand-measurements,unrolling-pipeline,decomposition{enable-patterns=U3ToRotations},func.func(lower-to-cfg),canonicalize,func.func(multicontrol-decomposition),quantinuum-gate-set-mapping"
+# CHECK-DAG: PLATFORM_LOWERING_CONFIG="func.func(const-prop-complex,canonicalize,cse,lift-array-alloc),globalize-array-values,state-prep,unitary-synthesis,canonicalize,apply-op-specialization,aggressive-early-inlining,expand-measurements,unrolling-pipeline,decomposition{enable-patterns=U3ToRotations},func.func(lower-to-cfg),canonicalize,func.func(multicontrol-decomposition),quantinuum-gate-set-mapping"
 
 # Tell the rest-qpu that we are generating QIR.
 # CHECK-DAG: CODEGEN_EMISSION=qir-adaptive
diff --git a/test/AST-Quake/custom_op_concrete_matrix.cpp b/test/AST-Quake/custom_op_concrete_matrix.cpp
index f7a5b96734..5ab0313fab 100644
--- a/test/AST-Quake/custom_op_concrete_matrix.cpp
+++ b/test/AST-Quake/custom_op_concrete_matrix.cpp
@@ -6,7 +6,9 @@
  * the terms of the Apache License 2.0 which accompanies this distribution.    *
  ******************************************************************************/
 
-// RUN: cudaq-quake %cpp_std %s | cudaq-opt -const-prop-complex -lift-array-value -get-concrete-matrix | FileCheck %s
+// clang-format off
+// RUN: cudaq-quake %cpp_std %s | cudaq-opt -const-prop-complex -lift-array-alloc -globalize-array-values -get-concrete-matrix | FileCheck %s
+// clang-format on
 
 #include <cudaq.h>
 
@@ -16,13 +18,13 @@ CUDAQ_REGISTER_OPERATION(custom_h, 1, 0,
 CUDAQ_REGISTER_OPERATION(custom_cnot, 2, 0,
                          {1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0})
 
-
 __qpu__ void kernel_1() {
   cudaq::qubit q, r;
   custom_h(q);
   custom_cnot(q, r);
 }
 
+// clang-format off
 // CHECK-LABEL:   func.func @__nvqpp__mlirgen__function_kernel_1._Z8kernel_1v() attributes {"cudaq-entrypoint", "cudaq-kernel", no_this} {
 // CHECK:           %[[VAL_0:.*]] = quake.alloca !quake.ref
 // CHECK:           %[[VAL_1:.*]] = quake.alloca !quake.ref
@@ -31,5 +33,5 @@ __qpu__ void kernel_1() {
 // CHECK:           return
 // CHECK:         }
 
-// CHECK:         cc.global constant @__nvqpp__mlirgen__function_custom_h_generator_1._Z20custom_h_generator_{{.*}}vectorId{{.*}}.rodata_{{[0-9]+}} (dense<[(0.70710678118654757,0.000000e+00), (0.70710678118654757,0.000000e+00), (0.70710678118654757,0.000000e+00), (-0.70710678118654757,0.000000e+00)]> : tensor<4xcomplex<f64>>) : !cc.array<complex<f64> x 4>
-// CHECK:         cc.global constant @__nvqpp__mlirgen__function_custom_cnot_generator_2._Z23custom_cnot_generator_{{.*}}vectorId{{.*}}.rodata_{{[0-9]+}} (dense<[(1.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (1.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (1.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (1.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00)]> : tensor<16xcomplex<f64>>) : !cc.array<complex<f64> x 16>
+// CHECK-DAG:     cc.global constant @__nvqpp__mlirgen__function_custom_h_generator_1._Z20custom_h_generator_{{.*}}vectorId{{.*}}.rodata_{{[0-9]+}} (dense<[(0.70710678118654757,0.000000e+00), (0.70710678118654757,0.000000e+00), (0.70710678118654757,0.000000e+00), (-0.70710678118654757,0.000000e+00)]> : tensor<4xcomplex<f64>>) : !cc.array<complex<f64> x 4>
+// CHECK-DAG:     cc.global constant @__nvqpp__mlirgen__function_custom_cnot_generator_2._Z23custom_cnot_generator_{{.*}}vectorId{{.*}}.rodata_{{[0-9]+}} (dense<[(1.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (1.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (1.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (1.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00)]> : tensor<16xcomplex<f64>>) : !cc.array<complex<f64> x 16>
diff --git a/test/Quake/lift_array.qke b/test/Quake/lift_array.qke
index 73a450d42c..b7cbcec5cd 100644
--- a/test/Quake/lift_array.qke
+++ b/test/Quake/lift_array.qke
@@ -6,7 +6,8 @@
 // the terms of the Apache License 2.0 which accompanies this distribution.   //
 // ========================================================================== //
 
-// RUN: cudaq-opt -lift-array-value %s | FileCheck %s
+// RUN: cudaq-opt -lift-array-alloc %s | FileCheck %s
+// RXN: cudaq-opt -lift-array-alloc -globalize-array-values %s | FileCheck --check-prefix=GLOBAL %s
 
 func.func @__nvqpp__mlirgen__function_test_complex_constant_array._Z27test_complex_constant_arrayv() attributes {"cudaq-entrypoint", "cudaq-kernel", no_this} {
   %cst = complex.constant [0.707106769 : f32, 0.000000e+00 : f32] : complex<f32>
@@ -26,12 +27,21 @@ func.func @__nvqpp__mlirgen__function_test_complex_constant_array._Z27test_compl
 }
 
 // CHECK-LABEL:   func.func @__nvqpp__mlirgen__function_test_complex_constant_array._Z27test_complex_constant_arrayv() attributes {"cudaq-entrypoint", "cudaq-kernel", no_this} {
-// CHECK:           %[[VAL_0:.*]] = cc.address_of @__nvqpp__mlirgen__function_test_complex_constant_array._Z27test_complex_constant_arrayv.rodata_{{[0-9]+}} : !cc.ptr<!cc.array<complex<f32> x 4>>
-// CHECK:           %[[VAL_1:.*]] = quake.alloca !quake.veq<2>
-// CHECK:           %[[VAL_2:.*]] = quake.init_state %[[VAL_1]], %[[VAL_0]] : (!quake.veq<2>, !cc.ptr<!cc.array<complex<f32> x 4>>) -> !quake.veq<2>
+// CHECK:           %[[VAL_0:.*]] = cc.const_array {{\[\[}}0.707106769 : f32, 0.000000e+00 : f32], [0.707106769 : f32, 0.000000e+00 : f32], [0.000000e+00 : f32, 0.000000e+00 : f32], [0.000000e+00 : f32, 0.000000e+00 : f32]] : !cc.array<complex<f32> x 4>
+// CHECK:           %[[VAL_1:.*]] = cc.alloca !cc.array<complex<f32> x 4>
+// CHECK:           cc.store %[[VAL_0]], %[[VAL_1]] : !cc.ptr<!cc.array<complex<f32> x 4>>
+// CHECK:           %[[VAL_2:.*]] = cc.cast %[[VAL_1]] : (!cc.ptr<!cc.array<complex<f32> x 4>>) -> !cc.ptr<complex<f32>>
+// CHECK:           %[[VAL_3:.*]] = quake.alloca !quake.veq<2>
+// CHECK:           %[[VAL_4:.*]] = quake.init_state %[[VAL_3]], %[[VAL_2]] : (!quake.veq<2>, !cc.ptr<complex<f32>>) -> !quake.veq<2>
 // CHECK:           return
 // CHECK:         }
 
+// GLOBAL-LABEL:   func.func @__nvqpp__mlirgen__function_test_complex_constant_array._Z27test_complex_constant_arrayv() attributes {"cudaq-entrypoint", "cudaq-kernel", no_this} {
+// GLOBAL:           %[[VAL_0:.*]] = cc.address_of @__nvqpp__mlirgen__function_test_complex_constant_array._Z27test_complex_constant_arrayv.rodata_{{[0-9]+}} : !cc.ptr<!cc.array<complex<f32> x 4>>
+// GLOBAL:           %[[VAL_1:.*]] = quake.alloca !quake.veq<2>
+// GLOBAL:           %[[VAL_2:.*]] = quake.init_state %[[VAL_1]], %[[VAL_0]] : (!quake.veq<2>, !cc.ptr<!cc.array<complex<f32> x 4>>) -> !quake.veq<2>
+// GLOBAL:           return
+// GLOBAL:         }
 
 func.func private @__nvqpp_vectorCopyCtor(!cc.ptr<i8>, i64, i64) -> !cc.ptr<i8>
 
@@ -58,13 +68,25 @@ func.func @__nvqpp__mlirgen__function_custom_h_generator_1._Z20custom_h_generato
 // CHECK-LABEL:   func.func @__nvqpp__mlirgen__function_custom_h_generator_1._Z20custom_h_generator_1v() -> !cc.stdvec<complex<f64>> attributes {"cudaq-entrypoint", "cudaq-kernel", no_this} {
 // CHECK:           %[[VAL_0:.*]] = arith.constant 16 : i64
 // CHECK:           %[[VAL_1:.*]] = arith.constant 4 : i64
-// CHECK:           %[[VAL_2:.*]] = cc.address_of @__nvqpp__mlirgen__function_custom_h_generator_1._Z20custom_h_generator_1v.rodata_{{[0-9]+}} : !cc.ptr<!cc.array<complex<f64> x 4>>
-// CHECK:           %[[VAL_3:.*]] = cc.cast %[[VAL_2]] : (!cc.ptr<!cc.array<complex<f64> x 4>>) -> !cc.ptr<i8>
-// CHECK:           %[[VAL_4:.*]] = call @__nvqpp_vectorCopyCtor(%[[VAL_3]], %[[VAL_1]], %[[VAL_0]]) : (!cc.ptr<i8>, i64, i64) -> !cc.ptr<i8>
-// CHECK:           %[[VAL_5:.*]] = cc.stdvec_init %[[VAL_4]], %[[VAL_1]] : (!cc.ptr<i8>, i64) -> !cc.stdvec<complex<f64>>
-// CHECK:           return %[[VAL_5]] : !cc.stdvec<complex<f64>>
+// CHECK:           %[[VAL_2:.*]] = cc.const_array {{\[\[}}0.70710678118654757, 0.000000e+00], [0.70710678118654757, 0.000000e+00], [0.70710678118654757, 0.000000e+00], [-0.70710678118654757, 0.000000e+00]] : !cc.array<complex<f64> x 4>
+// CHECK:           %[[VAL_3:.*]] = cc.alloca !cc.array<complex<f64> x 4>
+// CHECK:           cc.store %[[VAL_2]], %[[VAL_3]] : !cc.ptr<!cc.array<complex<f64> x 4>>
+// CHECK:           %[[VAL_4:.*]] = cc.cast %[[VAL_3]] : (!cc.ptr<!cc.array<complex<f64> x 4>>) -> !cc.ptr<i8>
+// CHECK:           %[[VAL_5:.*]] = call @__nvqpp_vectorCopyCtor(%[[VAL_4]], %[[VAL_1]], %[[VAL_0]]) : (!cc.ptr<i8>, i64, i64) -> !cc.ptr<i8>
+// CHECK:           %[[VAL_6:.*]] = cc.stdvec_init %[[VAL_5]], %[[VAL_1]] : (!cc.ptr<i8>, i64) -> !cc.stdvec<complex<f64>>
+// CHECK:           return %[[VAL_6]] : !cc.stdvec<complex<f64>>
 // CHECK:         }
 
+// GLOBAL-LABEL:   func.func @__nvqpp__mlirgen__function_custom_h_generator_1._Z20custom_h_generator_1v() -> !cc.stdvec<complex<f64>> attributes {"cudaq-entrypoint", "cudaq-kernel", no_this} {
+// GLOBAL:           %[[VAL_0:.*]] = arith.constant 16 : i64
+// GLOBAL:           %[[VAL_1:.*]] = arith.constant 4 : i64
+// GLOBAL:           %[[VAL_2:.*]] = cc.address_of @__nvqpp__mlirgen__function_custom_h_generator_1._Z20custom_h_generator_1v.rodata_{{[0-9]+}} : !cc.ptr<!cc.array<complex<f64> x 4>>
+// GLOBAL:           %[[VAL_3:.*]] = cc.cast %[[VAL_2]] : (!cc.ptr<!cc.array<complex<f64> x 4>>) -> !cc.ptr<i8>
+// GLOBAL:           %[[VAL_4:.*]] = call @__nvqpp_vectorCopyCtor(%[[VAL_3]], %[[VAL_1]], %[[VAL_0]]) : (!cc.ptr<i8>, i64, i64) -> !cc.ptr<i8>
+// GLOBAL:           %[[VAL_5:.*]] = cc.stdvec_init %[[VAL_4]], %[[VAL_1]] : (!cc.ptr<i8>, i64) -> !cc.stdvec<complex<f64>>
+// GLOBAL:           return %[[VAL_5]] : !cc.stdvec<complex<f64>>
+// GLOBAL:         }
+
 func.func @test2() -> !quake.veq<2> {
   %cst = arith.constant 9.000000e+00 : f64
   %cst_0 = arith.constant 6.000000e+00 : f64
@@ -85,12 +107,21 @@ func.func @test2() -> !quake.veq<2> {
 }
 
 // CHECK-LABEL:   func.func @test2() -> !quake.veq<2> {
-// CHECK:           %[[VAL_0:.*]] = cc.address_of @test2.rodata_{{[0-9]+}} : !cc.ptr<!cc.array<f64 x 4>>
-// CHECK:           %[[VAL_1:.*]] = quake.alloca !quake.veq<2>
-// CHECK:           %[[VAL_2:.*]] = quake.init_state %[[VAL_1]], %[[VAL_0]] : (!quake.veq<2>, !cc.ptr<!cc.array<f64 x 4>>) -> !quake.veq<2>
-// CHECK:           return %[[VAL_2]] : !quake.veq<2>
+// CHECK:           %[[VAL_0:.*]] = cc.const_array [1.000000e+00, 2.000000e+00, 6.000000e+00, 9.000000e+00] : !cc.array<f64 x 4>
+// CHECK:           %[[VAL_1:.*]] = cc.alloca !cc.array<f64 x 4>
+// CHECK:           cc.store %[[VAL_0]], %[[VAL_1]] : !cc.ptr<!cc.array<f64 x 4>>
+// CHECK:           %[[VAL_2:.*]] = quake.alloca !quake.veq<2>
+// CHECK:           %[[VAL_3:.*]] = quake.init_state %[[VAL_2]], %[[VAL_1]] : (!quake.veq<2>, !cc.ptr<!cc.array<f64 x 4>>) -> !quake.veq<2>
+// CHECK:           return %[[VAL_3]] : !quake.veq<2>
 // CHECK:         }
 
-// CHECK-DAG:     cc.global constant @__nvqpp__mlirgen__function_test_complex_constant_array._Z27test_complex_constant_arrayv.rodata_{{[0-9]+}} (dense<[(0.707106769,0.000000e+00), (0.707106769,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00)]> : tensor<4xcomplex<f32>>) : !cc.array<complex<f32> x 4>
-// CHECK-DAG:     cc.global constant @__nvqpp__mlirgen__function_custom_h_generator_1._Z20custom_h_generator_1v.rodata_{{[0-9]+}} (dense<[(0.70710678118654757,0.000000e+00), (0.70710678118654757,0.000000e+00), (0.70710678118654757,0.000000e+00), (-0.70710678118654757,0.000000e+00)]> : tensor<4xcomplex<f64>>) : !cc.array<complex<f64> x 4>
-// CHECK-DAG:     cc.global constant @test2.rodata_{{[0-9]+}} (dense<[1.000000e+00, 2.000000e+00, 6.000000e+00, 9.000000e+00]> : tensor<4xf64>) : !cc.array<f64 x 4>
+// GLOBAL-LABEL:   func.func @test2() -> !quake.veq<2> {
+// GLOBAL:           %[[VAL_0:.*]] = cc.address_of @test2.rodata_{{[0-9]+}} : !cc.ptr<!cc.array<f64 x 4>>
+// GLOBAL:           %[[VAL_1:.*]] = quake.alloca !quake.veq<2>
+// GLOBAL:           %[[VAL_2:.*]] = quake.init_state %[[VAL_1]], %[[VAL_0]] : (!quake.veq<2>, !cc.ptr<!cc.array<f64 x 4>>) -> !quake.veq<2>
+// GLOBAL:           return %[[VAL_2]] : !quake.veq<2>
+// GLOBAL:         }
+
+// GLOBAL-DAG:     cc.global constant @__nvqpp__mlirgen__function_test_complex_constant_array._Z27test_complex_constant_arrayv.rodata_{{[0-9]+}} (dense<[(0.707106769,0.000000e+00), (0.707106769,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00)]> : tensor<4xcomplex<f32>>) : !cc.array<complex<f32> x 4>
+// GLOBAL-DAG:     cc.global constant @__nvqpp__mlirgen__function_custom_h_generator_1._Z20custom_h_generator_1v.rodata_{{[0-9]+}} (dense<[(0.70710678118654757,0.000000e+00), (0.70710678118654757,0.000000e+00), (0.70710678118654757,0.000000e+00), (-0.70710678118654757,0.000000e+00)]> : tensor<4xcomplex<f64>>) : !cc.array<complex<f64> x 4>
+// GLOBAL-DAG:     cc.global constant @test2.rodata_{{[0-9]+}} (dense<[1.000000e+00, 2.000000e+00, 6.000000e+00, 9.000000e+00]> : tensor<4xf64>) : !cc.array<f64 x 4>
diff --git a/tools/nvqpp/nvq++.in b/tools/nvqpp/nvq++.in
index 19dd84db7d..c585a69c89 100644
--- a/tools/nvqpp/nvq++.in
+++ b/tools/nvqpp/nvq++.in
@@ -712,7 +712,7 @@ if ${ENABLE_AGGRESSIVE_EARLY_INLINE}; then
 fi
 if ${ENABLE_DEVICE_CODE_LOADERS}; then
 	RUN_OPT=true
-	OPT_PASSES=$(add_pass_to_pipeline "${OPT_PASSES}" "func.func(quake-add-metadata),const-prop-complex,lift-array-value,func.func(get-concrete-matrix),device-code-loader")
+	OPT_PASSES=$(add_pass_to_pipeline "${OPT_PASSES}" "func.func(quake-add-metadata,const-prop-complex,lift-array-alloc),globalize-array-values,func.func(get-concrete-matrix),device-code-loader")
 fi
 if ${ENABLE_LOWER_TO_CFG}; then
 	RUN_OPT=true

From ea87205b97eb83bc927a2fac7eb725de18416108 Mon Sep 17 00:00:00 2001
From: Eric Schweitz <eschweitz@nvidia.com>
Date: Mon, 21 Oct 2024 08:37:50 -0700
Subject: [PATCH 2/2] Update strings per review comment.

Signed-off-by: Eric Schweitz <eschweitz@nvidia.com>
---
 lib/Optimizer/Transforms/ConstPropComplex.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/lib/Optimizer/Transforms/ConstPropComplex.cpp b/lib/Optimizer/Transforms/ConstPropComplex.cpp
index 3434fc5ff5..3f2f65520f 100644
--- a/lib/Optimizer/Transforms/ConstPropComplex.cpp
+++ b/lib/Optimizer/Transforms/ConstPropComplex.cpp
@@ -179,15 +179,15 @@ class ConstPropComplexPass
                     FloatTruncatePattern, ComplexRePattern, ComplexImPattern>(
         ctx);
 
-    LLVM_DEBUG(llvm::dbgs()
-               << "Before lifting constant array: " << func << '\n');
+    LLVM_DEBUG(llvm::dbgs() << "Before constant propagation of complex values: "
+                            << func << '\n');
 
     if (failed(applyPatternsAndFoldGreedily(func.getOperation(),
                                             std::move(patterns))))
       signalPassFailure();
 
-    LLVM_DEBUG(llvm::dbgs()
-               << "After lifting constant array: " << func << '\n');
+    LLVM_DEBUG(llvm::dbgs() << "After constant propagation of complex values: "
+                            << func << '\n');
   }
 };
 } // namespace