diff --git a/include/cudaq/Optimizer/Builder/Intrinsics.h b/include/cudaq/Optimizer/Builder/Intrinsics.h
index 30ab0e696a1..fa9ce53097f 100644
--- a/include/cudaq/Optimizer/Builder/Intrinsics.h
+++ b/include/cudaq/Optimizer/Builder/Intrinsics.h
@@ -94,41 +94,48 @@ class IRBuilder : public mlir::OpBuilder {
     return genCStringLiteral(loc, module, buffer);
   }
 
+  cc::GlobalOp genVectorOfConstants(mlir::Location loc, mlir::ModuleOp module,
+                                    llvm::StringRef name,
+                                    mlir::DenseElementsAttr values,
+                                    mlir::Type elementType);
+  cc::GlobalOp genVectorOfConstants(
+      mlir::Location loc, mlir::ModuleOp module, llvm::StringRef name,
+      const llvm::SmallVectorImpl<std::complex<double>> &values);
+  cc::GlobalOp genVectorOfConstants(
+      mlir::Location loc, mlir::ModuleOp module, llvm::StringRef name,
+      const llvm::SmallVectorImpl<std::complex<float>> &values);
+
   cc::GlobalOp
   genVectorOfConstants(mlir::Location loc, mlir::ModuleOp module,
-                       mlir::StringRef name,
-                       const std::vector<std::complex<double>> &values);
-  cc::GlobalOp
-  genVectorOfConstants(mlir::Location loc, mlir::ModuleOp module,
-                       mlir::StringRef name,
-                       const std::vector<std::complex<float>> &values);
-
+                       llvm::StringRef name,
+                       const llvm::SmallVectorImpl<double> &values);
   cc::GlobalOp genVectorOfConstants(mlir::Location loc, mlir::ModuleOp module,
-                                    mlir::StringRef name,
-                                    const std::vector<double> &values);
-  cc::GlobalOp genVectorOfConstants(mlir::Location loc, mlir::ModuleOp module,
-                                    mlir::StringRef name,
-                                    const std::vector<float> &values);
+                                    llvm::StringRef name,
+                                    const llvm::SmallVectorImpl<float> &values);
 
-  cc::GlobalOp genVectorOfConstants(mlir::Location loc, mlir::ModuleOp module,
-                                    mlir::StringRef name,
-                                    const std::vector<std::int64_t> &values);
+  cc::GlobalOp
+  genVectorOfConstants(mlir::Location loc, mlir::ModuleOp module,
+                       llvm::StringRef name,
+                       const llvm::SmallVectorImpl<std::int64_t> &values);
 
-  cc::GlobalOp genVectorOfConstants(mlir::Location loc, mlir::ModuleOp module,
-                                    mlir::StringRef name,
-                                    const std::vector<std::int32_t> &values);
+  cc::GlobalOp
+  genVectorOfConstants(mlir::Location loc, mlir::ModuleOp module,
+                       llvm::StringRef name,
+                       const llvm::SmallVectorImpl<std::int32_t> &values);
 
-  cc::GlobalOp genVectorOfConstants(mlir::Location loc, mlir::ModuleOp module,
-                                    mlir::StringRef name,
-                                    const std::vector<std::int16_t> &values);
+  cc::GlobalOp
+  genVectorOfConstants(mlir::Location loc, mlir::ModuleOp module,
+                       llvm::StringRef name,
+                       const llvm::SmallVectorImpl<std::int16_t> &values);
 
-  cc::GlobalOp genVectorOfConstants(mlir::Location loc, mlir::ModuleOp module,
-                                    mlir::StringRef name,
-                                    const std::vector<std::int8_t> &values);
+  cc::GlobalOp
+  genVectorOfConstants(mlir::Location loc, mlir::ModuleOp module,
+                       llvm::StringRef name,
+                       const llvm::SmallVectorImpl<std::int8_t> &values);
 
   cc::GlobalOp genVectorOfConstants(mlir::Location loc, mlir::ModuleOp module,
-                                    mlir::StringRef name,
-                                    const std::vector<bool> &values);
+                                    llvm::StringRef name,
+                                    const llvm::SmallVectorImpl<bool> &values);
 
   /// Load an intrinsic into \p module. The intrinsic to load has name \p name.
   /// This will automatically load any intrinsics that \p name depends upon.
diff --git a/include/cudaq/Optimizer/Transforms/Passes.td b/include/cudaq/Optimizer/Transforms/Passes.td
index 9ca3810f395..da6f3163b3e 100644
--- a/include/cudaq/Optimizer/Transforms/Passes.td
+++ b/include/cudaq/Optimizer/Transforms/Passes.td
@@ -160,7 +160,7 @@ def CombineQuantumAllocations :
   let dependentDialects = ["cudaq::cc::CCDialect", "quake::QuakeDialect"];
 }
 
-def ConstPropComplex : Pass<"const-prop-complex", "mlir::ModuleOp"> {
+def ConstPropComplex : Pass<"const-prop-complex", "mlir::func::FuncOp"> {
   let summary = "Create and propagate complex constants.";
   let description = [{
     Rewrite the complex.CreateOp to complex.ConstantOp when possible.
@@ -383,11 +383,11 @@ def GenerateKernelExecution : Pass<"kernel-execution", "mlir::ModuleOp"> {
 
 def GetConcreteMatrix : Pass<"get-concrete-matrix", "mlir::func::FuncOp"> {
   let summary =
-    "Replace the unitary matrix generator function with concrete matrix.";
+    "Replace the unitary matrix generator function with a constant matrix.";
   let description = [{
     Given a custom operation whose generator attribute is another function 
     within the module, such that if `LiftArrayAlloc` pass has run, there will
-    be a global constant within the module which holds the concrete matrix 
+    be a global constant within the module which holds the constant matrix 
     representation for the custom operation. This pass will find that global
     variable and update the custom operation to directly point to it. 
 
@@ -424,6 +424,22 @@ def GetConcreteMatrix : Pass<"get-concrete-matrix", "mlir::func::FuncOp"> {
   }];
 }
 
+// GlobalizeArrayValues must be a module pass because it may promote array
+// constants from functions to global constants (changes their scope).
+def GlobalizeArrayValues : Pass<"globalize-array-values", "mlir::ModuleOp"> {
+  let summary = "Convert const_array ops to globals.";
+  let description = [{
+    Often a `const_array` op can be canonicalized into scalar constants that
+    are then constant propagated to their uses in the quake ops. When this
+    happens, the `const_array` may become unused and be eliminated.
+
+    However, there can also be cases where the `const_array` remains alive, such
+    as when it is used in a `state_init` op. In such cases, we may be able to go
+    ahead and replace the `const_array` with a global constant. This pass makes
+    such conversions.
+  }];
+}
+
 // LambdaLifting is a module pass because it may modify the ModuleOp and add
 // new FuncOps.
 def LambdaLifting : Pass<"lambda-lifting", "mlir::ModuleOp"> {
@@ -439,7 +455,7 @@ def LambdaLifting : Pass<"lambda-lifting", "mlir::ModuleOp"> {
   let constructor = "cudaq::opt::createLambdaLiftingPass()";
 }
 
-def LiftArrayAlloc : Pass<"lift-array-value", "mlir::ModuleOp"> {
+def LiftArrayAlloc : Pass<"lift-array-alloc", "mlir::func::FuncOp"> {
   let summary = "Convert constant arrays built on the stack to array values";
   let description = [{
     The bridge or other passes may generate inline code to build an array of
@@ -476,6 +492,10 @@ def LiftArrayAlloc : Pass<"lift-array-value", "mlir::ModuleOp"> {
     updated or escapes the function, it cannot be replaced by a value. If
     it is elements are accessed in a read-only way, it is a legal transform
     and will enable further constant folding in other passes.
+
+    See the globalize array values pass for converting `const_array` values
+    to global constants. Conversion to globals is intentionally deferred to
+    allow constant propagation to take place correctly.
   }];
 
   let dependentDialects = ["mlir::complex::ComplexDialect"];
diff --git a/lib/Optimizer/Builder/Intrinsics.cpp b/lib/Optimizer/Builder/Intrinsics.cpp
index a6cc0ae4775..315743f057d 100644
--- a/lib/Optimizer/Builder/Intrinsics.cpp
+++ b/lib/Optimizer/Builder/Intrinsics.cpp
@@ -450,16 +450,16 @@ LogicalResult IRBuilder::loadIntrinsic(ModuleOp module, StringRef intrinName) {
 }
 
 template <typename T>
-DenseElementsAttr createDenseElementsAttr(const std::vector<T> &values,
+DenseElementsAttr createDenseElementsAttr(const SmallVectorImpl<T> &values,
                                           Type eleTy) {
   auto newValues = ArrayRef<T>(values.data(), values.size());
   auto tensorTy = RankedTensorType::get(values.size(), eleTy);
   return DenseElementsAttr::get(tensorTy, newValues);
 }
 
-DenseElementsAttr createDenseElementsAttr(const std::vector<bool> &values,
-                                          Type eleTy) {
-  std::vector<std::byte> converted;
+static DenseElementsAttr
+createDenseElementsAttr(const SmallVectorImpl<bool> &values, Type eleTy) {
+  SmallVector<std::byte> converted;
   for (auto it = values.begin(); it != values.end(); it++) {
     bool value = *it;
     converted.push_back(std::byte(value));
@@ -470,83 +470,99 @@ DenseElementsAttr createDenseElementsAttr(const std::vector<bool> &values,
   return DenseElementsAttr::get(tensorTy, newValues);
 }
 
-template <typename A>
-cc::GlobalOp buildVectorOfConstantElements(Location loc, ModuleOp module,
-                                           StringRef name,
-                                           const std::vector<A> &values,
-                                           IRBuilder &builder, Type eleTy) {
+static cc::GlobalOp buildVectorOfConstantElements(Location loc, ModuleOp module,
+                                                  StringRef name,
+                                                  DenseElementsAttr &arrayAttr,
+                                                  IRBuilder &builder,
+                                                  Type eleTy) {
   if (auto glob = module.lookupSymbol<cc::GlobalOp>(name))
     return glob;
   auto *ctx = builder.getContext();
   OpBuilder::InsertionGuard guard(builder);
   builder.setInsertionPointToEnd(module.getBody());
-  auto globalTy = cc::ArrayType::get(ctx, eleTy, values.size());
-
-  auto arrayAttr = createDenseElementsAttr(values, eleTy);
+  auto globalTy = cc::ArrayType::get(ctx, eleTy, arrayAttr.size());
   return builder.create<cudaq::cc::GlobalOp>(loc, globalTy, name, arrayAttr,
                                              /*constant=*/true,
                                              /*external=*/false);
 }
 
+template <typename A>
+cc::GlobalOp buildVectorOfConstantElements(Location loc, ModuleOp module,
+                                           StringRef name,
+                                           const SmallVectorImpl<A> &values,
+                                           IRBuilder &builder, Type eleTy) {
+  auto arrayAttr = createDenseElementsAttr(values, eleTy);
+  return buildVectorOfConstantElements(loc, module, name, arrayAttr, builder,
+                                       eleTy);
+}
+
+cc::GlobalOp IRBuilder::genVectorOfConstants(Location loc, ModuleOp module,
+                                             StringRef name,
+                                             DenseElementsAttr values,
+                                             Type elementType) {
+  return buildVectorOfConstantElements(loc, module, name, values, *this,
+                                       elementType);
+}
+
 cc::GlobalOp IRBuilder::genVectorOfConstants(
     Location loc, ModuleOp module, StringRef name,
-    const std::vector<std::complex<double>> &values) {
+    const SmallVectorImpl<std::complex<double>> &values) {
   return buildVectorOfConstantElements(loc, module, name, values, *this,
                                        ComplexType::get(getF64Type()));
 }
 
 cc::GlobalOp IRBuilder::genVectorOfConstants(
     Location loc, ModuleOp module, StringRef name,
-    const std::vector<std::complex<float>> &values) {
+    const SmallVectorImpl<std::complex<float>> &values) {
   return buildVectorOfConstantElements(loc, module, name, values, *this,
                                        ComplexType::get(getF32Type()));
 }
 
 cc::GlobalOp
 IRBuilder::genVectorOfConstants(Location loc, ModuleOp module, StringRef name,
-                                const std::vector<double> &values) {
+                                const SmallVectorImpl<double> &values) {
   return buildVectorOfConstantElements(loc, module, name, values, *this,
                                        getF64Type());
 }
 
-cc::GlobalOp IRBuilder::genVectorOfConstants(Location loc, ModuleOp module,
-                                             StringRef name,
-                                             const std::vector<float> &values) {
+cc::GlobalOp
+IRBuilder::genVectorOfConstants(Location loc, ModuleOp module, StringRef name,
+                                const SmallVectorImpl<float> &values) {
   return buildVectorOfConstantElements(loc, module, name, values, *this,
                                        getF32Type());
 }
 
 cc::GlobalOp
 IRBuilder::genVectorOfConstants(Location loc, ModuleOp module, StringRef name,
-                                const std::vector<std::int64_t> &values) {
+                                const SmallVectorImpl<std::int64_t> &values) {
   return buildVectorOfConstantElements(loc, module, name, values, *this,
                                        getI64Type());
 }
 
 cc::GlobalOp
 IRBuilder::genVectorOfConstants(Location loc, ModuleOp module, StringRef name,
-                                const std::vector<std::int32_t> &values) {
+                                const SmallVectorImpl<std::int32_t> &values) {
   return buildVectorOfConstantElements(loc, module, name, values, *this,
                                        getI32Type());
 }
 
 cc::GlobalOp
 IRBuilder::genVectorOfConstants(Location loc, ModuleOp module, StringRef name,
-                                const std::vector<std::int16_t> &values) {
+                                const SmallVectorImpl<std::int16_t> &values) {
   return buildVectorOfConstantElements(loc, module, name, values, *this,
                                        getI16Type());
 }
 
 cc::GlobalOp
 IRBuilder::genVectorOfConstants(Location loc, ModuleOp module, StringRef name,
-                                const std::vector<std::int8_t> &values) {
+                                const SmallVectorImpl<std::int8_t> &values) {
   return buildVectorOfConstantElements(loc, module, name, values, *this,
                                        getI8Type());
 }
 
-cc::GlobalOp IRBuilder::genVectorOfConstants(Location loc, ModuleOp module,
-                                             StringRef name,
-                                             const std::vector<bool> &values) {
+cc::GlobalOp
+IRBuilder::genVectorOfConstants(Location loc, ModuleOp module, StringRef name,
+                                const SmallVectorImpl<bool> &values) {
   return buildVectorOfConstantElements(loc, module, name, values, *this,
                                        getI1Type());
 }
diff --git a/lib/Optimizer/CodeGen/Pipelines.cpp b/lib/Optimizer/CodeGen/Pipelines.cpp
index 247805fd2ba..9a46b0c5b0a 100644
--- a/lib/Optimizer/CodeGen/Pipelines.cpp
+++ b/lib/Optimizer/CodeGen/Pipelines.cpp
@@ -47,7 +47,8 @@ void cudaq::opt::addPipelineTranslateToOpenQASM(PassManager &pm) {
   pm.addNestedPass<func::FuncOp>(createClassicalMemToReg());
   pm.addPass(createLoopUnroll());
   pm.addPass(createCanonicalizerPass());
-  pm.addPass(createLiftArrayAlloc());
+  pm.addNestedPass<func::FuncOp>(createLiftArrayAlloc());
+  pm.addPass(createGlobalizeArrayValues());
   pm.addPass(createStatePreparation());
 }
 
diff --git a/lib/Optimizer/Transforms/CMakeLists.txt b/lib/Optimizer/Transforms/CMakeLists.txt
index a6b94d9a596..d906b749e66 100644
--- a/lib/Optimizer/Transforms/CMakeLists.txt
+++ b/lib/Optimizer/Transforms/CMakeLists.txt
@@ -30,6 +30,7 @@ add_cudaq_library(OptTransforms
   GenKernelExecution.cpp
   GenDeviceCodeLoader.cpp
   GetConcreteMatrix.cpp
+  GlobalizeArrayValues.cpp
   LambdaLifting.cpp
   LiftArrayAlloc.cpp
   LinearCtrlRelations.cpp
diff --git a/lib/Optimizer/Transforms/ConstPropComplex.cpp b/lib/Optimizer/Transforms/ConstPropComplex.cpp
index 939634bf836..3434fc5ff5c 100644
--- a/lib/Optimizer/Transforms/ConstPropComplex.cpp
+++ b/lib/Optimizer/Transforms/ConstPropComplex.cpp
@@ -172,29 +172,22 @@ class ConstPropComplexPass
 
   void runOnOperation() override {
     auto *ctx = &getContext();
-    auto module = getOperation();
-    for (Operation &op : *module.getBody()) {
-      auto func = dyn_cast<func::FuncOp>(op);
-      if (!func)
-        continue;
-      DominanceInfo domInfo(func);
-      std::string funcName = func.getName().str();
-      RewritePatternSet patterns(ctx);
-      patterns
-          .insert<ComplexCreatePattern, FloatCastPattern, FloatExtendPattern,
-                  FloatTruncatePattern, ComplexRePattern, ComplexImPattern>(
-              ctx);
-
-      LLVM_DEBUG(llvm::dbgs()
-                 << "Before lifting constant array: " << func << '\n');
-
-      if (failed(applyPatternsAndFoldGreedily(func.getOperation(),
-                                              std::move(patterns))))
-        signalPassFailure();
-
-      LLVM_DEBUG(llvm::dbgs()
-                 << "After lifting constant array: " << func << '\n');
-    }
+    auto func = getOperation();
+    DominanceInfo domInfo(func);
+    RewritePatternSet patterns(ctx);
+    patterns.insert<ComplexCreatePattern, FloatCastPattern, FloatExtendPattern,
+                    FloatTruncatePattern, ComplexRePattern, ComplexImPattern>(
+        ctx);
+
+    LLVM_DEBUG(llvm::dbgs()
+               << "Before lifting constant array: " << func << '\n');
+
+    if (failed(applyPatternsAndFoldGreedily(func.getOperation(),
+                                            std::move(patterns))))
+      signalPassFailure();
+
+    LLVM_DEBUG(llvm::dbgs()
+               << "After lifting constant array: " << func << '\n');
   }
 };
 } // namespace
diff --git a/lib/Optimizer/Transforms/GlobalizeArrayValues.cpp b/lib/Optimizer/Transforms/GlobalizeArrayValues.cpp
new file mode 100644
index 00000000000..41fe47445d7
--- /dev/null
+++ b/lib/Optimizer/Transforms/GlobalizeArrayValues.cpp
@@ -0,0 +1,138 @@
+/*******************************************************************************
+ * Copyright (c) 2022 - 2024 NVIDIA Corporation & Affiliates.                  *
+ * All rights reserved.                                                        *
+ *                                                                             *
+ * This source code and the accompanying materials are made available under    *
+ * the terms of the Apache License 2.0 which accompanies this distribution.    *
+ ******************************************************************************/
+
+#include "PassDetails.h"
+#include "cudaq/Optimizer/Builder/Intrinsics.h"
+#include "cudaq/Optimizer/Dialect/CC/CCOps.h"
+#include "cudaq/Optimizer/Dialect/Quake/QuakeOps.h"
+#include "cudaq/Optimizer/Transforms/Passes.h"
+#include "mlir/Dialect/Complex/IR/Complex.h"
+#include "mlir/IR/BuiltinOps.h"
+#include "mlir/IR/Dominance.h"
+#include "mlir/IR/PatternMatch.h"
+#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
+#include "mlir/Transforms/Passes.h"
+
+namespace cudaq::opt {
+#define GEN_PASS_DEF_GLOBALIZEARRAYVALUES
+#include "cudaq/Optimizer/Transforms/Passes.h.inc"
+} // namespace cudaq::opt
+
+#define DEBUG_TYPE "globalize-array-values"
+
+using namespace mlir;
+
+/// Unwrap each \c B attribute in \p seq into its underlying \c A value.
+template <typename A, typename B>
+static SmallVector<A> conversion(ArrayAttr seq) {
+  SmallVector<A> result;
+  result.reserve(seq.size());
+  for (Attribute v : seq)
+    result.emplace_back(cast<B>(v).getValue());
+  return result;
+}
+/// Complex case: gather the elements of \p seq as (real, imag) APFloat pairs.
+template <>
+SmallVector<std::complex<APFloat>>
+conversion<std::complex<APFloat>, ArrayAttr>(ArrayAttr seq) {
+  SmallVector<std::complex<APFloat>> pairs;
+  // Each element is itself an ArrayAttr: entry 0 is real, entry 1 is imag.
+  for (Attribute elem : seq)
+    pairs.emplace_back(cast<FloatAttr>(cast<ArrayAttr>(elem)[0]).getValue(),
+                       cast<FloatAttr>(cast<ArrayAttr>(elem)[1]).getValue());
+  return pairs;
+}
+
+/// Create a constant global named \p globalName holding \p arrAttr's values.
+static LogicalResult
+convertArrayAttrToGlobalConstant(MLIRContext *ctx, Location loc,
+                                 ArrayAttr arrAttr, ModuleOp module,
+                                 StringRef globalName, Type eleTy) {
+  auto tensorTy = RankedTensorType::get(arrAttr.size(), eleTy);
+  DenseElementsAttr dense;
+  if (isa<ComplexType>(eleTy)) {
+    auto vals = conversion<std::complex<APFloat>, ArrayAttr>(arrAttr);
+    dense = DenseElementsAttr::get(tensorTy, vals);
+  } else if (isa<FloatType>(eleTy)) {
+    auto vals = conversion<APFloat, FloatAttr>(arrAttr);
+    dense = DenseElementsAttr::get(tensorTy, vals);
+  } else if (isa<IntegerType>(eleTy)) {
+    auto vals = conversion<APInt, IntegerAttr>(arrAttr);
+    dense = DenseElementsAttr::get(tensorTy, vals);
+  } else {
+    return failure(); // unsupported element type; no global is created
+  }
+  cudaq::IRBuilder irBuilder(ctx);
+  irBuilder.genVectorOfConstants(loc, module, globalName, dense, eleTy);
+  return success();
+}
+
+namespace {
+struct ConstantArrayPattern
+    : public OpRewritePattern<cudaq::cc::ConstantArrayOp> {
+  explicit ConstantArrayPattern(MLIRContext *ctx, ModuleOp module,
+                                unsigned &counter)
+      : OpRewritePattern{ctx}, module{module}, counter{counter} {}
+
+  // Turn `store const_array, alloca` into the address of a constant global.
+  LogicalResult matchAndRewrite(cudaq::cc::ConstantArrayOp conarr,
+                                PatternRewriter &rewriter) const override {
+    if (!conarr->hasOneUse())
+      return failure();
+    auto store = dyn_cast<cudaq::cc::StoreOp>(*conarr->getUsers().begin());
+    if (!store)
+      return failure();
+    auto alloca = store.getPtrvalue().getDefiningOp<cudaq::cc::AllocaOp>();
+    auto func = conarr->getParentOfType<func::FuncOp>();
+    if (!alloca || !func)
+      return failure();
+    std::string globalName; // must be unused: a same-named global is reused
+    do {
+      globalName = func.getName().str() + ".rodata_" + std::to_string(counter++);
+    } while (SymbolTable::lookupSymbolIn(module, globalName));
+    auto eleTy = cast<cudaq::cc::ArrayType>(conarr.getType()).getElementType();
+    if (failed(convertArrayAttrToGlobalConstant(
+            rewriter.getContext(), conarr.getLoc(), conarr.getConstantValues(),
+            module, globalName, eleTy)))
+      return failure();
+    rewriter.replaceOpWithNewOp<cudaq::cc::AddressOfOp>(
+        alloca, alloca.getType(), globalName);
+    rewriter.eraseOp(store);
+    rewriter.eraseOp(conarr);
+    return success();
+  }
+
+  ModuleOp module;
+  unsigned &counter;
+};
+
+class GlobalizeArrayValuesPass
+    : public cudaq::opt::impl::GlobalizeArrayValuesBase<
+          GlobalizeArrayValuesPass> {
+public:
+  using GlobalizeArrayValuesBase::GlobalizeArrayValuesBase;
+
+  void runOnOperation() override {
+    ModuleOp mod = getOperation();
+    MLIRContext *context = &getContext();
+
+    // Unchecked assumption: every ConstArrayOp was produced by LiftArrayAlloc,
+    // so its backing store has already been verified as never written to.
+    // The counter numbers the globals; the pattern holds it by reference.
+    unsigned nextId = 0;
+    RewritePatternSet rewrites(context);
+    rewrites.insert<ConstantArrayPattern>(context, mod, nextId);
+    LLVM_DEBUG(llvm::dbgs() << "Before globalizing array values:\n"
+                            << mod << '\n');
+    if (failed(applyPatternsAndFoldGreedily(mod, std::move(rewrites))))
+      signalPassFailure();
+    LLVM_DEBUG(llvm::dbgs() << "After globalizing array values:\n"
+                            << mod << '\n');
+  }
+};
+} // namespace
diff --git a/lib/Optimizer/Transforms/LiftArrayAlloc.cpp b/lib/Optimizer/Transforms/LiftArrayAlloc.cpp
index 689be49998d..ba36d65d0eb 100644
--- a/lib/Optimizer/Transforms/LiftArrayAlloc.cpp
+++ b/lib/Optimizer/Transforms/LiftArrayAlloc.cpp
@@ -27,80 +27,16 @@ namespace cudaq::opt {
 
 using namespace mlir;
 
-namespace {
-template <typename A>
-std::vector<A> readConstantValues(SmallVectorImpl<Attribute> &vec, Type eleTy) {
-  std::vector<A> result;
-  for (auto a : vec) {
-    if constexpr (std::is_same_v<A, std::complex<double>>) {
-      auto v = cast<ArrayAttr>(a);
-      result.emplace_back(cast<FloatAttr>(v[0]).getValue().convertToDouble(),
-                          cast<FloatAttr>(v[1]).getValue().convertToDouble());
-    } else if constexpr (std::is_same_v<A, std::complex<float>>) {
-      auto v = cast<ArrayAttr>(a);
-      result.emplace_back(cast<FloatAttr>(v[0]).getValue().convertToFloat(),
-                          cast<FloatAttr>(v[1]).getValue().convertToFloat());
-    } else if constexpr (std::is_same_v<A, double>) {
-      auto v = cast<FloatAttr>(a);
-      result.emplace_back(v.getValue().convertToDouble());
-    } else if constexpr (std::is_same_v<A, float>) {
-      auto v = cast<FloatAttr>(a);
-      result.emplace_back(v.getValue().convertToFloat());
-    }
-  }
-  return result;
-}
-
-LogicalResult genVectorOfConstantsFromAttributes(cudaq::IRBuilder irBuilder,
-                                                 Location loc, ModuleOp module,
-                                                 StringRef name,
-                                                 SmallVector<Attribute> &values,
-                                                 Type eleTy) {
-
-  if (auto cTy = dyn_cast<ComplexType>(eleTy)) {
-    auto floatTy = cTy.getElementType();
-    if (floatTy == irBuilder.getF64Type()) {
-      auto vals = readConstantValues<std::complex<double>>(values, cTy);
-      if (vals.size() == values.size()) {
-        irBuilder.genVectorOfConstants(loc, module, name, vals);
-        return success();
-      }
-    } else if (floatTy == irBuilder.getF32Type()) {
-      auto vals = readConstantValues<std::complex<float>>(values, cTy);
-      if (vals.size() == values.size()) {
-        irBuilder.genVectorOfConstants(loc, module, name, vals);
-        return success();
-      }
-    }
-  } else if (eleTy == irBuilder.getF64Type()) {
-    auto vals = readConstantValues<double>(values, eleTy);
-    if (vals.size() == values.size()) {
-      irBuilder.genVectorOfConstants(loc, module, name, vals);
-      return success();
-    }
-  } else if (eleTy == irBuilder.getF32Type()) {
-    auto vals = readConstantValues<float>(values, eleTy);
-    if (vals.size() == values.size()) {
-      irBuilder.genVectorOfConstants(loc, module, name, vals);
-      return success();
-    }
-  }
-  return failure();
-}
-} // namespace
-
 namespace {
 class AllocaPattern : public OpRewritePattern<cudaq::cc::AllocaOp> {
 public:
-  explicit AllocaPattern(MLIRContext *ctx, DominanceInfo &di,
-                         const std::string &fn, ModuleOp m)
-      : OpRewritePattern(ctx), dom(di), funcName(fn), module(m) {}
+  explicit AllocaPattern(MLIRContext *ctx, DominanceInfo &di, StringRef fn)
+      : OpRewritePattern(ctx), dom(di), funcName(fn) {}
 
   LogicalResult matchAndRewrite(cudaq::cc::AllocaOp alloc,
                                 PatternRewriter &rewriter) const override {
     SmallVector<Operation *> stores;
-    bool toGlobal = false;
-    if (!isGoodCandidate(alloc, stores, dom, toGlobal))
+    if (!isGoodCandidate(alloc, stores, dom))
       return failure();
 
     LLVM_DEBUG(llvm::dbgs() << "Candidate was found\n");
@@ -124,79 +60,58 @@ class AllocaPattern : public OpRewritePattern<cudaq::cc::AllocaOp> {
     auto eleTy = arrTy.getElementType();
     auto valuesAttr = rewriter.getArrayAttr(values);
     auto loc = alloc.getLoc();
-    Value conArr;
-    Value conGlobal;
-    if (toGlobal) {
-      static unsigned counter = 0;
-      auto ptrTy = cudaq::cc::PointerType::get(arrTy);
-      // Build a new name based on the kernel name.
-      std::string name = funcName + ".rodata_" + std::to_string(counter++);
-      cudaq::IRBuilder irBuilder(rewriter.getContext());
-      if (succeeded(genVectorOfConstantsFromAttributes(irBuilder, loc, module,
-                                                       name, values, eleTy))) {
-        conGlobal = rewriter.create<cudaq::cc::AddressOfOp>(loc, ptrTy, name);
-        conArr = rewriter.create<cudaq::cc::LoadOp>(loc, arrTy, conGlobal);
-      } else {
-        conArr =
-            rewriter.create<cudaq::cc::ConstantArrayOp>(loc, arrTy, valuesAttr);
-      }
-    } else {
-      conArr =
-          rewriter.create<cudaq::cc::ConstantArrayOp>(loc, arrTy, valuesAttr);
-    }
+    Value conArr =
+        rewriter.create<cudaq::cc::ConstantArrayOp>(loc, arrTy, valuesAttr);
 
     assert(conArr && "must have created the constant array");
     LLVM_DEBUG(llvm::dbgs() << "constant array is:\n" << conArr << '\n');
     bool cannotEraseAlloc = false;
 
+    // Collect all the stores, casts, and compute_ptr to be erased safely and in
+    // topological order.
+    SmallVector<Operation *> opsToErase;
+    auto insertOpToErase = [&](Operation *op) {
+      auto iter = std::find(opsToErase.begin(), opsToErase.end(), op);
+      if (iter == opsToErase.end())
+        opsToErase.push_back(op);
+    };
+
     // Rewalk all the uses of alloc, u, which must be cc.cast or cc.compute_ptr.
-    // For each,u, remove a store and replace a load with a cc.extract_value.
-    for (auto &use : alloc->getUses()) {
-      auto *user = use.getOwner();
+    // For each u remove a store and replace a load with a cc.extract_value.
+    for (auto *user : alloc->getUsers()) {
+      if (!user)
+        continue;
       std::int32_t offset = 0;
       if (auto cptr = dyn_cast<cudaq::cc::ComputePtrOp>(user))
         offset = cptr.getRawConstantIndices()[0];
       bool isLive = false;
-      for (auto &useuse : user->getUses()) {
-        auto *useuser = useuse.getOwner();
-        if (auto ist = dyn_cast<quake::InitializeStateOp>(useuser)) {
-          rewriter.setInsertionPointAfter(useuser);
-          LLVM_DEBUG(llvm::dbgs() << "replaced init_state\n");
-          assert(conGlobal && "global must be defined");
-          rewriter.replaceOpWithNewOp<quake::InitializeStateOp>(
-              ist, ist.getType(), ist.getTargets(), conGlobal);
-          continue;
-        }
-        if (auto load = dyn_cast<cudaq::cc::LoadOp>(useuser)) {
-          rewriter.setInsertionPointAfter(useuser);
-          LLVM_DEBUG(llvm::dbgs() << "replaced load\n");
-          rewriter.replaceOpWithNewOp<cudaq::cc::ExtractValueOp>(
-              load, eleTy, conArr,
-              ArrayRef<cudaq::cc::ExtractValueArg>{offset});
-          continue;
-        }
-        if (isa<cudaq::cc::StoreOp>(useuser))
-          rewriter.eraseOp(useuser);
-        LLVM_DEBUG(llvm::dbgs() << "alloc is live\n");
+      if (!isa<cudaq::cc::CastOp, cudaq::cc::ComputePtrOp>(user)) {
         cannotEraseAlloc = isLive = true;
-      }
-      if (auto ist = dyn_cast<quake::InitializeStateOp>(user)) {
-        rewriter.setInsertionPointAfter(user);
-        LLVM_DEBUG(llvm::dbgs() << "replaced init_state\n");
-        assert(conGlobal && "global must be defined");
-        rewriter.replaceOpWithNewOp<quake::InitializeStateOp>(
-            ist, ist.getType(), ist.getTargets(), conGlobal);
-        continue;
+      } else {
+        for (auto *useuser : user->getUsers()) {
+          if (auto load = dyn_cast<cudaq::cc::LoadOp>(useuser)) {
+            rewriter.setInsertionPointAfter(useuser);
+            LLVM_DEBUG(llvm::dbgs() << "replaced load\n");
+            rewriter.replaceOpWithNewOp<cudaq::cc::ExtractValueOp>(
+                load, eleTy, conArr,
+                ArrayRef<cudaq::cc::ExtractValueArg>{offset});
+            continue;
+          }
+          if (isa<cudaq::cc::StoreOp>(useuser)) {
+            insertOpToErase(useuser);
+            continue;
+          }
+          LLVM_DEBUG(llvm::dbgs() << "alloc is live\n");
+          cannotEraseAlloc = isLive = true;
+        }
       }
       if (!isLive)
-        rewriter.eraseOp(user);
+        insertOpToErase(user);
     }
 
-    if (toGlobal && conGlobal) {
-      rewriter.setInsertionPointAfter(alloc);
-      rewriter.replaceOp(alloc, conGlobal);
-      return success();
-    }
+    for (auto *e : opsToErase)
+      rewriter.eraseOp(e);
+
     if (cannotEraseAlloc) {
       rewriter.setInsertionPointAfter(alloc);
       rewriter.create<cudaq::cc::StoreOp>(loc, conArr, alloc);
@@ -210,14 +125,11 @@ class AllocaPattern : public OpRewritePattern<cudaq::cc::AllocaOp> {
   // array value. \p scoreboard is a vector of store operations. Each element of
   // the allocated array must be written to exactly 1 time, and the scoreboard
   // is used to track these stores. \p dom is the dominance info for this
-  // function (to ensure the stores happen before uses). \p toGlobal is returned
-  // as a result. If it is `true`, then the constant array shall be lowered to a
-  // global variable rather than an inline constant array.
+  // function (to ensure the stores happen before uses).
   static bool isGoodCandidate(cudaq::cc::AllocaOp alloc,
                               SmallVectorImpl<Operation *> &scoreboard,
-                              DominanceInfo &dom, bool &toGlobal) {
+                              DominanceInfo &dom) {
     LLVM_DEBUG(llvm::dbgs() << "checking candidate\n");
-    toGlobal = false;
     if (alloc.getSeqSize())
       return false;
     auto arrTy = dyn_cast<cudaq::cc::ArrayType>(alloc.getElementType());
@@ -262,7 +174,6 @@ class AllocaPattern : public OpRewritePattern<cudaq::cc::AllocaOp> {
         }
         if (isa<quake::InitializeStateOp>(u)) {
           toGlobalUses.push_back(u);
-          toGlobal = true;
           continue;
         }
         if (isa<cudaq::cc::LoadOp>(u)) {
@@ -311,7 +222,6 @@ class AllocaPattern : public OpRewritePattern<cudaq::cc::AllocaOp> {
             Operation *u = use.getOwner();
             if (isa_and_present<quake::InitializeStateOp>(u)) {
               toGlobalUses.push_back(op);
-              toGlobal = true;
               continue;
             }
           }
@@ -319,17 +229,14 @@ class AllocaPattern : public OpRewritePattern<cudaq::cc::AllocaOp> {
         }
         LLVM_DEBUG(llvm::dbgs() << "unexpected cast: " << *op << '\n');
         toGlobalUses.push_back(op);
-        toGlobal = true;
         continue;
       }
       if (isa<quake::InitializeStateOp>(op)) {
         toGlobalUses.push_back(op);
-        toGlobal = true;
         continue;
       }
       LLVM_DEBUG(llvm::dbgs() << "unexpected use: " << *op << '\n');
       toGlobalUses.push_back(op);
-      toGlobal = true;
     }
 
     bool ok = std::all_of(scoreboard.begin(), scoreboard.end(),
@@ -365,8 +272,7 @@ class AllocaPattern : public OpRewritePattern<cudaq::cc::AllocaOp> {
   }
 
   DominanceInfo &dom;
-  const std::string &funcName;
-  mutable ModuleOp module;
+  StringRef funcName;
 };
 
 class LiftArrayAllocPass
@@ -376,26 +282,20 @@ class LiftArrayAllocPass
 
   void runOnOperation() override {
     auto *ctx = &getContext();
-    auto module = getOperation();
-    for (Operation &op : *module.getBody()) {
-      auto func = dyn_cast<func::FuncOp>(op);
-      if (!func)
-        continue;
-      DominanceInfo domInfo(func);
-      std::string funcName = func.getName().str();
-      RewritePatternSet patterns(ctx);
-      patterns.insert<AllocaPattern>(ctx, domInfo, funcName, module);
+    auto func = getOperation();
+    DominanceInfo domInfo(func);
+    StringRef funcName = func.getName();
+    RewritePatternSet patterns(ctx);
+    patterns.insert<AllocaPattern>(ctx, domInfo, funcName);
 
-      LLVM_DEBUG(llvm::dbgs()
-                 << "Before lifting constant array: " << func << '\n');
+    LLVM_DEBUG(llvm::dbgs()
+               << "Before lifting constant array: " << func << '\n');
 
-      if (failed(applyPatternsAndFoldGreedily(func.getOperation(),
-                                              std::move(patterns))))
-        signalPassFailure();
+    if (failed(applyPatternsAndFoldGreedily(func, std::move(patterns))))
+      signalPassFailure();
 
-      LLVM_DEBUG(llvm::dbgs()
-                 << "After lifting constant array: " << func << '\n');
-    }
+    LLVM_DEBUG(llvm::dbgs()
+               << "After lifting constant array: " << func << '\n');
   }
 };
 } // namespace
diff --git a/lib/Optimizer/Transforms/QuakeSynthesizer.cpp b/lib/Optimizer/Transforms/QuakeSynthesizer.cpp
index adbe9df29e0..16d874559fe 100644
--- a/lib/Optimizer/Transforms/QuakeSynthesizer.cpp
+++ b/lib/Optimizer/Transforms/QuakeSynthesizer.cpp
@@ -118,7 +118,7 @@ static bool hasInitStateUse(BlockArgument argument) {
 template <typename ELETY, typename T, typename ATTR, typename MAKER>
 LogicalResult
 synthesizeVectorArgument(OpBuilder &builder, ModuleOp module, unsigned &counter,
-                         BlockArgument argument, std::vector<T> &vec,
+                         BlockArgument argument, SmallVectorImpl<T> &vec,
                          ATTR arrayAttr, MAKER makeElementValue) {
   auto *ctx = builder.getContext();
   auto argTy = argument.getType();
@@ -273,8 +273,8 @@ synthesizeVectorArgument(OpBuilder &builder, ModuleOp module, unsigned &counter,
 }
 
 template <typename A>
-std::vector<std::int32_t> asI32(const std::vector<A> &v) {
-  std::vector<std::int32_t> result(v.size());
+SmallVector<std::int32_t> asI32(const SmallVectorImpl<A> &v) {
+  SmallVector<std::int32_t> result(v.size());
   for (auto iter : llvm::enumerate(v))
     result[iter.index()] = static_cast<std::int32_t>(iter.value());
   return result;
@@ -284,51 +284,47 @@ std::vector<std::int32_t> asI32(const std::vector<A> &v) {
 // the output of the constant array op.
 static LogicalResult
 synthesizeVectorArgument(OpBuilder &builder, ModuleOp module, unsigned &counter,
-                         BlockArgument argument, std::vector<bool> &vec) {
+                         BlockArgument argument, SmallVectorImpl<bool> &vec) {
   auto arrayAttr = builder.getI32ArrayAttr(asI32(vec));
   return synthesizeVectorArgument<IntegerType>(builder, module, counter,
                                                argument, vec, arrayAttr,
                                                makeIntegerElement<bool>);
 }
 
-static LogicalResult synthesizeVectorArgument(OpBuilder &builder,
-                                              ModuleOp module,
-                                              unsigned &counter,
-                                              BlockArgument argument,
-                                              std::vector<std::int8_t> &vec) {
+static LogicalResult
+synthesizeVectorArgument(OpBuilder &builder, ModuleOp module, unsigned &counter,
+                         BlockArgument argument,
+                         SmallVectorImpl<std::int8_t> &vec) {
   auto arrayAttr = builder.getI32ArrayAttr(asI32(vec));
   return synthesizeVectorArgument<IntegerType>(builder, module, counter,
                                                argument, vec, arrayAttr,
                                                makeIntegerElement<std::int8_t>);
 }
 
-static LogicalResult synthesizeVectorArgument(OpBuilder &builder,
-                                              ModuleOp module,
-                                              unsigned &counter,
-                                              BlockArgument argument,
-                                              std::vector<std::int16_t> &vec) {
+static LogicalResult
+synthesizeVectorArgument(OpBuilder &builder, ModuleOp module, unsigned &counter,
+                         BlockArgument argument,
+                         SmallVectorImpl<std::int16_t> &vec) {
   auto arrayAttr = builder.getI32ArrayAttr(asI32(vec));
   return synthesizeVectorArgument<IntegerType>(
       builder, module, counter, argument, vec, arrayAttr,
       makeIntegerElement<std::int16_t>);
 }
 
-static LogicalResult synthesizeVectorArgument(OpBuilder &builder,
-                                              ModuleOp module,
-                                              unsigned &counter,
-                                              BlockArgument argument,
-                                              std::vector<std::int32_t> &vec) {
+static LogicalResult
+synthesizeVectorArgument(OpBuilder &builder, ModuleOp module, unsigned &counter,
+                         BlockArgument argument,
+                         SmallVectorImpl<std::int32_t> &vec) {
   auto arrayAttr = builder.getI32ArrayAttr(vec);
   return synthesizeVectorArgument<IntegerType>(
       builder, module, counter, argument, vec, arrayAttr,
       makeIntegerElement<std::int32_t>);
 }
 
-static LogicalResult synthesizeVectorArgument(OpBuilder &builder,
-                                              ModuleOp module,
-                                              unsigned &counter,
-                                              BlockArgument argument,
-                                              std::vector<std::int64_t> &vec) {
+static LogicalResult
+synthesizeVectorArgument(OpBuilder &builder, ModuleOp module, unsigned &counter,
+                         BlockArgument argument,
+                         SmallVectorImpl<std::int64_t> &vec) {
   auto arrayAttr = builder.getI64ArrayAttr(vec);
   return synthesizeVectorArgument<IntegerType>(
       builder, module, counter, argument, vec, arrayAttr,
@@ -337,7 +333,7 @@ static LogicalResult synthesizeVectorArgument(OpBuilder &builder,
 
 static LogicalResult
 synthesizeVectorArgument(OpBuilder &builder, ModuleOp module, unsigned &counter,
-                         BlockArgument argument, std::vector<float> &vec) {
+                         BlockArgument argument, SmallVectorImpl<float> &vec) {
   auto arrayAttr = builder.getF32ArrayAttr(vec);
   return synthesizeVectorArgument<FloatType>(builder, module, counter, argument,
                                              vec, arrayAttr,
@@ -346,7 +342,7 @@ synthesizeVectorArgument(OpBuilder &builder, ModuleOp module, unsigned &counter,
 
 static LogicalResult
 synthesizeVectorArgument(OpBuilder &builder, ModuleOp module, unsigned &counter,
-                         BlockArgument argument, std::vector<double> &vec) {
+                         BlockArgument argument, SmallVectorImpl<double> &vec) {
   auto arrayAttr = builder.getF64ArrayAttr(vec);
   return synthesizeVectorArgument<FloatType>(builder, module, counter, argument,
                                              vec, arrayAttr,
@@ -356,8 +352,8 @@ synthesizeVectorArgument(OpBuilder &builder, ModuleOp module, unsigned &counter,
 static LogicalResult
 synthesizeVectorArgument(OpBuilder &builder, ModuleOp module, unsigned &counter,
                          BlockArgument argument,
-                         std::vector<std::complex<float>> &vec) {
-  std::vector<float> vec2;
+                         SmallVectorImpl<std::complex<float>> &vec) {
+  SmallVector<float> vec2;
   for (auto c : vec) {
     vec2.push_back(c.real());
     vec2.push_back(c.imag());
@@ -371,8 +367,8 @@ synthesizeVectorArgument(OpBuilder &builder, ModuleOp module, unsigned &counter,
 static LogicalResult
 synthesizeVectorArgument(OpBuilder &builder, ModuleOp module, unsigned &counter,
                          BlockArgument argument,
-                         std::vector<std::complex<double>> &vec) {
-  std::vector<double> vec2;
+                         SmallVectorImpl<std::complex<double>> &vec) {
+  SmallVector<double> vec2;
   for (auto c : vec) {
     vec2.push_back(c.real());
     vec2.push_back(c.imag());
@@ -410,7 +406,7 @@ class QuakeSynthesizer
 
   mlir::ModuleOp getModule() { return getOperation(); }
 
-  std::pair<std::size_t, std::vector<std::size_t>>
+  std::pair<std::size_t, SmallVector<std::size_t>>
   getTargetLayout(FunctionType funcTy) {
     auto bufferTy =
         cudaq::opt::factory::buildInvokeStructType(funcTy, startingArgIdx);
@@ -429,7 +425,7 @@ class QuakeSynthesizer
         cast<llvm::StructType>(translator.translateType(llvmDialectTy));
     auto *layout = dataLayout.getStructLayout(llvmStructTy);
     auto strSize = layout->getSizeInBytes();
-    std::vector<std::size_t> fieldOffsets;
+    SmallVector<std::size_t> fieldOffsets;
     for (std::size_t i = 0, I = bufferTy.getMembers().size(); i != I; ++i)
       fieldOffsets.emplace_back(layout->getElementOffset(i));
     return {strSize, fieldOffsets};
@@ -462,7 +458,7 @@ class QuakeSynthesizer
     auto arguments = funcOp.getArguments();
     auto structLayout = getTargetLayout(funcOp.getFunctionType());
     // Keep track of the stdVec sizes.
-    std::vector<std::tuple<std::size_t, Type, std::uint64_t>> stdVecInfo;
+    SmallVector<std::tuple<std::size_t, Type, std::uint64_t>> stdVecInfo;
 
     for (std::size_t argNum = startingArgIdx, end = arguments.size();
          argNum < end; argNum++) {
@@ -656,7 +652,7 @@ class QuakeSynthesizer
       }
       auto doVector = [&]<typename T>(T) {
         auto *ptr = reinterpret_cast<const T *>(bufferAppendix);
-        std::vector<T> v(ptr, ptr + vecLength);
+        SmallVector<T> v(ptr, ptr + vecLength);
         if (failed(synthesizeVectorArgument(builder, module, counter,
                                             arguments[idx], v)))
           funcOp.emitOpError("synthesis failed for vector<T>");
diff --git a/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp b/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp
index 0d3f44c8327..475093ba24d 100644
--- a/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp
+++ b/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp
@@ -533,23 +533,24 @@ MlirModule synthesizeKernel(const std::string &name, MlirModule module,
   PassManager pm(context);
   pm.addNestedPass<func::FuncOp>(
       cudaq::opt::createArgumentSynthesisPass(kernels, substs));
-  pm.addPass(createCanonicalizerPass());
+  pm.addNestedPass<func::FuncOp>(createCanonicalizerPass());
 
   // Run state preparation for quantum devices (or their emulation) only.
   // Simulators have direct implementation of state initialization
   // in their runtime.
   if (!isSimulator) {
-    pm.addPass(cudaq::opt::createConstPropComplex());
-    pm.addPass(cudaq::opt::createLiftArrayAlloc());
+    pm.addNestedPass<func::FuncOp>(cudaq::opt::createConstPropComplex());
+    pm.addNestedPass<func::FuncOp>(cudaq::opt::createLiftArrayAlloc());
+    pm.addPass(cudaq::opt::createGlobalizeArrayValues());
     pm.addPass(cudaq::opt::createStatePreparation());
   }
-  pm.addPass(createCanonicalizerPass());
-  pm.addPass(cudaq::opt::createExpandMeasurementsPass());
+  pm.addNestedPass<func::FuncOp>(createCanonicalizerPass());
+  pm.addNestedPass<func::FuncOp>(cudaq::opt::createExpandMeasurementsPass());
   pm.addNestedPass<func::FuncOp>(cudaq::opt::createClassicalMemToReg());
-  pm.addPass(createCanonicalizerPass());
-  pm.addPass(cudaq::opt::createLoopNormalize());
-  pm.addPass(cudaq::opt::createLoopUnroll());
-  pm.addPass(createCanonicalizerPass());
+  pm.addNestedPass<func::FuncOp>(createCanonicalizerPass());
+  pm.addNestedPass<func::FuncOp>(cudaq::opt::createLoopNormalize());
+  pm.addNestedPass<func::FuncOp>(cudaq::opt::createLoopUnroll());
+  pm.addNestedPass<func::FuncOp>(createCanonicalizerPass());
   DefaultTimingManager tm;
   tm.setEnabled(cudaq::isTimingTagEnabled(cudaq::TIMING_JIT_PASSES));
   auto timingScope = tm.getRootScope(); // starts the timer
diff --git a/python/runtime/mlir/py_register_dialects.cpp b/python/runtime/mlir/py_register_dialects.cpp
index 3dd5a66ff30..9c0c4f2985e 100644
--- a/python/runtime/mlir/py_register_dialects.cpp
+++ b/python/runtime/mlir/py_register_dialects.cpp
@@ -318,12 +318,15 @@ void bindRegisterDialects(py::module &mod) {
     mlirContext->loadAllAvailableDialects();
   });
 
-  mod.def("gen_vector_of_complex_constant",
-          [](MlirLocation loc, MlirModule module, std::string name,
-             const std::vector<std::complex<double>> &values) {
-            ModuleOp modOp = unwrap(module);
-            cudaq::IRBuilder builder = IRBuilder::atBlockEnd(modOp.getBody());
-            builder.genVectorOfConstants(unwrap(loc), modOp, name, values);
-          });
+  mod.def("gen_vector_of_complex_constant",
+          [](MlirLocation loc, MlirModule module, std::string name,
+             const std::vector<std::complex<double>> &values) {
+            ModuleOp modOp = unwrap(module);
+            cudaq::IRBuilder builder = IRBuilder::atBlockEnd(modOp.getBody());
+            // Copy into the SmallVector that genVectorOfConstants expects.
+            SmallVector<std::complex<double>> newValues(values.begin(),
+                                                        values.end());
+            builder.genVectorOfConstants(unwrap(loc), modOp, name, newValues);
+          });
 }
 } // namespace cudaq
diff --git a/runtime/common/ArgumentConversion.cpp b/runtime/common/ArgumentConversion.cpp
index 424cbd8873d..0de2589752f 100644
--- a/runtime/common/ArgumentConversion.cpp
+++ b/runtime/common/ArgumentConversion.cpp
@@ -123,7 +123,7 @@ static Value genConstant(OpBuilder &builder, const cudaq::state *v,
 
     cudaq::IRBuilder irBuilder(ctx);
     auto genConArray = [&]<typename T>() -> Value {
-      std::vector<std::complex<T>> vec(size);
+      SmallVector<std::complex<T>> vec(size);
       for (std::size_t i = 0; i < size; i++) {
         vec[i] = (*v)({i}, 0);
       }
diff --git a/runtime/common/BaseRemoteRESTQPU.h b/runtime/common/BaseRemoteRESTQPU.h
index c65e94c3b6e..641b4457476 100644
--- a/runtime/common/BaseRemoteRESTQPU.h
+++ b/runtime/common/BaseRemoteRESTQPU.h
@@ -399,7 +399,7 @@ class BaseRemoteRESTQPU : public cudaq::QPU {
     for (auto &op : m_module.getOps()) {
       // Add any global symbols, including global constant arrays.
       // Global constant arrays can be created during compilation,
-      // `lift-array-value`, `quake-synthesizer`, and `get-concrete-matrix`
-      // passes.
+      // `globalize-array-values`, `quake-synthesizer`, and
+      // `get-concrete-matrix` passes.
       if (auto globalOp = dyn_cast<cudaq::cc::GlobalOp>(op))
         moduleOp.push_back(globalOp.clone());
diff --git a/runtime/cudaq/platform/default/opt-test.yml b/runtime/cudaq/platform/default/opt-test.yml
index caa1532c53b..eacf0375b2f 100644
--- a/runtime/cudaq/platform/default/opt-test.yml
+++ b/runtime/cudaq/platform/default/opt-test.yml
@@ -22,19 +22,19 @@ configuration-matrix:
     config:
       nvqir-simulation-backend: cusvsim-fp32, custatevec-fp32
       preprocessor-defines: ["-D CUDAQ_SIMULATION_SCALAR_FP32"]
-      target-pass-pipeline: "func.func(unwind-lowering),canonicalize,lambda-lifting,func.func(memtoreg{quantum=0}),canonicalize,apply-op-specialization,kernel-execution,aggressive-early-inlining,func.func(quake-add-metadata),const-prop-complex,lift-array-value,func.func(get-concrete-matrix),device-code-loader{use-quake=1},canonicalize,cse,func.func(add-dealloc,combine-quantum-alloc,canonicalize,factor-quantum-alloc,memtoreg),canonicalize,cse,add-wireset,func.func(assign-wire-indices),dep-analysis,func.func(regtomem),symbol-dce"
+      target-pass-pipeline: "func.func(unwind-lowering),canonicalize,lambda-lifting,func.func(memtoreg{quantum=0}),canonicalize,apply-op-specialization,kernel-execution,aggressive-early-inlining,func.func(quake-add-metadata,const-prop-complex,lift-array-alloc),globalize-array-values,func.func(get-concrete-matrix),device-code-loader{use-quake=1},canonicalize,cse,func.func(add-dealloc,combine-quantum-alloc,canonicalize,factor-quantum-alloc,memtoreg),canonicalize,cse,add-wireset,func.func(assign-wire-indices),dep-analysis,func.func(regtomem),symbol-dce"
       library-mode: false
   - name: dep-analysis-fp64
     option-flags: [dep-analysis, fp64]
     config:
       nvqir-simulation-backend: cusvsim-fp64, custatevec-fp64
       preprocessor-defines: ["-D CUDAQ_SIMULATION_SCALAR_FP64"]
-      target-pass-pipeline: "func.func(unwind-lowering),canonicalize,lambda-lifting,func.func(memtoreg{quantum=0}),canonicalize,apply-op-specialization,kernel-execution,aggressive-early-inlining,func.func(quake-add-metadata),const-prop-complex,lift-array-value,func.func(get-concrete-matrix),device-code-loader{use-quake=1},canonicalize,cse,func.func(add-dealloc,combine-quantum-alloc,canonicalize,factor-quantum-alloc,memtoreg),canonicalize,cse,add-wireset,func.func(assign-wire-indices),dep-analysis,func.func(regtomem),symbol-dce"
+      target-pass-pipeline: "func.func(unwind-lowering),canonicalize,lambda-lifting,func.func(memtoreg{quantum=0}),canonicalize,apply-op-specialization,kernel-execution,aggressive-early-inlining,func.func(quake-add-metadata,const-prop-complex,lift-array-alloc),globalize-array-values,func.func(get-concrete-matrix),device-code-loader{use-quake=1},canonicalize,cse,func.func(add-dealloc,combine-quantum-alloc,canonicalize,factor-quantum-alloc,memtoreg),canonicalize,cse,add-wireset,func.func(assign-wire-indices),dep-analysis,func.func(regtomem),symbol-dce"
       library-mode: false
   - name: dep-analysis-qpp
     option-flags: [dep-analysis, qpp]
     config:
       nvqir-simulation-backend: qpp
       preprocessor-defines: ["-D CUDAQ_SIMULATION_SCALAR_FP64"]
-      target-pass-pipeline: "func.func(unwind-lowering),canonicalize,lambda-lifting,func.func(memtoreg{quantum=0}),canonicalize,apply-op-specialization,kernel-execution,aggressive-early-inlining,func.func(quake-add-metadata),const-prop-complex,lift-array-value,func.func(get-concrete-matrix),device-code-loader{use-quake=1},canonicalize,cse,func.func(add-dealloc,combine-quantum-alloc,canonicalize,factor-quantum-alloc,memtoreg),canonicalize,cse,add-wireset,func.func(assign-wire-indices),dep-analysis,func.func(regtomem),symbol-dce"
+      target-pass-pipeline: "func.func(unwind-lowering),canonicalize,lambda-lifting,func.func(memtoreg{quantum=0}),canonicalize,apply-op-specialization,kernel-execution,aggressive-early-inlining,func.func(quake-add-metadata,const-prop-complex,lift-array-alloc),globalize-array-values,func.func(get-concrete-matrix),device-code-loader{use-quake=1},canonicalize,cse,func.func(add-dealloc,combine-quantum-alloc,canonicalize,factor-quantum-alloc,memtoreg),canonicalize,cse,add-wireset,func.func(assign-wire-indices),dep-analysis,func.func(regtomem),symbol-dce"
       library-mode: false
diff --git a/runtime/cudaq/platform/default/rest/helpers/anyon/anyon.yml b/runtime/cudaq/platform/default/rest/helpers/anyon/anyon.yml
index 3ecb49f3021..300bb038ee1 100644
--- a/runtime/cudaq/platform/default/rest/helpers/anyon/anyon.yml
+++ b/runtime/cudaq/platform/default/rest/helpers/anyon/anyon.yml
@@ -16,7 +16,7 @@ config:
   # Add the rest-qpu library to the link list
   link-libs: ["-lcudaq-rest-qpu"]
   # Define the lowering pipeline
-  platform-lowering-config: "const-prop-complex,canonicalize,cse,lift-array-value,state-prep,unitary-synthesis,canonicalize,apply-op-specialization,aggressive-early-inlining,expand-measurements,unrolling-pipeline,decomposition{enable-patterns=U3ToRotations},func.func(lower-to-cfg),canonicalize,func.func(multicontrol-decomposition),anyon-%Q_GATE%-set-mapping,func.func(add-dealloc,combine-quantum-alloc,canonicalize,factor-quantum-alloc,memtoreg),add-wireset,func.func(assign-wire-indices),qubit-mapping{device=file(%QPU_ARCH%)},func.func(regtomem),symbol-dce"
+  platform-lowering-config: "func.func(const-prop-complex,canonicalize,cse,lift-array-alloc),globalize-array-values,state-prep,unitary-synthesis,canonicalize,apply-op-specialization,aggressive-early-inlining,expand-measurements,unrolling-pipeline,decomposition{enable-patterns=U3ToRotations},func.func(lower-to-cfg),canonicalize,func.func(multicontrol-decomposition),anyon-%Q_GATE%-set-mapping,func.func(add-dealloc,combine-quantum-alloc,canonicalize,factor-quantum-alloc,memtoreg),add-wireset,func.func(assign-wire-indices),qubit-mapping{device=file(%QPU_ARCH%)},func.func(regtomem),symbol-dce"
   # Tell the rest-qpu that we are generating Adaptive QIR.
   codegen-emission: qir-adaptive
   # Library mode is only for simulators, physical backends must turn this off
diff --git a/runtime/cudaq/platform/default/rest/helpers/ionq/ionq.yml b/runtime/cudaq/platform/default/rest/helpers/ionq/ionq.yml
index 238d4c33163..913c91a8e26 100644
--- a/runtime/cudaq/platform/default/rest/helpers/ionq/ionq.yml
+++ b/runtime/cudaq/platform/default/rest/helpers/ionq/ionq.yml
@@ -16,7 +16,7 @@ config:
   # Add the rest-qpu library to the link list
   link-libs: ["-lcudaq-rest-qpu"]
   # Define the lowering pipeline
-  platform-lowering-config: "const-prop-complex,canonicalize,cse,lift-array-value,state-prep,unitary-synthesis,canonicalize,apply-op-specialization,aggressive-early-inlining,expand-measurements,unrolling-pipeline,decomposition{enable-patterns=U3ToRotations},func.func(lower-to-cfg),canonicalize,func.func(multicontrol-decomposition),ionq-gate-set-mapping"
+  platform-lowering-config: "func.func(const-prop-complex,canonicalize,cse,lift-array-alloc),globalize-array-values,state-prep,unitary-synthesis,canonicalize,apply-op-specialization,aggressive-early-inlining,expand-measurements,unrolling-pipeline,decomposition{enable-patterns=U3ToRotations},func.func(lower-to-cfg),canonicalize,func.func(multicontrol-decomposition),ionq-gate-set-mapping"
   # Tell the rest-qpu that we are generating QIR.
   codegen-emission: qir-base
   # Additional passes to run after lowering to QIR
diff --git a/runtime/cudaq/platform/default/rest/helpers/iqm/iqm.yml b/runtime/cudaq/platform/default/rest/helpers/iqm/iqm.yml
index 0e90a1e2afa..841d4ea3665 100644
--- a/runtime/cudaq/platform/default/rest/helpers/iqm/iqm.yml
+++ b/runtime/cudaq/platform/default/rest/helpers/iqm/iqm.yml
@@ -16,7 +16,7 @@ config:
   # Add the rest-qpu library to the link list
   link-libs: ["-lcudaq-rest-qpu"]
   # Define the lowering pipeline
-  platform-lowering-config: "const-prop-complex,canonicalize,cse,lift-array-value,state-prep,unitary-synthesis,canonicalize,apply-op-specialization,aggressive-early-inlining,expand-measurements,unrolling-pipeline,decomposition{enable-patterns=U3ToRotations},func.func(lower-to-cfg),canonicalize,func.func(multicontrol-decomposition),iqm-gate-set-mapping,func.func(add-dealloc,combine-quantum-alloc,canonicalize,factor-quantum-alloc,memtoreg),add-wireset,func.func(assign-wire-indices),qubit-mapping{device=file(%QPU_ARCH%)},func.func(delay-measurements,regtomem),symbol-dce,iqm-gate-set-mapping"
+  platform-lowering-config: "func.func(const-prop-complex,canonicalize,cse,lift-array-alloc),globalize-array-values,state-prep,unitary-synthesis,canonicalize,apply-op-specialization,aggressive-early-inlining,expand-measurements,unrolling-pipeline,decomposition{enable-patterns=U3ToRotations},func.func(lower-to-cfg),canonicalize,func.func(multicontrol-decomposition),iqm-gate-set-mapping,func.func(add-dealloc,combine-quantum-alloc,canonicalize,factor-quantum-alloc,memtoreg),add-wireset,func.func(assign-wire-indices),qubit-mapping{device=file(%QPU_ARCH%)},func.func(delay-measurements,regtomem),symbol-dce,iqm-gate-set-mapping"
   # Tell the rest-qpu that we are generating IQM JSON.
   codegen-emission: iqm
   # Library mode is only for simulators, physical backends must turn this off
diff --git a/runtime/cudaq/platform/default/rest/helpers/oqc/oqc.yml b/runtime/cudaq/platform/default/rest/helpers/oqc/oqc.yml
index 6a8a46c0667..fb02c57d960 100644
--- a/runtime/cudaq/platform/default/rest/helpers/oqc/oqc.yml
+++ b/runtime/cudaq/platform/default/rest/helpers/oqc/oqc.yml
@@ -16,7 +16,7 @@ config:
   # Add the rest-qpu library to the link list
   link-libs: ["-lcudaq-rest-qpu"]
   # Define the lowering pipeline
-  platform-lowering-config: "const-prop-complex,canonicalize,cse,lift-array-value,state-prep,unitary-synthesis,canonicalize,apply-op-specialization,aggressive-early-inlining,expand-measurements,unrolling-pipeline,decomposition{enable-patterns=U3ToRotations},func.func(lower-to-cfg),canonicalize,func.func(multicontrol-decomposition),oqc-gate-set-mapping,func.func(add-dealloc,combine-quantum-alloc,canonicalize,factor-quantum-alloc,memtoreg),add-wireset,func.func(assign-wire-indices),qubit-mapping{device=file(%QPU_ARCH%)},func.func(regtomem),symbol-dce"
+  platform-lowering-config: "func.func(const-prop-complex,canonicalize,cse,lift-array-alloc),globalize-array-values,state-prep,unitary-synthesis,canonicalize,apply-op-specialization,aggressive-early-inlining,expand-measurements,unrolling-pipeline,decomposition{enable-patterns=U3ToRotations},func.func(lower-to-cfg),canonicalize,func.func(multicontrol-decomposition),oqc-gate-set-mapping,func.func(add-dealloc,combine-quantum-alloc,canonicalize,factor-quantum-alloc,memtoreg),add-wireset,func.func(assign-wire-indices),qubit-mapping{device=file(%QPU_ARCH%)},func.func(regtomem),symbol-dce"
   # Tell the rest-qpu that we are generating QIR.
   codegen-emission: qir-base
   # Library mode is only for simulators, physical backends must turn this off
diff --git a/runtime/cudaq/platform/default/rest/helpers/quantinuum/quantinuum.yml b/runtime/cudaq/platform/default/rest/helpers/quantinuum/quantinuum.yml
index 21cc45be1e3..efd4eafdf6a 100644
--- a/runtime/cudaq/platform/default/rest/helpers/quantinuum/quantinuum.yml
+++ b/runtime/cudaq/platform/default/rest/helpers/quantinuum/quantinuum.yml
@@ -16,7 +16,7 @@ config:
   # Add the rest-qpu library to the link list
   link-libs: ["-lcudaq-rest-qpu"]
   # Define the lowering pipeline
-  platform-lowering-config: "const-prop-complex,canonicalize,cse,lift-array-value,state-prep,unitary-synthesis,canonicalize,apply-op-specialization,aggressive-early-inlining,expand-measurements,unrolling-pipeline,decomposition{enable-patterns=U3ToRotations},func.func(lower-to-cfg),canonicalize,func.func(multicontrol-decomposition),quantinuum-gate-set-mapping"
+  platform-lowering-config: "func.func(const-prop-complex,canonicalize,cse,lift-array-alloc),globalize-array-values,state-prep,unitary-synthesis,canonicalize,apply-op-specialization,aggressive-early-inlining,expand-measurements,unrolling-pipeline,decomposition{enable-patterns=U3ToRotations},func.func(lower-to-cfg),canonicalize,func.func(multicontrol-decomposition),quantinuum-gate-set-mapping"
   # Tell the rest-qpu that we are generating Adaptive QIR.
   codegen-emission: qir-adaptive
   # Library mode is only for simulators, physical backends must turn this off
diff --git a/targettests/Remote-Sim/qvector_init_from_vector.cpp b/targettests/Remote-Sim/qvector_init_from_vector.cpp
index c55ede90205..16db5bdbadd 100644
--- a/targettests/Remote-Sim/qvector_init_from_vector.cpp
+++ b/targettests/Remote-Sim/qvector_init_from_vector.cpp
@@ -19,30 +19,28 @@
 
 __qpu__ void test_large_double_constant_array() {
   std::vector<double> vec(1ULL << 19);
-  vec[0]= M_SQRT1_2/vec.size();
-  vec[1]= M_SQRT1_2/vec.size();
+  vec[0] = M_SQRT1_2 / vec.size();
+  vec[1] = M_SQRT1_2 / vec.size();
   for (std::size_t i = 2; i < vec.size(); i++) {
-    vec[i]= 0;
+    vec[i] = 0;
   }
   cudaq::qvector v(vec);
 }
 
 __qpu__ void test_complex_constant_array() {
-   cudaq::qvector v(std::vector<cudaq::complex>({ M_SQRT1_2, M_SQRT1_2, 0., 0.}));
+  cudaq::qvector v(std::vector<cudaq::complex>({M_SQRT1_2, M_SQRT1_2, 0., 0.}));
 }
 
 __qpu__ void test_complex_constant_array2() {
-   cudaq::qvector v1(std::vector<cudaq::complex>({ M_SQRT1_2, M_SQRT1_2, 0., 0.}));
-   cudaq::qvector v2(std::vector<cudaq::complex>({ 0., 0., M_SQRT1_2, M_SQRT1_2}));
+  cudaq::qvector v1(
+      std::vector<cudaq::complex>({M_SQRT1_2, M_SQRT1_2, 0., 0.}));
+  cudaq::qvector v2(
+      std::vector<cudaq::complex>({0., 0., M_SQRT1_2, M_SQRT1_2}));
 }
 
 __qpu__ void test_complex_constant_array3() {
-   cudaq::qvector v({
-    cudaq::complex(M_SQRT1_2),
-    cudaq::complex(M_SQRT1_2),
-    cudaq::complex(0.0),
-    cudaq::complex(0.0)
-  });
+  cudaq::qvector v({cudaq::complex(M_SQRT1_2), cudaq::complex(M_SQRT1_2),
+                    cudaq::complex(0.0), cudaq::complex(0.0)});
 }
 
 __qpu__ void test_complex_array_param(std::vector<cudaq::complex> inState) {
@@ -50,7 +48,7 @@ __qpu__ void test_complex_array_param(std::vector<cudaq::complex> inState) {
 }
 
 __qpu__ void test_real_constant_array() {
-  cudaq::qvector v({ M_SQRT1_2, M_SQRT1_2, 0., 0.});
+  cudaq::qvector v({M_SQRT1_2, M_SQRT1_2, 0., 0.});
 }
 
 __qpu__ void test_real_array_param(std::vector<cudaq::real> inState) {
@@ -65,7 +63,7 @@ __qpu__ void test_float_array_param(std::vector<float> inState) {
   cudaq::qvector q = inState;
 }
 
-void printCounts(cudaq::sample_result& result) {
+void printCounts(cudaq::sample_result &result) {
   std::vector<std::string> values{};
   for (auto &&[bits, counts] : result) {
     values.push_back(bits);
@@ -78,155 +76,177 @@ void printCounts(cudaq::sample_result& result) {
 }
 
 int main() {
-    {
-      auto counts = cudaq::sample(test_large_double_constant_array);
-      printCounts(counts);
-    }
+  {
+    auto counts = cudaq::sample(test_large_double_constant_array);
+    std::cout << "Part 1\n";
+    printCounts(counts);
+  }
 
-// CHECK: 0000000000000000000
-// CHECK: 1000000000000000000
+  // CHECK-LABEL: Part 1
+  // CHECK: 0000000000000000000
+  // CHECK: 1000000000000000000
 
-    {
-      auto counts = cudaq::sample(test_complex_constant_array);
-      printCounts(counts);
-    }
+  {
+    auto counts = cudaq::sample(test_complex_constant_array);
+    std::cout << "Part 2\n";
+    printCounts(counts);
+  }
 
-// CHECK: 00
-// CHECK: 10
+  // CHECK-LABEL: Part 2
+  // CHECK: 00
+  // CHECK: 10
 
-    {
-      auto counts = cudaq::sample(test_complex_constant_array2);
-      printCounts(counts);
-    }
+  {
+    auto counts = cudaq::sample(test_complex_constant_array2);
+    std::cout << "Part 3\n";
+    printCounts(counts);
+  }
 
-// CHECK: 0001
-// CHECK: 0011
-// CHECK: 1001
-// CHECK: 1011
+  // CHECK-LABEL: Part 3
+  // CHECK: 0001
+  // CHECK: 0011
+  // CHECK: 1001
+  // CHECK: 1011
 
-    {
-      auto counts = cudaq::sample(test_complex_constant_array3);
-      printCounts(counts);
-    }
+  {
+    auto counts = cudaq::sample(test_complex_constant_array3);
+    std::cout << "Part 4\n";
+    printCounts(counts);
+  }
 
-// CHECK: 00
-// CHECK: 10
+  // CHECK-LABEL: Part 4
+  // CHECK: 00
+  // CHECK: 10
 
-    {
-      auto counts = cudaq::sample(test_real_constant_array);
-      printCounts(counts);
-    }
+  {
+    auto counts = cudaq::sample(test_real_constant_array);
+    std::cout << "Part 5\n";
+    printCounts(counts);
+  }
 
-// CHECK: 00
-// CHECK: 10
+  // CHECK-LABEL: Part 5
+  // CHECK: 00
+  // CHECK: 10
 
+  {
+    std::vector<cudaq::complex> vec{M_SQRT1_2, M_SQRT1_2, 0., 0.};
+    std::vector<cudaq::complex> vec1{0., 0., M_SQRT1_2, M_SQRT1_2};
     {
-      std::vector<cudaq::complex> vec{M_SQRT1_2, M_SQRT1_2, 0., 0.};
-      std::vector<cudaq::complex> vec1{0., 0., M_SQRT1_2, M_SQRT1_2};
-      {
-          // Passing state data as argument (kernel mode)
-          auto counts = cudaq::sample(test_complex_array_param, vec);
-          printCounts(counts);
-
-          counts = cudaq::sample(test_complex_array_param, vec1);
-          printCounts(counts);
-      }
-
-// CHECK: 00
-// CHECK: 10
-
-// CHECK: 01
-// CHECK: 11
-
-      {
-          // Passing state data as argument (builder mode)
-          auto [kernel, v] = cudaq::make_kernel<std::vector<cudaq::complex>>();
-          auto qubits = kernel.qalloc(v);
-
-          auto counts = cudaq::sample(kernel, vec);
-          printCounts(counts);
-
-          counts = cudaq::sample(kernel, vec1);
-          printCounts(counts);
-      }
+      // Passing state data as argument (kernel mode)
+      auto counts = cudaq::sample(test_complex_array_param, vec);
+      std::cout << "Part 6\n";
+      printCounts(counts);
+
+      counts = cudaq::sample(test_complex_array_param, vec1);
+      printCounts(counts);
     }
 
-// CHECK: 00
-// CHECK: 10
+    // CHECK-LABEL: Part 6
+    // CHECK: 00
+    // CHECK: 10
 
-// CHECK: 01
-// CHECK: 11
+    // CHECK: 01
+    // CHECK: 11
 
     {
-      std::vector<cudaq::real> vec{M_SQRT1_2, M_SQRT1_2, 0., 0.};
-      std::vector<cudaq::real> vec1{0., 0., M_SQRT1_2, M_SQRT1_2};
-      {
-          // Passing state data as argument (kernel mode)
-          auto counts = cudaq::sample(test_real_array_param, vec);
-          printCounts(counts);
+      // Passing state data as argument (builder mode)
+      auto [kernel, v] = cudaq::make_kernel<std::vector<cudaq::complex>>();
+      auto qubits = kernel.qalloc(v);
 
-          counts = cudaq::sample(test_real_array_param, vec1);
-          printCounts(counts);
-      }
-
-// CHECK: 00
-// CHECK: 10
-
-// CHECK: 01
-// CHECK: 11
-
-      {
-          // Passing state data as argument (builder mode)
-          auto [kernel, v] = cudaq::make_kernel<std::vector<cudaq::real>>();
-          auto qubits = kernel.qalloc(v);
-
-          auto counts = cudaq::sample(kernel, vec);
-          printCounts(counts);
+      auto counts = cudaq::sample(kernel, vec);
+      std::cout << "Part 7\n";
+      printCounts(counts);
 
-          counts = cudaq::sample(kernel, vec1);
-          printCounts(counts);
-      }
+      counts = cudaq::sample(kernel, vec1);
+      printCounts(counts);
+    }
+  }
 
-// CHECK: 00
-// CHECK: 10
+  // CHECK-LABEL: Part 7
+  // CHECK: 00
+  // CHECK: 10
 
-// CHECK: 01
-// CHECK: 11
-    }
+  // CHECK: 01
+  // CHECK: 11
 
+  {
+    std::vector<cudaq::real> vec{M_SQRT1_2, M_SQRT1_2, 0., 0.};
+    std::vector<cudaq::real> vec1{0., 0., M_SQRT1_2, M_SQRT1_2};
     {
-      std::vector<double> vec{M_SQRT1_2, M_SQRT1_2, 0., 0.};
-      std::vector<double> vec1{0., 0., M_SQRT1_2, M_SQRT1_2};
-
       // Passing state data as argument (kernel mode)
-      auto counts = cudaq::sample(test_double_array_param, vec);
+      auto counts = cudaq::sample(test_real_array_param, vec);
+      std::cout << "Part 8\n";
       printCounts(counts);
 
-      counts = cudaq::sample(test_double_array_param, vec1);
+      counts = cudaq::sample(test_real_array_param, vec1);
       printCounts(counts);
     }
 
-// CHECK: 00
-// CHECK: 10
+    // CHECK-LABEL: Part 8
+    // CHECK: 00
+    // CHECK: 10
 
-// CHECK: 01
-// CHECK: 11
+    // CHECK: 01
+    // CHECK: 11
 
     {
-      std::vector<float> vec{M_SQRT1_2, M_SQRT1_2, 0., 0.};
-      std::vector<float> vec1{0., 0., M_SQRT1_2, M_SQRT1_2};
+      // Passing state data as argument (builder mode)
+      auto [kernel, v] = cudaq::make_kernel<std::vector<cudaq::real>>();
+      auto qubits = kernel.qalloc(v);
 
-      // Passing state data as argument (kernel mode)
-      auto counts = cudaq::sample(test_float_array_param, vec);
+      auto counts = cudaq::sample(kernel, vec);
+      std::cout << "Part 9\n";
       printCounts(counts);
 
-      counts = cudaq::sample(test_float_array_param, vec1);
+      counts = cudaq::sample(kernel, vec1);
       printCounts(counts);
     }
 
-// CHECK: 00
-// CHECK: 10
+    // CHECK-LABEL: Part 9
+    // CHECK: 00
+    // CHECK: 10
+
+    // CHECK: 01
+    // CHECK: 11
+  }
+
+  {
+    std::vector<double> vec{M_SQRT1_2, M_SQRT1_2, 0., 0.};
+    std::vector<double> vec1{0., 0., M_SQRT1_2, M_SQRT1_2};
+
+    // Passing state data as argument (kernel mode)
+    auto counts = cudaq::sample(test_double_array_param, vec);
+    std::cout << "Part 10\n";
+    printCounts(counts);
+
+    counts = cudaq::sample(test_double_array_param, vec1);
+    printCounts(counts);
+  }
+
+  // CHECK-LABEL: Part 10
+  // CHECK: 00
+  // CHECK: 10
+
+  // CHECK: 01
+  // CHECK: 11
+
+  {
+    std::vector<float> vec{M_SQRT1_2, M_SQRT1_2, 0., 0.};
+    std::vector<float> vec1{0., 0., M_SQRT1_2, M_SQRT1_2};
+
+    // Passing state data as argument (kernel mode)
+    auto counts = cudaq::sample(test_float_array_param, vec);
+    std::cout << "Part 11\n";
+    printCounts(counts);
+
+    counts = cudaq::sample(test_float_array_param, vec1);
+    printCounts(counts);
+  }
+
+  // CHECK-LABEL: Part 11
+  // CHECK: 00
+  // CHECK: 10
 
-// CHECK: 01
-// CHECK: 11
+  // CHECK: 01
+  // CHECK: 11
 }
diff --git a/targettests/TargetConfig/RegressionValidation/anyon.config b/targettests/TargetConfig/RegressionValidation/anyon.config
index a281c7a156f..5c81c0c3e0c 100644
--- a/targettests/TargetConfig/RegressionValidation/anyon.config
+++ b/targettests/TargetConfig/RegressionValidation/anyon.config
@@ -20,7 +20,7 @@
 # Define the lowering pipeline. telegraph-8q has an 8-qubit ring topology, so mapping
 # uses ring(8).
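-# Berkeley-25q uses a bidiratctional connectivity lattice with 8 connectivity per qubit in the bulk.
+# Berkeley-25q uses a bidirectional connectivity lattice with 8 connectivity per qubit in the bulk.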
-# CHECK-DAG: PLATFORM_LOWERING_CONFIG="const-prop-complex,canonicalize,cse,lift-array-value,state-prep,unitary-synthesis,canonicalize,apply-op-specialization,aggressive-early-inlining,expand-measurements,unrolling-pipeline,decomposition{enable-patterns=U3ToRotations},func.func(lower-to-cfg),canonicalize,func.func(multicontrol-decomposition),anyon-%Q_GATE%-set-mapping,func.func(add-dealloc,combine-quantum-alloc,canonicalize,factor-quantum-alloc,memtoreg),add-wireset,func.func(assign-wire-indices),qubit-mapping{device=file(%QPU_ARCH%)},func.func(regtomem),symbol-dce"
+# CHECK-DAG: PLATFORM_LOWERING_CONFIG="func.func(const-prop-complex,canonicalize,cse,lift-array-alloc),globalize-array-values,state-prep,unitary-synthesis,canonicalize,apply-op-specialization,aggressive-early-inlining,expand-measurements,unrolling-pipeline,decomposition{enable-patterns=U3ToRotations},func.func(lower-to-cfg),canonicalize,func.func(multicontrol-decomposition),anyon-%Q_GATE%-set-mapping,func.func(add-dealloc,combine-quantum-alloc,canonicalize,factor-quantum-alloc,memtoreg),add-wireset,func.func(assign-wire-indices),qubit-mapping{device=file(%QPU_ARCH%)},func.func(regtomem),symbol-dce"
 
 
 # Tell the rest-qpu that we are generating QIR.
@@ -49,4 +49,4 @@ while [ $# -gt 1 ]; do
 		;;
 	esac
 	shift 2
-done
\ No newline at end of file
+done
diff --git a/targettests/TargetConfig/RegressionValidation/ionq.config b/targettests/TargetConfig/RegressionValidation/ionq.config
index ca18d8286a6..1d11dbd3511 100644
--- a/targettests/TargetConfig/RegressionValidation/ionq.config
+++ b/targettests/TargetConfig/RegressionValidation/ionq.config
@@ -18,7 +18,7 @@
 # CHECK-DAG: LINKLIBS="${LINKLIBS} -lcudaq-rest-qpu"
 
 # Define the lowering pipeline
-# CHECK-DAG: PLATFORM_LOWERING_CONFIG="const-prop-complex,canonicalize,cse,lift-array-value,state-prep,unitary-synthesis,canonicalize,apply-op-specialization,aggressive-early-inlining,expand-measurements,unrolling-pipeline,decomposition{enable-patterns=U3ToRotations},func.func(lower-to-cfg),canonicalize,func.func(multicontrol-decomposition),ionq-gate-set-mapping"
+# CHECK-DAG: PLATFORM_LOWERING_CONFIG="func.func(const-prop-complex,canonicalize,cse,lift-array-alloc),globalize-array-values,state-prep,unitary-synthesis,canonicalize,apply-op-specialization,aggressive-early-inlining,expand-measurements,unrolling-pipeline,decomposition{enable-patterns=U3ToRotations},func.func(lower-to-cfg),canonicalize,func.func(multicontrol-decomposition),ionq-gate-set-mapping"
 
 # Tell the rest-qpu that we are generating QIR.
 # CHECK-DAG: CODEGEN_EMISSION=qir-base
diff --git a/targettests/TargetConfig/RegressionValidation/iqm.config b/targettests/TargetConfig/RegressionValidation/iqm.config
index 073e269408a..4db04b874e1 100644
--- a/targettests/TargetConfig/RegressionValidation/iqm.config
+++ b/targettests/TargetConfig/RegressionValidation/iqm.config
@@ -20,7 +20,7 @@
 # Define the lowering pipeline, here we lower to Base QIR
 # Note: the runtime will dynamically substitute %QPU_ARCH% based on
 # qpu-architecture
-# CHECK-DAG: PLATFORM_LOWERING_CONFIG="const-prop-complex,canonicalize,cse,lift-array-value,state-prep,unitary-synthesis,canonicalize,apply-op-specialization,aggressive-early-inlining,expand-measurements,unrolling-pipeline,decomposition{enable-patterns=U3ToRotations},func.func(lower-to-cfg),canonicalize,func.func(multicontrol-decomposition),iqm-gate-set-mapping,func.func(add-dealloc,combine-quantum-alloc,canonicalize,factor-quantum-alloc,memtoreg),add-wireset,func.func(assign-wire-indices),qubit-mapping{device=file(%QPU_ARCH%)},func.func(delay-measurements,regtomem),symbol-dce,iqm-gate-set-mapping"
+# CHECK-DAG: PLATFORM_LOWERING_CONFIG="func.func(const-prop-complex,canonicalize,cse,lift-array-alloc),globalize-array-values,state-prep,unitary-synthesis,canonicalize,apply-op-specialization,aggressive-early-inlining,expand-measurements,unrolling-pipeline,decomposition{enable-patterns=U3ToRotations},func.func(lower-to-cfg),canonicalize,func.func(multicontrol-decomposition),iqm-gate-set-mapping,func.func(add-dealloc,combine-quantum-alloc,canonicalize,factor-quantum-alloc,memtoreg),add-wireset,func.func(assign-wire-indices),qubit-mapping{device=file(%QPU_ARCH%)},func.func(delay-measurements,regtomem),symbol-dce,iqm-gate-set-mapping"
 
 # Tell the rest-qpu that we are generating IQM JSON.
 # CHECK-DAG: CODEGEN_EMISSION=iqm
diff --git a/targettests/TargetConfig/RegressionValidation/oqc.config b/targettests/TargetConfig/RegressionValidation/oqc.config
index adbceff0125..bd81dfe9035 100644
--- a/targettests/TargetConfig/RegressionValidation/oqc.config
+++ b/targettests/TargetConfig/RegressionValidation/oqc.config
@@ -20,7 +20,7 @@
 # Define the lowering pipeline. Lucy has an 8-qubit ring topology, so mapping
 # uses ring(8).
 # Toshiko uses a Kagome lattice with 2-3 connectivity per qubit
-# CHECK-DAG: PLATFORM_LOWERING_CONFIG="const-prop-complex,canonicalize,cse,lift-array-value,state-prep,unitary-synthesis,canonicalize,apply-op-specialization,aggressive-early-inlining,expand-measurements,unrolling-pipeline,decomposition{enable-patterns=U3ToRotations},func.func(lower-to-cfg),canonicalize,func.func(multicontrol-decomposition),oqc-gate-set-mapping,func.func(add-dealloc,combine-quantum-alloc,canonicalize,factor-quantum-alloc,memtoreg),add-wireset,func.func(assign-wire-indices),qubit-mapping{device=file(%QPU_ARCH%)},func.func(regtomem),symbol-dce"
+# CHECK-DAG: PLATFORM_LOWERING_CONFIG="func.func(const-prop-complex,canonicalize,cse,lift-array-alloc),globalize-array-values,state-prep,unitary-synthesis,canonicalize,apply-op-specialization,aggressive-early-inlining,expand-measurements,unrolling-pipeline,decomposition{enable-patterns=U3ToRotations},func.func(lower-to-cfg),canonicalize,func.func(multicontrol-decomposition),oqc-gate-set-mapping,func.func(add-dealloc,combine-quantum-alloc,canonicalize,factor-quantum-alloc,memtoreg),add-wireset,func.func(assign-wire-indices),qubit-mapping{device=file(%QPU_ARCH%)},func.func(regtomem),symbol-dce"
 
 
 # Tell the rest-qpu that we are generating QIR.
diff --git a/targettests/TargetConfig/RegressionValidation/quantinuum.config b/targettests/TargetConfig/RegressionValidation/quantinuum.config
index c899422b0fa..d7f7c32d852 100644
--- a/targettests/TargetConfig/RegressionValidation/quantinuum.config
+++ b/targettests/TargetConfig/RegressionValidation/quantinuum.config
@@ -18,7 +18,7 @@
 # CHECK-DAG: LINKLIBS="${LINKLIBS} -lcudaq-rest-qpu"
 
 # Define the lowering pipeline, here we lower to Adaptive QIR
-# CHECK-DAG: PLATFORM_LOWERING_CONFIG="const-prop-complex,canonicalize,cse,lift-array-value,state-prep,unitary-synthesis,canonicalize,apply-op-specialization,aggressive-early-inlining,expand-measurements,unrolling-pipeline,decomposition{enable-patterns=U3ToRotations},func.func(lower-to-cfg),canonicalize,func.func(multicontrol-decomposition),quantinuum-gate-set-mapping"
+# CHECK-DAG: PLATFORM_LOWERING_CONFIG="func.func(const-prop-complex,canonicalize,cse,lift-array-alloc),globalize-array-values,state-prep,unitary-synthesis,canonicalize,apply-op-specialization,aggressive-early-inlining,expand-measurements,unrolling-pipeline,decomposition{enable-patterns=U3ToRotations},func.func(lower-to-cfg),canonicalize,func.func(multicontrol-decomposition),quantinuum-gate-set-mapping"
 
 # Tell the rest-qpu that we are generating QIR.
 # CHECK-DAG: CODEGEN_EMISSION=qir-adaptive
diff --git a/test/AST-Quake/custom_op_concrete_matrix.cpp b/test/AST-Quake/custom_op_concrete_matrix.cpp
index f7a5b967344..5ab0313fabd 100644
--- a/test/AST-Quake/custom_op_concrete_matrix.cpp
+++ b/test/AST-Quake/custom_op_concrete_matrix.cpp
@@ -6,7 +6,9 @@
  * the terms of the Apache License 2.0 which accompanies this distribution.    *
  ******************************************************************************/
 
-// RUN: cudaq-quake %cpp_std %s | cudaq-opt -const-prop-complex -lift-array-value -get-concrete-matrix | FileCheck %s
+// clang-format off
+// RUN: cudaq-quake %cpp_std %s | cudaq-opt -const-prop-complex -lift-array-alloc -globalize-array-values -get-concrete-matrix | FileCheck %s
+// clang-format on
 
 #include <cudaq.h>
 
@@ -16,13 +18,13 @@ CUDAQ_REGISTER_OPERATION(custom_h, 1, 0,
 CUDAQ_REGISTER_OPERATION(custom_cnot, 2, 0,
                          {1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0})
 
-
 __qpu__ void kernel_1() {
   cudaq::qubit q, r;
   custom_h(q);
   custom_cnot(q, r);
 }
 
+// clang-format off
 // CHECK-LABEL:   func.func @__nvqpp__mlirgen__function_kernel_1._Z8kernel_1v() attributes {"cudaq-entrypoint", "cudaq-kernel", no_this} {
 // CHECK:           %[[VAL_0:.*]] = quake.alloca !quake.ref
 // CHECK:           %[[VAL_1:.*]] = quake.alloca !quake.ref
@@ -31,5 +33,5 @@ __qpu__ void kernel_1() {
 // CHECK:           return
 // CHECK:         }
 
-// CHECK:         cc.global constant @__nvqpp__mlirgen__function_custom_h_generator_1._Z20custom_h_generator_{{.*}}vectorId{{.*}}.rodata_{{[0-9]+}} (dense<[(0.70710678118654757,0.000000e+00), (0.70710678118654757,0.000000e+00), (0.70710678118654757,0.000000e+00), (-0.70710678118654757,0.000000e+00)]> : tensor<4xcomplex<f64>>) : !cc.array<complex<f64> x 4>
-// CHECK:         cc.global constant @__nvqpp__mlirgen__function_custom_cnot_generator_2._Z23custom_cnot_generator_{{.*}}vectorId{{.*}}.rodata_{{[0-9]+}} (dense<[(1.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (1.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (1.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (1.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00)]> : tensor<16xcomplex<f64>>) : !cc.array<complex<f64> x 16>
+// CHECK-DAG:     cc.global constant @__nvqpp__mlirgen__function_custom_h_generator_1._Z20custom_h_generator_{{.*}}vectorId{{.*}}.rodata_{{[0-9]+}} (dense<[(0.70710678118654757,0.000000e+00), (0.70710678118654757,0.000000e+00), (0.70710678118654757,0.000000e+00), (-0.70710678118654757,0.000000e+00)]> : tensor<4xcomplex<f64>>) : !cc.array<complex<f64> x 4>
+// CHECK-DAG:     cc.global constant @__nvqpp__mlirgen__function_custom_cnot_generator_2._Z23custom_cnot_generator_{{.*}}vectorId{{.*}}.rodata_{{[0-9]+}} (dense<[(1.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (1.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (1.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (1.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00)]> : tensor<16xcomplex<f64>>) : !cc.array<complex<f64> x 16>
diff --git a/test/Quake/lift_array.qke b/test/Quake/lift_array.qke
index 73a450d42c5..b7cbcec5cdf 100644
--- a/test/Quake/lift_array.qke
+++ b/test/Quake/lift_array.qke
@@ -6,7 +6,8 @@
 // the terms of the Apache License 2.0 which accompanies this distribution.   //
 // ========================================================================== //
 
-// RUN: cudaq-opt -lift-array-value %s | FileCheck %s
+// RUN: cudaq-opt -lift-array-alloc %s | FileCheck %s
+// RXN: cudaq-opt -lift-array-alloc -globalize-array-values %s | FileCheck --check-prefix=GLOBAL %s
 
 func.func @__nvqpp__mlirgen__function_test_complex_constant_array._Z27test_complex_constant_arrayv() attributes {"cudaq-entrypoint", "cudaq-kernel", no_this} {
   %cst = complex.constant [0.707106769 : f32, 0.000000e+00 : f32] : complex<f32>
@@ -26,12 +27,21 @@ func.func @__nvqpp__mlirgen__function_test_complex_constant_array._Z27test_compl
 }
 
 // CHECK-LABEL:   func.func @__nvqpp__mlirgen__function_test_complex_constant_array._Z27test_complex_constant_arrayv() attributes {"cudaq-entrypoint", "cudaq-kernel", no_this} {
-// CHECK:           %[[VAL_0:.*]] = cc.address_of @__nvqpp__mlirgen__function_test_complex_constant_array._Z27test_complex_constant_arrayv.rodata_{{[0-9]+}} : !cc.ptr<!cc.array<complex<f32> x 4>>
-// CHECK:           %[[VAL_1:.*]] = quake.alloca !quake.veq<2>
-// CHECK:           %[[VAL_2:.*]] = quake.init_state %[[VAL_1]], %[[VAL_0]] : (!quake.veq<2>, !cc.ptr<!cc.array<complex<f32> x 4>>) -> !quake.veq<2>
+// CHECK:           %[[VAL_0:.*]] = cc.const_array {{\[\[}}0.707106769 : f32, 0.000000e+00 : f32], [0.707106769 : f32, 0.000000e+00 : f32], [0.000000e+00 : f32, 0.000000e+00 : f32], [0.000000e+00 : f32, 0.000000e+00 : f32]] : !cc.array<complex<f32> x 4>
+// CHECK:           %[[VAL_1:.*]] = cc.alloca !cc.array<complex<f32> x 4>
+// CHECK:           cc.store %[[VAL_0]], %[[VAL_1]] : !cc.ptr<!cc.array<complex<f32> x 4>>
+// CHECK:           %[[VAL_2:.*]] = cc.cast %[[VAL_1]] : (!cc.ptr<!cc.array<complex<f32> x 4>>) -> !cc.ptr<complex<f32>>
+// CHECK:           %[[VAL_3:.*]] = quake.alloca !quake.veq<2>
+// CHECK:           %[[VAL_4:.*]] = quake.init_state %[[VAL_3]], %[[VAL_2]] : (!quake.veq<2>, !cc.ptr<complex<f32>>) -> !quake.veq<2>
 // CHECK:           return
 // CHECK:         }
 
+// GLOBAL-LABEL:   func.func @__nvqpp__mlirgen__function_test_complex_constant_array._Z27test_complex_constant_arrayv() attributes {"cudaq-entrypoint", "cudaq-kernel", no_this} {
+// GLOBAL:           %[[VAL_0:.*]] = cc.address_of @__nvqpp__mlirgen__function_test_complex_constant_array._Z27test_complex_constant_arrayv.rodata_{{[0-9]+}} : !cc.ptr<!cc.array<complex<f32> x 4>>
+// GLOBAL:           %[[VAL_1:.*]] = quake.alloca !quake.veq<2>
+// GLOBAL:           %[[VAL_2:.*]] = quake.init_state %[[VAL_1]], %[[VAL_0]] : (!quake.veq<2>, !cc.ptr<!cc.array<complex<f32> x 4>>) -> !quake.veq<2>
+// GLOBAL:           return
+// GLOBAL:         }
 
 func.func private @__nvqpp_vectorCopyCtor(!cc.ptr<i8>, i64, i64) -> !cc.ptr<i8>
 
@@ -58,13 +68,25 @@ func.func @__nvqpp__mlirgen__function_custom_h_generator_1._Z20custom_h_generato
 // CHECK-LABEL:   func.func @__nvqpp__mlirgen__function_custom_h_generator_1._Z20custom_h_generator_1v() -> !cc.stdvec<complex<f64>> attributes {"cudaq-entrypoint", "cudaq-kernel", no_this} {
 // CHECK:           %[[VAL_0:.*]] = arith.constant 16 : i64
 // CHECK:           %[[VAL_1:.*]] = arith.constant 4 : i64
-// CHECK:           %[[VAL_2:.*]] = cc.address_of @__nvqpp__mlirgen__function_custom_h_generator_1._Z20custom_h_generator_1v.rodata_{{[0-9]+}} : !cc.ptr<!cc.array<complex<f64> x 4>>
-// CHECK:           %[[VAL_3:.*]] = cc.cast %[[VAL_2]] : (!cc.ptr<!cc.array<complex<f64> x 4>>) -> !cc.ptr<i8>
-// CHECK:           %[[VAL_4:.*]] = call @__nvqpp_vectorCopyCtor(%[[VAL_3]], %[[VAL_1]], %[[VAL_0]]) : (!cc.ptr<i8>, i64, i64) -> !cc.ptr<i8>
-// CHECK:           %[[VAL_5:.*]] = cc.stdvec_init %[[VAL_4]], %[[VAL_1]] : (!cc.ptr<i8>, i64) -> !cc.stdvec<complex<f64>>
-// CHECK:           return %[[VAL_5]] : !cc.stdvec<complex<f64>>
+// CHECK:           %[[VAL_2:.*]] = cc.const_array {{\[\[}}0.70710678118654757, 0.000000e+00], [0.70710678118654757, 0.000000e+00], [0.70710678118654757, 0.000000e+00], [-0.70710678118654757, 0.000000e+00]] : !cc.array<complex<f64> x 4>
+// CHECK:           %[[VAL_3:.*]] = cc.alloca !cc.array<complex<f64> x 4>
+// CHECK:           cc.store %[[VAL_2]], %[[VAL_3]] : !cc.ptr<!cc.array<complex<f64> x 4>>
+// CHECK:           %[[VAL_4:.*]] = cc.cast %[[VAL_3]] : (!cc.ptr<!cc.array<complex<f64> x 4>>) -> !cc.ptr<i8>
+// CHECK:           %[[VAL_5:.*]] = call @__nvqpp_vectorCopyCtor(%[[VAL_4]], %[[VAL_1]], %[[VAL_0]]) : (!cc.ptr<i8>, i64, i64) -> !cc.ptr<i8>
+// CHECK:           %[[VAL_6:.*]] = cc.stdvec_init %[[VAL_5]], %[[VAL_1]] : (!cc.ptr<i8>, i64) -> !cc.stdvec<complex<f64>>
+// CHECK:           return %[[VAL_6]] : !cc.stdvec<complex<f64>>
 // CHECK:         }
 
+// GLOBAL-LABEL:   func.func @__nvqpp__mlirgen__function_custom_h_generator_1._Z20custom_h_generator_1v() -> !cc.stdvec<complex<f64>> attributes {"cudaq-entrypoint", "cudaq-kernel", no_this} {
+// GLOBAL:           %[[VAL_0:.*]] = arith.constant 16 : i64
+// GLOBAL:           %[[VAL_1:.*]] = arith.constant 4 : i64
+// GLOBAL:           %[[VAL_2:.*]] = cc.address_of @__nvqpp__mlirgen__function_custom_h_generator_1._Z20custom_h_generator_1v.rodata_{{[0-9]+}} : !cc.ptr<!cc.array<complex<f64> x 4>>
+// GLOBAL:           %[[VAL_3:.*]] = cc.cast %[[VAL_2]] : (!cc.ptr<!cc.array<complex<f64> x 4>>) -> !cc.ptr<i8>
+// GLOBAL:           %[[VAL_4:.*]] = call @__nvqpp_vectorCopyCtor(%[[VAL_3]], %[[VAL_1]], %[[VAL_0]]) : (!cc.ptr<i8>, i64, i64) -> !cc.ptr<i8>
+// GLOBAL:           %[[VAL_5:.*]] = cc.stdvec_init %[[VAL_4]], %[[VAL_1]] : (!cc.ptr<i8>, i64) -> !cc.stdvec<complex<f64>>
+// GLOBAL:           return %[[VAL_5]] : !cc.stdvec<complex<f64>>
+// GLOBAL:         }
+
 func.func @test2() -> !quake.veq<2> {
   %cst = arith.constant 9.000000e+00 : f64
   %cst_0 = arith.constant 6.000000e+00 : f64
@@ -85,12 +107,21 @@ func.func @test2() -> !quake.veq<2> {
 }
 
 // CHECK-LABEL:   func.func @test2() -> !quake.veq<2> {
-// CHECK:           %[[VAL_0:.*]] = cc.address_of @test2.rodata_{{[0-9]+}} : !cc.ptr<!cc.array<f64 x 4>>
-// CHECK:           %[[VAL_1:.*]] = quake.alloca !quake.veq<2>
-// CHECK:           %[[VAL_2:.*]] = quake.init_state %[[VAL_1]], %[[VAL_0]] : (!quake.veq<2>, !cc.ptr<!cc.array<f64 x 4>>) -> !quake.veq<2>
-// CHECK:           return %[[VAL_2]] : !quake.veq<2>
+// CHECK:           %[[VAL_0:.*]] = cc.const_array [1.000000e+00, 2.000000e+00, 6.000000e+00, 9.000000e+00] : !cc.array<f64 x 4>
+// CHECK:           %[[VAL_1:.*]] = cc.alloca !cc.array<f64 x 4>
+// CHECK:           cc.store %[[VAL_0]], %[[VAL_1]] : !cc.ptr<!cc.array<f64 x 4>>
+// CHECK:           %[[VAL_2:.*]] = quake.alloca !quake.veq<2>
+// CHECK:           %[[VAL_3:.*]] = quake.init_state %[[VAL_2]], %[[VAL_1]] : (!quake.veq<2>, !cc.ptr<!cc.array<f64 x 4>>) -> !quake.veq<2>
+// CHECK:           return %[[VAL_3]] : !quake.veq<2>
 // CHECK:         }
 
-// CHECK-DAG:     cc.global constant @__nvqpp__mlirgen__function_test_complex_constant_array._Z27test_complex_constant_arrayv.rodata_{{[0-9]+}} (dense<[(0.707106769,0.000000e+00), (0.707106769,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00)]> : tensor<4xcomplex<f32>>) : !cc.array<complex<f32> x 4>
-// CHECK-DAG:     cc.global constant @__nvqpp__mlirgen__function_custom_h_generator_1._Z20custom_h_generator_1v.rodata_{{[0-9]+}} (dense<[(0.70710678118654757,0.000000e+00), (0.70710678118654757,0.000000e+00), (0.70710678118654757,0.000000e+00), (-0.70710678118654757,0.000000e+00)]> : tensor<4xcomplex<f64>>) : !cc.array<complex<f64> x 4>
-// CHECK-DAG:     cc.global constant @test2.rodata_{{[0-9]+}} (dense<[1.000000e+00, 2.000000e+00, 6.000000e+00, 9.000000e+00]> : tensor<4xf64>) : !cc.array<f64 x 4>
+// GLOBAL-LABEL:   func.func @test2() -> !quake.veq<2> {
+// GLOBAL:           %[[VAL_0:.*]] = cc.address_of @test2.rodata_{{[0-9]+}} : !cc.ptr<!cc.array<f64 x 4>>
+// GLOBAL:           %[[VAL_1:.*]] = quake.alloca !quake.veq<2>
+// GLOBAL:           %[[VAL_2:.*]] = quake.init_state %[[VAL_1]], %[[VAL_0]] : (!quake.veq<2>, !cc.ptr<!cc.array<f64 x 4>>) -> !quake.veq<2>
+// GLOBAL:           return %[[VAL_2]] : !quake.veq<2>
+// GLOBAL:         }
+
+// GLOBAL-DAG:     cc.global constant @__nvqpp__mlirgen__function_test_complex_constant_array._Z27test_complex_constant_arrayv.rodata_{{[0-9]+}} (dense<[(0.707106769,0.000000e+00), (0.707106769,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00)]> : tensor<4xcomplex<f32>>) : !cc.array<complex<f32> x 4>
+// GLOBAL-DAG:     cc.global constant @__nvqpp__mlirgen__function_custom_h_generator_1._Z20custom_h_generator_1v.rodata_{{[0-9]+}} (dense<[(0.70710678118654757,0.000000e+00), (0.70710678118654757,0.000000e+00), (0.70710678118654757,0.000000e+00), (-0.70710678118654757,0.000000e+00)]> : tensor<4xcomplex<f64>>) : !cc.array<complex<f64> x 4>
+// GLOBAL-DAG:     cc.global constant @test2.rodata_{{[0-9]+}} (dense<[1.000000e+00, 2.000000e+00, 6.000000e+00, 9.000000e+00]> : tensor<4xf64>) : !cc.array<f64 x 4>
diff --git a/tools/nvqpp/nvq++.in b/tools/nvqpp/nvq++.in
index 57059a2ddb2..9b487b73e88 100644
--- a/tools/nvqpp/nvq++.in
+++ b/tools/nvqpp/nvq++.in
@@ -712,7 +712,7 @@ if ${ENABLE_AGGRESSIVE_EARLY_INLINE}; then
 fi
 if ${ENABLE_DEVICE_CODE_LOADERS}; then
 	RUN_OPT=true
-	OPT_PASSES=$(add_pass_to_pipeline "${OPT_PASSES}" "func.func(quake-add-metadata),const-prop-complex,lift-array-value,func.func(get-concrete-matrix),device-code-loader")
+	OPT_PASSES=$(add_pass_to_pipeline "${OPT_PASSES}" "func.func(quake-add-metadata,const-prop-complex,lift-array-alloc),globalize-array-values,func.func(get-concrete-matrix),device-code-loader")
 fi
 if ${ENABLE_LOWER_TO_CFG}; then
 	RUN_OPT=true