diff --git a/compiler/plugins/target/AMD-AIE/aie/AMDAIECreatePathFindFlows.cpp b/compiler/plugins/target/AMD-AIE/aie/AMDAIECreatePathFindFlows.cpp index d531ec619..89e7bfd8b 100644 --- a/compiler/plugins/target/AMD-AIE/aie/AMDAIECreatePathFindFlows.cpp +++ b/compiler/plugins/target/AMD-AIE/aie/AMDAIECreatePathFindFlows.cpp @@ -6,15 +6,12 @@ #include #include -#include #include #include "AIEDialect.h" #include "Passes.h" #include "iree-amd-aie/aie_runtime/iree_aie_router.h" #include "iree-amd-aie/aie_runtime/iree_aie_runtime.h" -#include "llvm/ADT/DenseMapInfo.h" -#include "llvm/Support/raw_os_ostream.h" #include "mlir/Pass/Pass.h" #include "mlir/Transforms/DialectConversion.h" diff --git a/compiler/plugins/target/AMD-AIE/aie/AMDAIEDmaToNpu.cpp b/compiler/plugins/target/AMD-AIE/aie/AMDAIEDmaToNpu.cpp index f15990681..5e73be939 100644 --- a/compiler/plugins/target/AMD-AIE/aie/AMDAIEDmaToNpu.cpp +++ b/compiler/plugins/target/AMD-AIE/aie/AMDAIEDmaToNpu.cpp @@ -10,7 +10,6 @@ #include "iree-amd-aie/aie_runtime/iree_aie_runtime.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMap.h" -#include "llvm/Support/Format.h" #include "mlir/IR/AsmState.h" #include "mlir/Pass/Pass.h" #include "mlir/Transforms/DialectConversion.h" @@ -512,6 +511,7 @@ struct AMDAIEDmaToNpuPass : mlir::OperationPass { instructions[2] = count; instructions[3] = instructions.size() * sizeof(uint32_t); + ArrayRef instsArrRef(instructions.data(), instructions.size()); device->setAttr( "npu_instructions", @@ -521,24 +521,24 @@ struct AMDAIEDmaToNpuPass : mlir::OperationPass { IntegerType::get(&getContext(), 32, IntegerType::Unsigned)), "npu_instructions", HeapAsmResourceBlob::allocateAndCopyInferAlign(instsArrRef))); - // The LX instructions for the entry point function are already generated by - // the pass hence we can safely delete the function as it is of no use to - // us. A reason to do this is that otherwise it is unceseccarily lowered to - // llvm where it can have a chance to crash in case the argument list is not - // lowerable for reasons such as memref's with dynamic offsets. - auto symName = dyn_cast_or_null(device->getAttr("sym_name")); + SmallVector seqOps; - device->walk([&](RuntimeSequenceOp seqOp) { - // if the deviceOp has a symbol name attached to it we look for the - // sequence op that partically matches that symbol, if not we collect all - // sequenceOps. - if (!symName || - symName.str().find(seqOp.getSymName()->str()) != std::string::npos) - seqOps.push_back(seqOp); - }); - // If exactly one entry point function is found we can delete it. For any - // other result we do not make any change. - if (seqOps.size() == 1) seqOps[0].erase(); + device->walk([&](RuntimeSequenceOp seqOp) { seqOps.push_back(seqOp); }); + + if (seqOps.size() > 1) { + device->emitOpError("has ") + << seqOps.size() + << " aiex.runtime_sequence ops. Expected no more than 1."; + signalPassFailure(); + } + + if (seqOps.size() == 1) { + auto seqOp = seqOps[0]; + StringRef name = seqOp.getSymName().value(); + device->setAttr("runtime_sequence_name", + StringAttr::get(&getContext(), name)); + seqOp.erase(); + } } }; diff --git a/compiler/plugins/target/AMD-AIE/aie/AMDAIEXToStandard.cpp b/compiler/plugins/target/AMD-AIE/aie/AMDAIEXToStandard.cpp deleted file mode 100644 index 4e0de9c09..000000000 --- a/compiler/plugins/target/AMD-AIE/aie/AMDAIEXToStandard.cpp +++ /dev/null @@ -1,89 +0,0 @@ -// Copyright 2024 The IREE Authors -// -// Licensed under the Apache License v2.0 with LLVM Exceptions. 
-// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - -#include "AIEXDialect.h" -#include "Passes.h" -#include "mlir/Pass/Pass.h" -#include "mlir/Transforms/DialectConversion.h" - -using namespace mlir; -using namespace xilinx; -using namespace xilinx::AIE; -using namespace xilinx::AIEX; - -#define DEBUG_TYPE "amdaiex-standard-lowering" - -template -struct AMDAIEXOpRemoval : OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - using OpAdaptor = typename MyAIEXOp::Adaptor; - ModuleOp &module; - - AMDAIEXOpRemoval(MLIRContext *context, ModuleOp &m, - PatternBenefit benefit = 1) - : OpConversionPattern(context, benefit), module(m) {} - - LogicalResult matchAndRewrite( - MyAIEXOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - Operation *Op = op.getOperation(); - rewriter.eraseOp(Op); - return success(); - } -}; - -namespace mlir::iree_compiler::AMDAIE { -struct AMDAIEXToStandardPass : mlir::OperationPass { - MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(AMDAIEXToStandardPass) - - AMDAIEXToStandardPass() - : mlir::OperationPass(resolveTypeID()) {} - - llvm::StringRef getArgument() const override { - return "amdaiex-standard-lowering"; - } - - llvm::StringRef getName() const override { return "AMDAIEXToStandardPass"; } - - std::unique_ptr clonePass() const override { - return std::make_unique( - *static_cast(this)); - } - - void getDependentDialects(::mlir::DialectRegistry ®istry) const override { - registry.insert(); - registry.insert(); - registry.insert(); - registry.insert(); - } - - void runOnOperation() override { - ModuleOp m = getOperation(); - ConversionTarget target(getContext()); - RewritePatternSet removepatterns(&getContext()); - removepatterns.add>(m.getContext(), m); - removepatterns.add>(m.getContext(), m); - removepatterns.add>(m.getContext(), m); - removepatterns.add>(m.getContext(), m); - removepatterns.add>(m.getContext(), m); - removepatterns.add>(m.getContext(), m); - removepatterns.add>(m.getContext(), m); - - if (failed(applyPartialConversion(m, target, std::move(removepatterns)))) - signalPassFailure(); - } -}; - -std::unique_ptr> createAMDAIEXToStandardPass() { - return std::make_unique(); -} - -void registerAMDAIEXToStandardPass() { - mlir::registerPass([]() -> std::unique_ptr { - return createAMDAIEXToStandardPass(); - }); -} -} // namespace mlir::iree_compiler::AMDAIE diff --git a/compiler/plugins/target/AMD-AIE/aie/CMakeLists.txt b/compiler/plugins/target/AMD-AIE/aie/CMakeLists.txt index db5c1e449..52244c48a 100644 --- a/compiler/plugins/target/AMD-AIE/aie/CMakeLists.txt +++ b/compiler/plugins/target/AMD-AIE/aie/CMakeLists.txt @@ -142,7 +142,6 @@ iree_cc_library( AMDAIELocalizeLocks.cpp AMDAIENormalizeAddressSpaces.cpp AMDAIEObjectFifoStatefulTransform.cpp - AMDAIEXToStandard.cpp DEPS iree-amd-aie::aie_runtime::iree_aie_runtime_static ::AIEDialectIR diff --git a/compiler/plugins/target/AMD-AIE/aie/Passes.h b/compiler/plugins/target/AMD-AIE/aie/Passes.h index 347c32757..bf9e64477 100644 --- a/compiler/plugins/target/AMD-AIE/aie/Passes.h +++ b/compiler/plugins/target/AMD-AIE/aie/Passes.h @@ -34,7 +34,6 @@ createAMDAIEPathfinderPass(); std::unique_ptr> createAMDAIECoreToStandardPass(); std::unique_ptr> createAMDAIEDmaToNpuPass(); -std::unique_ptr> createAMDAIEXToStandardPass(); void registerAMDAIEAssignBufferAddressesBasic(); void registerAMDAIEAssignBufferDescriptorIDs(); @@ -44,9 +43,8 @@ void registerAMDAIELocalizeLocks(); void 
registerAMDAIENormalizeAddressSpaces(); void registerAMDAIEObjectFifoStatefulTransform(); void registerAMDAIERoutePathfinderFlows(); - void registerAMDAIEDmaToNpu(); -void registerAMDAIEXToStandardPass(); + } // namespace mlir::iree_compiler::AMDAIE #endif // AMDAIE_PASSES_H_ diff --git a/compiler/plugins/target/AMD-AIE/aie/test/aiert_insts.mlir b/compiler/plugins/target/AMD-AIE/aie/test/aiert_insts.mlir index cb82fcd22..adaff90b4 100644 --- a/compiler/plugins/target/AMD-AIE/aie/test/aiert_insts.mlir +++ b/compiler/plugins/target/AMD-AIE/aie/test/aiert_insts.mlir @@ -5,7 +5,7 @@ // CHECK: memref.global "public" @of_fromMem : memref<32xi32> // CHECK: aie.shim_dma_allocation @of_fromMem(MM2S, 0, 0) // CHECK: aie.shim_dma_allocation @of_toMem(S2MM, 0, 0) -// CHECK: } {npu_instructions = dense_resource : tensor<64xui32>} +// CHECK: } {npu_instructions = dense_resource : tensor<64xui32>, runtime_sequence_name = "sequence"} // CHECK: {-# // CHECK: dialect_resources: { diff --git a/compiler/plugins/target/AMD-AIE/aie/test/aiex_standard_lowering.mlir b/compiler/plugins/target/AMD-AIE/aie/test/aiex_standard_lowering.mlir deleted file mode 100644 index 015aea837..000000000 --- a/compiler/plugins/target/AMD-AIE/aie/test/aiex_standard_lowering.mlir +++ /dev/null @@ -1,22 +0,0 @@ - -// RUN: iree-opt --amdaiex-standard-lowering %s | FileCheck %s - -// CHECK-LABEL: aie.device(npu1_4col) { -// CHECK: memref.global "public" @toMem : memref<16xi32> -// CHECK: func.func @dma_and_wait(%[[ARG0:.*]]: memref<16xi32>, %[[ARG1:.*]]: memref<16xi32>) { -// CHECK: return -// CHECK: } -// CHECK: aie.shim_dma_allocation @toMem(MM2S, 1, 1) -// CHECK: } - -module { - aie.device(npu1_4col) { - memref.global "public" @toMem : memref<16xi32> - func.func @dma_and_wait(%arg0: memref<16xi32>, %arg1: memref<16xi32>) { - aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 16, 16][0, 0, 64, 1]) { metadata = @toMem, id = 1 : i64 } : memref<16xi32> - aiex.npu.dma_wait {symbol = @toMem} - return - } - aie.shim_dma_allocation @toMem (MM2S, 1, 1) - } -} diff --git a/compiler/plugins/target/AMD-AIE/aie/test/dma_to_npu.mlir b/compiler/plugins/target/AMD-AIE/aie/test/dma_to_npu.mlir index f4cfd5647..3a78c854c 100644 --- a/compiler/plugins/target/AMD-AIE/aie/test/dma_to_npu.mlir +++ b/compiler/plugins/target/AMD-AIE/aie/test/dma_to_npu.mlir @@ -77,26 +77,3 @@ module { } {sym_name = "explicit_sym_name_0"} } -// ----- - -// CHECK-LABEL: aie.device(npu1_4col) { -// CHECK: memref.global "public" @toMem : memref<16xi32> -// CHECK: func.func @pretend_microkernel -// CHECK: aiex.runtime_sequence @explicit_sym_name -// CHECK: aie.shim_dma_allocation @toMem(MM2S, 1, 1) - -module { - aie.device(npu1_4col) { - memref.global "public" @toMem : memref<16xi32> - func.func @pretend_microkernel(%arg0: memref<16xi32>, %arg1: memref<16xi32>) { - return - } - - aiex.runtime_sequence @explicit_sym_name(%arg0: memref<16xi32>, %arg1: memref<16xi32>) { - aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 16, 16][0, 0, 64, 1]) { metadata = @toMem, id = 1 : i64 } : memref<16xi32> - aiex.npu.dma_wait {symbol = @toMem} - } - aie.shim_dma_allocation @toMem (MM2S, 1, 1) - } {sym_name = "wrong_sym_name"} -} - diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/PluginRegistration.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/PluginRegistration.cpp index 2ef29d294..50d72b077 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/PluginRegistration.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/PluginRegistration.cpp @@ -34,7 +34,6 @@ struct 
AMDAIESession AMDAIE::registerAMDAIEObjectFifoStatefulTransform(); AMDAIE::registerAMDAIERoutePathfinderFlows(); AMDAIE::registerAMDAIEDmaToNpu(); - AMDAIE::registerAMDAIEXToStandardPass(); AMDAIE::registerAIRConversionPasses(); AMDAIE::registerAIRTransformPasses(); aievec::registerConvertAIEVecToLLVMPass(); diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/AIETarget.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/AIETarget.cpp index 8f764732a..515cc5be1 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/AIETarget.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/AIETarget.cpp @@ -16,7 +16,6 @@ #include "aievec/XLLVMDialect.h" #include "air/Dialect/AIR/AIRDialect.h" #include "air/Dialect/AIRRt/AIRRtDialect.h" -#include "iree-amd-aie/IR/AMDAIEAttrs.h" #include "iree-amd-aie/IR/AMDAIEDialect.h" #include "iree-amd-aie/Transforms/Passes.h" #include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenDialect.h" @@ -27,9 +26,12 @@ #include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/Path.h" #include "llvm/Support/SourceMgr.h" -#include "llvm/Support/ToolOutputFile.h" +#include "mlir/Conversion/ArithToLLVM/ArithToLLVM.h" +#include "mlir/Conversion/ControlFlowToLLVM/ControlFlowToLLVM.h" #include "mlir/Conversion/FuncToLLVM/ConvertFuncToLLVM.h" -#include "mlir/Conversion/Passes.h" +#include "mlir/Conversion/IndexToLLVM/IndexToLLVM.h" +#include "mlir/Conversion/MathToLLVM/MathToLLVM.h" +#include "mlir/Conversion/MemRefToLLVM/MemRefToLLVM.h" #include "mlir/Dialect/DLTI/DLTI.h" #include "mlir/Dialect/EmitC/IR/EmitC.h" #include "mlir/Dialect/Func/Extensions/AllExtensions.h" @@ -75,42 +77,27 @@ static llvm::cl::opt clEnableAMDAIEUkernels( "unprefixed microkernels to enable, e.g. `matmul`."), llvm::cl::init("none")); -// Utility to find aie.device Op corresponding to the export Op. -// For example, we have -// hal.executable.variant { -// hal.executable.export symbol1 -// hal.executable.export symbol2 -// module { -// aie.device { -// ... -// aiex.runtime_sequence symbol1 -// } -// aie.device { -// ... -// aiex.runtime_sequence symbol2 -// } -// } -// } -// Hence we need to find the aiex.runtime_sequence that coresponds to the export -// op symbol and return its parent aie.device Op. This is what we will pass to -// the `aie2xclbin` tool for artifact generation per entry point. -static xilinx::AIE::DeviceOp getDeviceOpFromEntryPoint(ModuleOp moduleOp, - StringRef exportOpName) { +static xilinx::AIE::DeviceOp getDeviceOpWithName(ModuleOp moduleOp, + StringRef targetName) { xilinx::AIE::DeviceOp deviceOp; - moduleOp.walk([&](xilinx::AIEX::RuntimeSequenceOp sequenceOp) { - if (sequenceOp.getSymName() == exportOpName) { - deviceOp = - dyn_cast_or_null(sequenceOp->getParentOp()); - return WalkResult::interrupt(); - } - return WalkResult::advance(); + uint32_t nDeviceOpsVisited = 0; + moduleOp.walk([&](xilinx::AIE::DeviceOp d) { + ++nDeviceOpsVisited; + // This attribute should've been set in the dma-to-npu pass. 
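+    // (AMDAIEDmaToNpuPass records the symbol name of the aiex.runtime_sequence
+    // op it erases as the "runtime_sequence_name" attribute on aie.device; see
+    // AMDAIEDmaToNpu.cpp earlier in this patch.)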
+ auto maybeName = d->getAttrOfType("runtime_sequence_name"); + if (!maybeName) return WalkResult::advance(); + auto name = maybeName.getValue(); + if (name != targetName) return WalkResult::advance(); + deviceOp = d; + return WalkResult::interrupt(); }); - if (!deviceOp) { - moduleOp.emitError() - << "failed to find aie.device containing func.func with symbol " - << exportOpName; - } + + if (!deviceOp) + moduleOp.emitError() << "visited " << nDeviceOpsVisited + << " aie.device ops, and failed to find one with name " + << targetName; + return deviceOp; } @@ -291,7 +278,7 @@ LogicalResult AIETargetBackend::serializeExecutable( } StringRef exportOpName = exportOp.getSymName(); - deviceOps.push_back(getDeviceOpFromEntryPoint(moduleOp, exportOpName)); + deviceOps.push_back(getDeviceOpWithName(moduleOp, exportOpName)); // The xclbin kernel name, appended with instance name suffix (`:MLIRAIEV1`, // 10 chars) is required by the xclbinutil to have a length smaller or equal @@ -334,21 +321,8 @@ LogicalResult AIETargetBackend::serializeExecutable( uint64_t ordinal = entryPointOrdinals.at(entryPointNames[i]); entryPointNamesFb[ordinal] = entryPointNames[i]; - - SmallString<128> inputMlirPath(workDir); - llvm::sys::path::append(inputMlirPath, - entryPointNamesFb[ordinal] + ".aiecc.mlir"); - std::string errorMessage; - { - auto inputMlirOut = openOutputFile(inputMlirPath, &errorMessage); - if (!inputMlirOut) { - return moduleOp.emitOpError() - << "Failed to write MLIR: " << errorMessage; - } - deviceOps[i].print(inputMlirOut->os(), OpPrintingFlags().useLocalScope()); - inputMlirOut->keep(); - } + // we add the entry point to the working directory for xclbin artifacts if // there are multiple entry points so that we dont overwrite the xclbinutil // generated artifacts e.g kernels.json, for different entry points which @@ -375,11 +349,22 @@ LogicalResult AIETargetBackend::serializeExecutable( ParserConfig pcfg(variantOp->getContext()); llvm::SourceMgr srcMgr; - OwningOpRef owningModuleOp = - parseSourceFile(inputMlirPath, srcMgr, pcfg); + // Move the DeviceOp into its own ModuleOp if there are multiple DeviceOps. + // This is required because the core-to-standard pass moves all ops inside a + // DeviceOp into the parent ModuleOp, so if they're not separated, core code + // from different DeviceOps gets incorrectly concatenated. There's probably a + // simpler workaround, to be reviewed as we continue to remove layers of crust.
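+    // Only a clone of the DeviceOp is placed in the fresh ModuleOp, and
+    // deviceOps[i] is repointed at that clone, so the aie2xclbin call below
+    // (unified object, ELF, CDO and XCLBin generation) works on the isolated copy.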
+ if (deviceOps.size() > 1) { + OpBuilder opBuilder(deviceOps[i].getContext()); + auto moduleWithOneDevice = + opBuilder.create(deviceOps[i].getLoc()); + opBuilder.setInsertionPointToStart(moduleWithOneDevice.getBody()); + Operation *repl = opBuilder.clone(*deviceOps[i].getOperation()); + deviceOps[i] = cast(repl); + } if (failed(aie2xclbin( - /*ctx=*/variantOp->getContext(), /*moduleOp=*/*owningModuleOp, + /*ctx=*/variantOp->getContext(), deviceOps[i], /*outputNPU=*/npuInstPath.str().str(), /*outputXCLBin=*/xclbinPath.str().str(), /*printIRBeforeAll=*/options.aie2xclbinPrintIrBeforeAll, diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/AMDAIETargetBCF.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/AMDAIETargetBCF.cpp index 48fc13527..33a1567dc 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/AMDAIETargetBCF.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/AMDAIETargetBCF.cpp @@ -7,7 +7,6 @@ #include "AMDAIETargets.h" #include "aie/AIEDialect.h" #include "llvm/ADT/StringExtras.h" -#include "llvm/IR/Module.h" using namespace mlir; using namespace xilinx; @@ -17,15 +16,11 @@ std::string utohexstr(uint32_t u) { return "0x" + llvm::utohexstr(u); } namespace mlir::iree_compiler::AMDAIE { -LogicalResult AIETranslateToBCF(ModuleOp module, raw_ostream &output, +LogicalResult AIETranslateToBCF(DeviceOp deviceOp, raw_ostream &output, int tileCol, int tileRow) { DenseMap tiles; DenseMap> buffers; - if (module.getOps().empty()) - module.emitOpError("expected aie.device operation at toplevel"); - DeviceOp deviceOp = *(module.getOps().begin()); - collectTiles(deviceOp, tiles); collectBuffers(deviceOp, buffers); diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/AMDAIETargetCDODirect.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/AMDAIETargetCDODirect.cpp index 7678f848a..29216d069 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/AMDAIETargetCDODirect.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/AMDAIETargetCDODirect.cpp @@ -5,7 +5,7 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception #include -#include // uint +#include #include #include #include @@ -17,12 +17,9 @@ #include "iree-amd-aie/aie_runtime/iree_aie_runtime.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/Twine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "mlir/IR/Block.h" -#include "mlir/IR/BuiltinOps.h" -#include "mlir/IR/BuiltinTypeInterfaces.h" #include "mlir/Support/LLVM.h" #include "mlir/Support/LogicalResult.h" @@ -341,14 +338,11 @@ LogicalResult generateCDOBinariesSeparately( return success(); } -LogicalResult AIETranslateToCDODirect(ModuleOp m, llvm::StringRef workDirPath, +LogicalResult AIETranslateToCDODirect(xilinx::AIE::DeviceOp device, + llvm::StringRef workDirPath, bool bigEndian, bool emitUnified, bool cdoDebug, bool aieSim, bool enableCores) { - auto devOps = m.getOps(); - assert(llvm::range_size(devOps) == 1 && - "only exactly 1 device op supported."); - DeviceOp device = *devOps.begin(); AMDAIEDeviceModel deviceModel = getDeviceModel(device.getDevice()); byte_ordering endianness = bigEndian ? 
byte_ordering::Big_Endian : byte_ordering::Little_Endian; diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/AMDAIETargetLdScript.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/AMDAIETargetLdScript.cpp index a106f1e53..5cbebf39e 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/AMDAIETargetLdScript.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/AMDAIETargetLdScript.cpp @@ -50,14 +50,10 @@ static void writeLDScriptMap(raw_ostream &output, BufferOp buf, int offset) { // .bss : { *(.bss) } > data // } LogicalResult mlir::iree_compiler::AMDAIE::AIETranslateToLdScript( - ModuleOp module, raw_ostream &output, int tileCol, int tileRow) { + DeviceOp deviceOp, raw_ostream &output, int tileCol, int tileRow) { DenseMap tiles; DenseMap> buffers; - if (module.getOps().empty()) { - module.emitOpError("expected AIE.device operation at toplevel"); - } - DeviceOp deviceOp = *(module.getOps().begin()); collectTiles(deviceOp, tiles); ::collectBuffers(deviceOp, buffers); diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/AMDAIETargets.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/AMDAIETargets.h index 5052fadd8..90a16e72a 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/AMDAIETargets.h +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/AMDAIETargets.h @@ -17,16 +17,16 @@ namespace mlir::iree_compiler::AMDAIE { std::vector AIETranslateToNPU(mlir::ModuleOp); -mlir::LogicalResult AIETranslateToLdScript(mlir::ModuleOp module, +mlir::LogicalResult AIETranslateToLdScript(xilinx::AIE::DeviceOp, llvm::raw_ostream &output, int tileCol, int tileRow); -mlir::LogicalResult AIETranslateToBCF(mlir::ModuleOp module, +mlir::LogicalResult AIETranslateToBCF(xilinx::AIE::DeviceOp, llvm::raw_ostream &output, int tileCol, int tileRow); mlir::LogicalResult AIETranslateToCDODirect( - mlir::ModuleOp m, llvm::StringRef workDirPath, bool bigEndian = false, + xilinx::AIE::DeviceOp, llvm::StringRef workDirPath, bool bigEndian = false, bool emitUnified = false, bool cdoDebug = false, bool aieSim = false, bool enableCores = true); } // namespace mlir::iree_compiler::AMDAIE diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/XCLBinGen.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/XCLBinGen.cpp index f2bdc6a33..b7c2b1578 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/XCLBinGen.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/XCLBinGen.cpp @@ -18,6 +18,8 @@ #include "aievec/Passes.h" #include "iree-amd-aie/Transforms/Passes.h" #include "iree/compiler/Utils/ToolUtils.h" +#include "llvm/ADT/SmallString.h" +#include "llvm/ADT/StringRef.h" #include "llvm/Support/Debug.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/JSON.h" @@ -26,6 +28,7 @@ #include "llvm/Support/Program.h" #include "llvm/Support/ToolOutputFile.h" #include "mlir/IR/AsmState.h" +#include "mlir/IR/BuiltinOps.h" #include "mlir/IR/MLIRContext.h" #include "mlir/Pass/PassManager.h" #include "mlir/Support/FileUtilities.h" @@ -356,12 +359,12 @@ static std::optional runTool( << "\n"; return {}; } - auto outputFromFile = maybeOutputFromFile.value(); + const std::string &outputFromFile = maybeOutputFromFile.value(); if (verbose) { - auto totalTime = std::chrono::duration_cast>( - stats.TotalTime) - .count(); + float totalTime = std::chrono::duration_cast>( + stats.TotalTime) + .count(); std::string exitStatusStr = result == 0 ? 
"Succeeded" : "Failed"; llvm::outs() << "\n" << exitStatusStr << " in totalTime " << totalTime @@ -432,7 +435,7 @@ static LogicalResult assembleFileUsingPeano( args.emplace_back("--target=aie2-none-unknown-elf"); std::vector peanoArgs = makePeanoOptArgs(); args.reserve(args.size() + peanoArgs.size()); - for (const auto &item : peanoArgs) { + for (const std::string &item : peanoArgs) { args.emplace_back("-mllvm"); args.emplace_back(item); } @@ -498,19 +501,13 @@ static_assert(std::is_same_v vitisDir, const std::string &targetArch, bool verbose, - Path peanoDir, const std::optional &ukernel) { - auto deviceOps = moduleOp.getOps(); - if (!llvm::hasSingleElement(deviceOps)) - return moduleOp.emitOpError("expected a single device op"); - - AIE::DeviceOp deviceOp = *deviceOps.begin(); + AIE::DeviceOp deviceOp, const std::string &objFile, Path tempDir, + bool useChess, std::optional vitisDir, const std::string &targetArch, + bool verbose, Path peanoDir, const std::optional &ukernel) { auto tileOps = deviceOp.getOps(); - std::string errorMessage; - for (auto tileOp : tileOps) { + for (AIE::TileOp tileOp : tileOps) { int col = tileOp.getCol(); int row = tileOp.getRow(); auto coreOp = AIE::getCoreOp(tileOp); @@ -580,7 +577,7 @@ static LogicalResult generateCoreElfFiles( } if (failed(mlir::iree_compiler::AMDAIE::AIETranslateToBCF( - moduleOp, bcfOutput->os(), col, row))) { + deviceOp, bcfOutput->os(), col, row))) { llvm::errs() << "Failed to generate BCF"; return failure(); } @@ -614,7 +611,7 @@ static LogicalResult generateCoreElfFiles( return failure(); } if (failed(mlir::iree_compiler::AMDAIE::AIETranslateToLdScript( - moduleOp, ldscriptOutput->os(), col, row))) { + deviceOp, ldscriptOutput->os(), col, row))) { llvm::errs() << "failed to generate ld script for core (" << col << "," << row << ")"; return failure(); @@ -646,24 +643,28 @@ static LogicalResult generateCoreElfFiles( return success(); } -static LogicalResult generateCDO(MLIRContext *context, ModuleOp moduleOp, +static LogicalResult generateCDO(MLIRContext *context, AIE::DeviceOp deviceOp, bool printIRBeforeAll, bool printIRAfterAll, bool printIRModuleScope, bool timing, const Path &tempDir) { - ModuleOp copy = moduleOp.clone(); + + auto copy = cast(deviceOp.getParentOp()->clone()); + deviceOp = *copy.getOps().begin(); + std::string errorMessage; - PassManager passManager(context, ModuleOp::getOperationName()); + PassManager passManager(context, AIE::DeviceOp::getOperationName()); applyConfigToPassManager(passManager, printIRBeforeAll, printIRAfterAll, printIRModuleScope, timing); - passManager.addNestedPass( + passManager.addPass( mlir::iree_compiler::AMDAIE::createAMDAIEPathfinderPass()); - if (failed(passManager.run(copy))) { + + if (failed(passManager.run(deviceOp))) { llvm::errs() << "failed to run passes to prepare for XCLBin generation"; return failure(); } if (failed(mlir::iree_compiler::AMDAIE::AIETranslateToCDODirect( - copy, tempDir.string()))) { + deviceOp, tempDir.string()))) { llvm::errs() << "failed to emit CDO"; return failure(); } @@ -1029,17 +1030,22 @@ struct RemoveAlignment2FromLLVMLoadPass } // namespace static LogicalResult generateUnifiedObject( - MLIRContext *context, ModuleOp moduleOp, const std::string &outputFile, + MLIRContext *context, AIE::DeviceOp deviceOp, const std::string &outputFile, bool printIRBeforeAll, bool printIRAfterAll, bool printIRModuleScope, bool timing, bool useChess, bool verbose, Path tempDir, std::optional vitisDir, const std::string &targetArch, Path peanoDir) { - PassManager pm(context, 
moduleOp.getOperationName()); + assert(deviceOp->getParentOp() && isa(deviceOp->getParentOp()) && + "DeviceOp must be in a module parent"); + + ModuleOp moduleOpCopy = cast(deviceOp->getParentOp()).clone(); + + PassManager pm(context, moduleOpCopy.getOperationName()); applyConfigToPassManager(pm, printIRBeforeAll, printIRAfterAll, printIRModuleScope, timing); pm.addPass(mlir::iree_compiler::AMDAIE::createAMDAIECoreToStandardPass()); - pm.addPass(mlir::iree_compiler::AMDAIE::createAMDAIEXToStandardPass()); + // Convert specific vector dialect ops (like vector.contract) to the AIEVec // dialect mlir::iree_compiler::aievec::buildConvertVectorToAIEVec(pm); @@ -1052,14 +1058,15 @@ static LogicalResult generateUnifiedObject( llvm::outs() << "\n"; } - ModuleOp copy = moduleOp.clone(); - if (failed(pm.run(copy))) - return moduleOp.emitOpError("Failed to lower to LLVM"); + if (failed(pm.run(moduleOpCopy))) + return deviceOp.emitOpError("Failed to lower to LLVM"); llvm::LLVMContext llvmContext; - auto llvmModule = translateModuleToLLVMIR(copy, llvmContext); - if (!llvmModule) - return moduleOp.emitOpError("Failed to translate module to LLVMIR"); + auto llvmModule = translateModuleToLLVMIR(moduleOpCopy, llvmContext); + if (!llvmModule) { + return deviceOp.emitOpError("Failed to translate module to LLVMIR"); + } + std::string inputLLStr; { llvm::raw_string_ostream rso(inputLLStr); @@ -1081,7 +1088,9 @@ static LogicalResult generateUnifiedObject( /*workDir=*/tempDir, /*vitisDir=*/*maybeVitisDir, /*verbose=*/verbose); - if (failed(chessIntrinsicsObjFile)) return failure(); + if (failed(chessIntrinsicsObjFile)) { + return failure(); + } } else { Path LLVMIRFile = tempDir / "input.ll"; if (auto maybeErr = dumpStrToDisk(inputLLStr, LLVMIRFile.string()); @@ -1116,12 +1125,37 @@ static LogicalResult generateUnifiedObject( return failure(); } } - copy->erase(); + + moduleOpCopy->erase(); return success(); } +FailureOr> getNpuInstructions(AIE::DeviceOp deviceOp) { + MLIRContext *ctx = deviceOp.getContext(); + mlir::Attribute maybeNpuInstructions = deviceOp->getAttr("npu_instructions"); + if (!maybeNpuInstructions) { + return emitError(UnknownLoc::get(ctx), + "Expected npu_instructions attribute on aie.device"); + } + auto npuInstructions = + dyn_cast(maybeNpuInstructions); + if (!npuInstructions) { + return emitError( + UnknownLoc::get(ctx), + "Failed to cast npu_instructions to DenseUI32ResourceElementsAttr"); + } + std::optional> maybeArrayRef = + npuInstructions.tryGetAsArrayRef(); + if (!maybeArrayRef.has_value()) { + return emitError( + UnknownLoc::get(ctx), + "Failed getting values for npu_instructions in tryGetAsArrayRef"); + } + return maybeArrayRef.value(); +} + LogicalResult aie2xclbin( - MLIRContext *ctx, ModuleOp moduleOp, const std::string &outputNPU, + MLIRContext *ctx, AIE::DeviceOp deviceOp, const std::string &outputNPU, const std::string &outputXCLBin, bool printIRBeforeAll, bool printIRAfterAll, bool printIRModuleScope, bool timing, const std::string &tempDir, bool useChess, bool verbose, @@ -1131,22 +1165,19 @@ LogicalResult aie2xclbin( const std::string &amdAIEInstallDir, const std::optional &InputXCLBin, const std::optional &ukernel) { - PassManager pm(ctx, mlir::ModuleOp::getOperationName()); + PassManager pm(ctx, AIE::DeviceOp::getOperationName()); applyConfigToPassManager(pm, printIRBeforeAll, printIRAfterAll, printIRModuleScope, timing); - // generateNPUInstructions - pm.addNestedPass( - mlir::iree_compiler::AMDAIE::createAMDAIEDmaToNpuPass()); - if (failed(pm.run(moduleOp))) - return 
moduleOp.emitOpError(": NPU Instruction pipeline failed"); - - std::optional> npuInstructions = - cast( - (*moduleOp.getOps().begin()) - ->getAttr("npu_instructions")) - .tryGetAsArrayRef(); - if (!npuInstructions) - return moduleOp.emitOpError(": No NPU instructions in device op"); + if (failed(pm.run(deviceOp))) + return deviceOp.emitOpError(": NPU Instruction pipeline failed"); + + FailureOr> maybeNpuInstructions = + getNpuInstructions(deviceOp); + if (failed(maybeNpuInstructions)) { + assert(false && "Failed to get NPU instructions"); + return failure(); + } + ArrayRef npuInstructions = maybeNpuInstructions.value(); std::string errorMessage; auto output = openOutputFile(outputNPU, &errorMessage); @@ -1155,29 +1186,29 @@ LogicalResult aie2xclbin( << errorMessage; return failure(); } - for (auto w : *npuInstructions) output->os() << llvm::format("%08X\n", w); + for (uint32_t w : npuInstructions) output->os() << llvm::format("%08X\n", w); output->keep(); Path unifiedObj = Path(tempDir) / "input.o"; if (failed(generateUnifiedObject( - ctx, moduleOp, unifiedObj.string(), printIRBeforeAll, printIRAfterAll, + ctx, deviceOp, unifiedObj.string(), printIRBeforeAll, printIRAfterAll, printIRModuleScope, timing, useChess, verbose, tempDir, vitisDir, targetArch, peanoDir))) - return moduleOp.emitOpError("Failed to generate unified object"); + return deviceOp.emitOpError("Failed to generate unified object"); - if (failed(generateCoreElfFiles(moduleOp, unifiedObj.string(), tempDir, + if (failed(generateCoreElfFiles(deviceOp, unifiedObj.string(), tempDir, useChess, vitisDir, targetArch, verbose, peanoDir, ukernel))) - return moduleOp.emitOpError("Failed to generate core ELF file(s)"); + return deviceOp.emitOpError("Failed to generate core ELF file(s)"); - if (failed(generateCDO(ctx, moduleOp, printIRBeforeAll, printIRAfterAll, + if (failed(generateCDO(ctx, deviceOp, printIRBeforeAll, printIRAfterAll, printIRModuleScope, timing, tempDir))) - return moduleOp.emitOpError("Failed to generate CDO"); + return deviceOp.emitOpError("Failed to generate CDO"); if (failed(generateXCLBin(outputXCLBin, tempDir, xclBinKernelID, xclBinKernelName, xclBinInstanceName, amdAIEInstallDir, verbose, InputXCLBin))) - return moduleOp.emitOpError("Failed to generate XCLBin"); + return deviceOp.emitOpError("Failed to generate XCLBin"); return success(); } diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/XCLBinGen.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/XCLBinGen.h index 705e97d4f..290064170 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/XCLBinGen.h +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/XCLBinGen.h @@ -7,17 +7,15 @@ #include -#include "llvm/ADT/SmallString.h" -#include "llvm/ADT/StringRef.h" -#include "mlir/IR/BuiltinOps.h" +#include "aie/AIEDialect.h" #include "mlir/IR/MLIRContext.h" #include "mlir/Support/LogicalResult.h" mlir::LogicalResult aie2xclbin( - mlir::MLIRContext *ctx, mlir::ModuleOp moduleOp, - const std::string &outputNPU, const std::string &outputXCLBin, - bool printIRBeforeAll, bool printIRAfterAll, bool printIRModuleScope, - bool timing, const std::string &tempDir, bool useChess, bool verbose, + mlir::MLIRContext *ctx, xilinx::AIE::DeviceOp, const std::string &outputNPU, + const std::string &outputXCLBin, bool printIRBeforeAll, + bool printIRAfterAll, bool printIRModuleScope, bool timing, + const std::string &tempDir, bool useChess, bool verbose, const std::optional &vitisDir, const std::string &targetArch, const std::string &peanoDir, const 
std::string &xclBinKernelID, const std::string &xclBinKernelName, const std::string &xclBinInstanceName, diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/aie_cdo_gen_test.cxx b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/aie_cdo_gen_test.cxx index fcc0d39d7..7ea4b8269 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/aie_cdo_gen_test.cxx +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/aie_cdo_gen_test.cxx @@ -10,10 +10,7 @@ #include "aie/AIEDialect.h" #include "aie/AIEXDialect.h" #include "iree-amd-aie/Target/AMDAIETargets.h" -#include "iree-amd-aie/Target/XCLBinGen.h" #include "llvm/Support/Debug.h" -#include "llvm/Support/FileSystem.h" -#include "llvm/Support/Path.h" #include "mlir/Dialect/Vector/IR/VectorOps.h" #include "mlir/Parser/Parser.h" @@ -43,11 +40,19 @@ int main(int argc, char **argv) { mlir::ParserConfig parserConfig(&context); auto moduleOp = llvm::cast( mlir::parseSourceFile(mlirAbsPath, parserConfig).release()); + + auto deviceOps = moduleOp.getOps(); + auto nDeviceOps = std::distance(deviceOps.begin(), deviceOps.end()); + if (nDeviceOps != 1){ + std::cerr << "Error: Expected exactly one xilinx.aie.device op\n"; + return 1; + } + auto deviceOp = *deviceOps.begin(); llvm::DebugFlag = true; const char *debugTypes[3] = {"aie-generate-cdo", "iree-aie-runtime", "iree-aie-cdo-emitter"}; llvm::setCurrentDebugTypes(debugTypes, 3); - auto status = AIETranslateToCDODirect(moduleOp, workDir, false, false, false); + auto status = AIETranslateToCDODirect(deviceOp, workDir, false, false, false); std::vector diagnostics; ScopedDiagnosticHandler handler(moduleOp.getContext(), [&](Diagnostic &d) { llvm::raw_string_ostream(diagnostics.emplace_back()) @@ -59,7 +64,7 @@ int main(int argc, char **argv) { llvm::DebugFlag = false; llvm::setCurrentDebugType("aie-cdo-driver-debug"); - status = AIETranslateToCDODirect(moduleOp, workDir, false, false, true); + status = AIETranslateToCDODirect(deviceOp, workDir, false, false, true); if (failed(status)) for (const auto &diagnostic : diagnostics) std::cerr << diagnostic << "\n"; } diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp index b4a0e502d..a1bd27d91 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp @@ -621,8 +621,6 @@ void addAMDAIEObjectFifoLoweringPasses(OpPassManager &passManager) { passManager.addPass(createAMDAIELowerToAIEPass()); passManager.addPass(createCanonicalizerPass()); - passManager.addPass(createConvertLinalgToLoopsPass()); - // Now lower using the AIE passes from MLIR-AIE. 
addMLIRAIELoweringPasses(passManager); } @@ -788,18 +786,28 @@ void addMLIRAIRLoweringPasses(OpPassManager &passManager, AMDAIEDevice device) { } void addMLIRAIELoweringPasses(OpPassManager &passManager) { + { + OpPassManager &devicePM = passManager.nest(); + devicePM.addPass(createAMDAIEObjectFifoStatefulTransformPass()); + devicePM.addPass(createCanonicalizerPass()); + devicePM.addPass(createAMDAIEDmaToNpuPass()); + devicePM.addPass(createAMDAIEAssignLockIDsPass()); + devicePM.addPass(createAMDAIEAssignBufferDescriptorIDsPass()); + devicePM.addPass(createAMDAIEAssignBufferAddressesBasicPass()); + devicePM.addPass(createAMDAIEPathfinderPass()); + } + + passManager.addPass(createCanonicalizerPass()); + passManager.addPass(createConvertLinalgToLoopsPass()); passManager.addPass(createLowerAffinePass()); - OpPassManager &devicePM = passManager.nest(); - devicePM.addPass(createAMDAIEAssignLockIDsPass()); - devicePM.addPass(createAMDAIEObjectFifoStatefulTransformPass()); - devicePM.addPass(createAMDAIEAssignBufferDescriptorIDsPass()); - devicePM.addPass(createAMDAIEAssignBufferAddressesBasicPass()); - devicePM.addPass(createAMDAIEPathfinderPass()); passManager.addPass(createConvertSCFToCFPass()); - passManager.addNestedPass( - createAMDAIELocalizeLocksPass()); - passManager.addNestedPass( - createAMDAIENormalizeAddressSpacesPass()); + + { + OpPassManager &devicePM = passManager.nest(); + devicePM.addPass(createAMDAIELocalizeLocksPass()); + devicePM.addPass(createAMDAIENormalizeAddressSpacesPass()); + devicePM.addPass(createCanonicalizerPass()); + } } // NOTE: this runs on the top-level program module containing all hal.executable diff --git a/tests/samples/conv_pipeline_e2e.mlir b/tests/samples/conv_pipeline_e2e.mlir index 71b1442b8..7c6957017 100644 --- a/tests/samples/conv_pipeline_e2e.mlir +++ b/tests/samples/conv_pipeline_e2e.mlir @@ -13,11 +13,6 @@ func.func @conv_2d_nhwc_hwcf(%arg0: tensor<2x14x14x32xi32>, %arg1: tensor<3x3x32 // CHECK: aie.shim_dma_allocation // CHECK: aie.shim_dma_allocation // CHECK: aie.shim_dma_allocation -// CHECK: aiex.runtime_sequence @conv_2d_nhwc_hwcf_dispatch_0_conv_2d_nhwc_hwcf_2x12x12x64x3x3x32_i32(%arg0: memref<2x14x14x32xi32>, %arg1: memref<3x3x32x64xi32>, %arg2: memref<2x12x12x64xi32>) -// CHECK: aiex.npu.dma_memcpy_nd -// CHECK: aiex.npu.dma_memcpy_nd -// CHECK: aiex.npu.dma_memcpy_nd -// CHECK: aiex.npu.sync // ----- @@ -34,8 +29,3 @@ func.func @conv_2d_nhwc_hwcf_q(%arg0: tensor<2x14x14x32xi8>, %arg1: tensor<3x3x3 // CHECK: aie.shim_dma_allocation // CHECK: aie.shim_dma_allocation // CHECK: aie.shim_dma_allocation -// CHECK: aiex.runtime_sequence @conv_2d_nhwc_hwcf_q_dispatch_0_conv_2d_nhwc_hwcf_q_2x12x12x64x3x3x32_i8xi8xi32xi32xi32(%arg0: memref<3136xi32>, %arg1: memref<4608xi32>, %arg2: memref<2x12x12x64xi32>) -// CHECK: aiex.npu.dma_memcpy_nd -// CHECK: aiex.npu.dma_memcpy_nd -// CHECK: aiex.npu.dma_memcpy_nd -// CHECK: aiex.npu.sync diff --git a/tests/samples/matmul_peeled_objectfifo_e2e.mlir b/tests/samples/matmul_peeled_objectfifo_e2e.mlir index 386214f58..484494045 100644 --- a/tests/samples/matmul_peeled_objectfifo_e2e.mlir +++ b/tests/samples/matmul_peeled_objectfifo_e2e.mlir @@ -1,6 +1,8 @@ // RUN: iree-compile --iree-hal-target-backends=amd-aie --compile-to=executable-sources --iree-amdaie-target-device=npu1_4col %s | iree-opt --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(iree-hal-translate-target-executable-variants{target=amd-aie})))" --iree-amdaie-lower-to-aie-pipeline=objectFifo --iree-amdaie-tile-pipeline=pack-peel 
--split-input-file | FileCheck %s // CHECK-LABEL: hal.executable.export public @matmul_i32_dispatch_0_matmul_128x128x256_i32 + +// CHECK: aie.device(npu1_4col) { // CHECK-DAG: %[[TILE_0_2:.+]] = aie.tile(0, 2) // CHECK-DAG: %[[TILE_0_3:.+]] = aie.tile(0, 3) // CHECK-DAG: %[[TILE_1_2:.+]] = aie.tile(1, 2) @@ -11,21 +13,16 @@ // CHECK-DAG: aie.core(%[[TILE_1_2]]) // CHECK-DAG: aie.core(%[[TILE_0_3]]) // CHECK-DAG: aie.core(%[[TILE_1_3]]) -// CHECK-DAG: aiex.runtime_sequence @matmul_i32_dispatch_0_matmul_128x128x256_i32(%[[ARG0:.+]]: memref<128x256xi32>, %[[ARG1:.+]]: memref<256x128xi32>, %[[ARG2:.+]]: memref<128x128xi32>) -// CHECK-DAG: aiex.npu.dma_memcpy_nd(0, 0, %[[ARG2]][0, 0, 0, 0][2, 2, 64, 64][8192, 64, 128, 1]) {id = 0 : i64, issue_token = true, metadata = @[[OBJ10:.+]]} -// CHECK-DAG: aiex.npu.dma_memcpy_nd(0, 0, %[[ARG0]][0, 0, 0, 0][1, 8, 64, 32][0, 32, 256, 1]) {id = 1 : i64, issue_token = true, metadata = @[[OBJ0:.+]]} -// CHECK-DAG: aiex.npu.dma_wait {symbol = @[[OBJ0]]} -// CHECK-DAG: aiex.npu.dma_memcpy_nd(0, 0, %[[ARG1]][0, 0, 0, 0][8, 2, 32, 32][4096, 32, 128, 1]) {id = 2 : i64, issue_token = true, metadata = @[[OBJ1:.+]]} -// CHECK-DAG: aiex.npu.dma_wait {symbol = @[[OBJ1]]} -// CHECK-DAG: aiex.npu.dma_wait {symbol = @[[OBJ10]]} -// CHECK-DAG: aie.shim_dma_allocation @[[OBJ0]](MM2S, 0, 0) -// CHECK-DAG: aie.shim_dma_allocation @[[OBJ1]](MM2S, 1, 0) +// CHECK-DAG: aie.shim_dma_allocation {{.*}}(MM2S, 0, 0) +// CHECK-DAG: aie.shim_dma_allocation {{.*}}(MM2S, 1, 0) // CHECK-DAG: aie.memtile_dma(%[[TILE_0_1]]) // CHECK-DAG: aie.mem(%[[TILE_0_2]]) // CHECK-DAG: aie.mem(%[[TILE_0_3]]) // CHECK-DAG: aie.mem(%[[TILE_1_2]]) // CHECK-DAG: aie.mem(%[[TILE_1_3]]) -// CHECK-DAG: aie.shim_dma_allocation @[[OBJ10]](S2MM, 0, 0) +// CHECK-DAG: aie.shim_dma_allocation {{.*}}(S2MM, 0, 0) +// CHECK: {npu_instructions = +// CHECK-SAME: runtime_sequence_name = "matmul_i32_dispatch_0_matmul_128x128x256_i32" func.func @matmul_i32(%lhs: tensor<128x256xi32>, %rhs: tensor<256x128xi32>) -> tensor<128x128xi32> { %cst = arith.constant 0 : i32 diff --git a/tests/samples/pack_peel_pipeline_matmul.mlir b/tests/samples/pack_peel_pipeline_matmul.mlir index 344c34e5d..a626a2132 100644 --- a/tests/samples/pack_peel_pipeline_matmul.mlir +++ b/tests/samples/pack_peel_pipeline_matmul.mlir @@ -15,11 +15,6 @@ func.func @matmul_i8_i32(%lhs: tensor<32x16xi8>, %rhs: tensor<16x32xi8>) -> tens // CHECK: aie.shim_dma_allocation // CHECK: aie.shim_dma_allocation // CHECK: aie.shim_dma_allocation -// CHECK: aiex.runtime_sequence @matmul_i8_i32_dispatch_0_matmul_32x32x16_i8xi8xi32(%arg0: memref<128xi32>, %arg1: memref<128xi32>, %arg2: memref<32x32xi32>) -// CHECK: aiex.npu.dma_memcpy_nd -// CHECK: aiex.npu.dma_memcpy_nd -// CHECK: aiex.npu.dma_memcpy_nd -// CHECK: aiex.npu.sync // ----- @@ -38,8 +33,3 @@ func.func @matmul_bf16(%lhs: tensor<16x32xbf16>, %rhs: tensor<32x16xbf16>) -> te // CHECK: aie.shim_dma_allocation // CHECK: aie.shim_dma_allocation // CHECK: aie.shim_dma_allocation -// CHECK: aiex.runtime_sequence @matmul_bf16_dispatch_0_matmul_16x16x32_bf16(%arg0: memref<256xi32>, %arg1: memref<256xi32>, %arg2: memref<128xi32>) -// CHECK: aiex.npu.dma_memcpy_nd -// CHECK: aiex.npu.dma_memcpy_nd -// CHECK: aiex.npu.dma_memcpy_nd -// CHECK: aiex.npu.sync diff --git a/tests/samples/pack_peel_pipeline_matmul_elementwise.mlir b/tests/samples/pack_peel_pipeline_matmul_elementwise.mlir index 9c2cbf935..c99b3b269 100644 --- a/tests/samples/pack_peel_pipeline_matmul_elementwise.mlir +++ 
b/tests/samples/pack_peel_pipeline_matmul_elementwise.mlir @@ -20,15 +20,8 @@ func.func @matmul_elementwise_i32(%lhs: tensor<1024x512xi32>, %rhs: tensor<512x1 } // CHECK-LABEL: hal.executable.export public @matmul_elementwise_i32_dispatch_0_matmul_1024x1024x512_i32 -// CHECK: aie.device(npu1_4col) -// CHECK: aie.shim_dma_allocation -// CHECK: aie.shim_dma_allocation -// CHECK: aie.shim_dma_allocation -// CHECK: aiex.runtime_sequence @matmul_elementwise_i32_dispatch_0_matmul_1024x1024x512_i32(%arg0: memref<1024x512xi32>, %arg1: memref<512x1024xi32>, %arg2: memref<1024x1024xi32>, %arg3: memref<1024x1024xi32>) -// CHECK: aiex.npu.dma_memcpy_nd -// CHECK: aiex.npu.dma_memcpy_nd -// CHECK: aiex.npu.dma_memcpy_nd -// CHECK: aiex.npu.sync +// CHECK: aie.device(npu1_4col) +// CHECK-COUNT-3: aie.shim_dma_allocation // ----- @@ -52,15 +45,8 @@ func.func @matmul_elementwise_bf16_f32(%arg0: tensor<1024x512xbf16>, %arg1: tens } // CHECK-LABEL: hal.executable.export public @matmul_elementwise_bf16_f32_dispatch_0_matmul_1024x1024x512_bf16xbf16xf32 -// CHECK: aie.device(npu1_4col) -// CHECK: aie.shim_dma_allocation -// CHECK: aie.shim_dma_allocation -// CHECK: aie.shim_dma_allocation -// CHECK: aiex.runtime_sequence @matmul_elementwise_bf16_f32_dispatch_0_matmul_1024x1024x512_bf16xbf16xf32(%arg0: memref<262144xi32>, %arg1: memref<262144xi32>, %arg2: memref<1024xf32>, %arg3: memref<1024x1024xf32>) -// CHECK: aiex.npu.dma_memcpy_nd -// CHECK: aiex.npu.dma_memcpy_nd -// CHECK: aiex.npu.dma_memcpy_nd -// CHECK: aiex.npu.sync +// CHECK: aie.device(npu1_4col) +// CHECK-COUNT-3: aie.shim_dma_allocation // ----- func.func @matmul_elementwise_bf16(%arg0: tensor<512x512xbf16>, %arg1: tensor<512x16384xbf16>, %arg2: tensor<512xf32>) -> tensor<512x16384xbf16> { @@ -78,12 +64,6 @@ func.func @matmul_elementwise_bf16(%arg0: tensor<512x512xbf16>, %arg1: tensor<51 return %11 : tensor<512x16384xbf16> } -// CHECK-LABEL: hal.executable.export public @matmul_elementwise_bf16_dispatch_0_matmul_512x16384x512_bf16xbf16xf32 -// CHECK: aie.device(npu1_4col) -// CHECK: aie.shim_dma_allocation -// CHECK: aie.shim_dma_allocation -// CHECK: aie.shim_dma_allocation -// CHECK: aiex.runtime_sequence @matmul_elementwise_bf16_dispatch_0_matmul_512x16384x512_bf16xbf16xf32(%arg0: memref<131072xi32>, %arg1: memref<4194304xi32>, %arg2: memref<512xf32>, %arg3: memref<4194304xi32>) -// CHECK: aiex.npu.dma_memcpy_nd -// CHECK: aiex.npu.dma_memcpy_nd -// CHECK: aiex.npu.dma_memcpy_nd +// CHECK-LABEL: hal.executable.export public @matmul_elementwise_bf16_dispatch_0_matmul_512x16384x512_bf16xbf16xf32 +// CHECK: aie.device(npu1_4col) +// CHECK-COUNT-3: aie.shim_dma_allocation diff --git a/tests/samples/pad_pack_pipeline_e2e.mlir b/tests/samples/pad_pack_pipeline_e2e.mlir index 18d9d8708..14bdcb04c 100644 --- a/tests/samples/pad_pack_pipeline_e2e.mlir +++ b/tests/samples/pad_pack_pipeline_e2e.mlir @@ -7,11 +7,6 @@ // CPP: aie.shim_dma_allocation // CPP: aie.shim_dma_allocation // CPP: aie.shim_dma_allocation -// CPP: aiex.runtime_sequence @matmul_small_dispatch_0_matmul_8x32x16_i32(%arg0: memref<8x16xi32>, %arg1: memref<16x32xi32>, %arg2: memref<8x32xi32>) -// CPP: aiex.npu.dma_memcpy_nd -// CPP: aiex.npu.dma_memcpy_nd -// CPP: aiex.npu.dma_memcpy_nd -// CPP: aiex.npu.sync func.func @matmul_small(%lhs : tensor<8x16xi32>, %rhs : tensor<16x32xi32>) -> tensor<8x32xi32> { %empty = tensor.empty() : tensor<8x32xi32> @@ -29,12 +24,6 @@ func.func @matmul_small(%lhs : tensor<8x16xi32>, // CPP: aie.shim_dma_allocation // CPP: aie.shim_dma_allocation // CPP: 
aie.shim_dma_allocation -// CPP: aiex.runtime_sequence @matmul_large_dispatch_0_matmul_2048x2048x2048_i32(%arg0: memref<2048x2048xi32>, %arg1: memref<2048x2048xi32>, %arg2: memref<2048x2048xi32>) -// CPP: aiex.npu.dma_memcpy_nd -// CPP: aiex.npu.dma_memcpy_nd -// CPP: aiex.npu.dma_memcpy_nd -// CPP: aiex.npu.sync - func.func @matmul_large(%lhs: tensor<2048x2048xi32>, %rhs: tensor<2048x2048xi32>) -> tensor<2048x2048xi32> { %empty = tensor.empty() : tensor<2048x2048xi32> %cst = arith.constant 0 : i32 @@ -54,11 +43,6 @@ func.func @matmul_large(%lhs: tensor<2048x2048xi32>, %rhs: tensor<2048x2048xi32> // CPP: aie.shim_dma_allocation // CPP: aie.shim_dma_allocation // CPP: aie.shim_dma_allocation -// CPP: aiex.runtime_sequence @generic_matmul_transpose_static_dispatch_0_matmul_like_8x32x16_i32(%arg0: memref<8x16xi32>, %arg1: memref<32x16xi32>, %arg2: memref<8x32xi32>) -// CPP: aiex.npu.dma_memcpy_nd -// CPP: aiex.npu.dma_memcpy_nd -// CPP: aiex.npu.dma_memcpy_nd -// CPP: aiex.npu.sync func.func @generic_matmul_transpose_static(%lhs : tensor<8x16xi32>, %rhs : tensor<32x16xi32>) -> tensor<8x32xi32> { %cst = arith.constant 0 : i32 @@ -82,11 +66,6 @@ func.func @generic_matmul_transpose_static(%lhs : tensor<8x16xi32>, // CPP: aie.shim_dma_allocation // CPP: aie.shim_dma_allocation // CPP: aie.shim_dma_allocation -// CPP: aiex.runtime_sequence @matmul_transpose_b_static_dispatch_0_matmul_transpose_b_8x32x16_i32(%arg0: memref<8x16xi32>, %arg1: memref<32x16xi32>, %arg2: memref<8x32xi32>) -// CPP: aiex.npu.dma_memcpy_nd -// CPP: aiex.npu.dma_memcpy_nd -// CPP: aiex.npu.dma_memcpy_nd -// CPP: aiex.npu.sync func.func @matmul_transpose_b_static(%lhs : tensor<8x16xi32>, %rhs : tensor<32x16xi32>) -> tensor<8x32xi32> { %cst = arith.constant 0 : i32