diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIECanonicalizeDma.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIECanonicalizeDma.cpp
index d12454afa..c5e9efc30 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIECanonicalizeDma.cpp
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIECanonicalizeDma.cpp
@@ -13,7 +13,7 @@
 #include "mlir/Pass/Pass.h"
 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"
 
-#define DEBUG_TYPE "iree-amdaie-pack-to-dma"
+#define DEBUG_TYPE "iree-amdaie-canonicalize-dma"
 
 namespace mlir::iree_compiler::AMDAIE {
 
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIECanonicalizeDoublyStridedOp.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIECanonicalizeDoublyStridedOp.cpp
index b693e4b0d..e6b919eb0 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIECanonicalizeDoublyStridedOp.cpp
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIECanonicalizeDoublyStridedOp.cpp
@@ -5,7 +5,6 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
 #include "iree-amd-aie/IR/AMDAIEDialect.h"
-#include "iree-amd-aie/IR/AMDAIEOps.h"
 #include "iree-amd-aie/Transforms/AMDAIEDmaUtils.h"
 #include "iree-amd-aie/Transforms/Passes.h"
 #include "mlir/Pass/Pass.h"
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIECanonicalizeNpuDmaCpyNd.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIECanonicalizeNpuDmaCpyNd.cpp
new file mode 100644
index 000000000..bd620152b
--- /dev/null
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIECanonicalizeNpuDmaCpyNd.cpp
@@ -0,0 +1,191 @@
+// Copyright 2024 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include <memory>
+
+#include "iree-amd-aie/IR/AMDAIEDialect.h"
+#include "iree-amd-aie/IR/AMDAIEOps.h"
+#include "iree-amd-aie/Transforms/Passes.h"
+#include "mlir/Dialect/Utils/StaticValueUtils.h"
+#include "mlir/Pass/Pass.h"
+#include "mlir/Pass/PassManager.h"
+
+#define DEBUG_TYPE "iree-amdaie-canonicalize-npu-dma-cpy-nd"
+
+namespace mlir::iree_compiler::AMDAIE {
+
+class AMDAIECanonicalizeNpuDmaCpyNdPass
+    : public impl::AMDAIECanonicalizeNpuDmaCpyNdBase<
+          AMDAIECanonicalizeNpuDmaCpyNdPass> {
+ public:
+  void getDependentDialects(DialectRegistry &registry) const override {
+    registry.insert<AMDAIEDialect>();
+  }
+
+  AMDAIECanonicalizeNpuDmaCpyNdPass() = default;
+  AMDAIECanonicalizeNpuDmaCpyNdPass(
+      const AMDAIECanonicalizeNpuDmaCpyNdPass &){};
+
+  void runOnOperation() override {
+    MLIRContext *context = &getContext();
+    ModuleOp moduleOp = getOperation();
+    IRRewriter rewriter(context);
+    Attribute zero = rewriter.getIndexAttr(0);
+    Attribute one = rewriter.getIndexAttr(1);
+
+    WalkResult walkResult = moduleOp->walk([&](NpuDmaCpyNdOp dmaOp) {
+      SmallVector<OpFoldResult> srcOffsets = dmaOp.getSourceMixedOffsets();
+      SmallVector<OpFoldResult> srcSizes = dmaOp.getSourceMixedSizes();
+      SmallVector<OpFoldResult> srcStrides = dmaOp.getSourceMixedStrides();
+
+      SmallVector<OpFoldResult> tgtOffsets = dmaOp.getTargetMixedOffsets();
+      SmallVector<OpFoldResult> tgtSizes = dmaOp.getTargetMixedSizes();
+      SmallVector<OpFoldResult> tgtStrides = dmaOp.getTargetMixedStrides();
+
+      bool allValidRanks = srcOffsets.size() <= nbDimensions &&
+                           srcSizes.size() <= nbDimensions &&
+                           srcStrides.size() <= nbDimensions &&
+                           tgtOffsets.size() <= nbDimensions &&
+                           tgtSizes.size() <= nbDimensions &&
+                           tgtStrides.size() <= nbDimensions;
+      if (!allValidRanks) {
+        dmaOp.emitOpError() << " has offsets/sizes/strides attributes that are "
+                               "larger than the target dimension of "
+                            << nbDimensions << ".";
+        return WalkResult::interrupt();
+      }
+
+      if (dmaOp.getSourceMemorySpaceAsUInt() == 0) {
+        if (!dmaOp.hasSourceAddressing()) {
+          dmaOp.emitOpError()
+              << "has source in L3, but does not have source addressing. "
+                 "Source addressing is required to canonicalize here.";
+          return WalkResult::interrupt();
+        }
+        srcOffsets = getPrepended(srcOffsets, zero);
+        srcSizes = getPrepended(srcSizes, one);
+        srcStrides = getPrepended(srcStrides, zero);
+        std::optional<uint32_t> maybeSwapIndex =
+            verifyAndGetZeroStrideIndex(srcSizes, srcStrides, dmaOp);
+        if (!maybeSwapIndex.has_value()) {
+          return WalkResult::interrupt();
+        }
+        uint32_t swapIndex = maybeSwapIndex.value();
+        bubble(srcOffsets, swapIndex);
+        bubble(srcSizes, swapIndex);
+        bubble(srcStrides, swapIndex);
+      }
+
+      if (dmaOp.getTargetMemorySpaceAsUInt() == 0) {
+        if (!dmaOp.hasTargetAddressing()) {
+          dmaOp.emitOpError()
+              << "has target in L3, but does not have target addressing. "
+                 "Target addressing is required to canonicalize here.";
+          return WalkResult::interrupt();
+        }
+        tgtOffsets = getPrepended(tgtOffsets, zero);
+        tgtSizes = getPrepended(tgtSizes, one);
+        tgtStrides = getPrepended(tgtStrides, zero);
+        std::optional<uint32_t> maybeSwapIndex =
+            verifyAndGetZeroStrideIndex(tgtSizes, tgtStrides, dmaOp);
+        if (!maybeSwapIndex.has_value()) {
+          return WalkResult::interrupt();
+        }
+        uint32_t swapIndex = maybeSwapIndex.value();
+        bubble(tgtOffsets, swapIndex);
+        bubble(tgtSizes, swapIndex);
+        bubble(tgtStrides, swapIndex);
+      }
+
+      rewriter.setInsertionPoint(dmaOp);
+
+      // Replace the npu.dma_cpy_nd with the canonicalized version.
+      dmaOp = rewriter.replaceOpWithNewOp<AMDAIE::NpuDmaCpyNdOp>(
+          dmaOp, dmaOp.getDma(), dmaOp.getTarget(), tgtOffsets, tgtSizes,
+          tgtStrides, dmaOp.getTargetBdId(), dmaOp.getSource(), srcOffsets,
+          srcSizes, srcStrides, dmaOp.getSourceBdId());
+
+      return WalkResult::advance();
+    });
+
+    if (walkResult.wasInterrupted()) {
+      return signalPassFailure();
+    }
+  }
+
+ private:
+  // Returns 'tail' left-padded with 'def' to a length of nbDimensions.
+  SmallVector<OpFoldResult> getPrepended(ArrayRef<OpFoldResult> tail,
+                                         Attribute def) {
+    assert(tail.size() <= nbDimensions);
+    SmallVector<OpFoldResult> res(nbDimensions, def);
+    std::copy(tail.begin(), tail.end(),
+              res.begin() + nbDimensions - tail.size());
+    return res;
+  }
+
+  static size_t getLowestIndexMaybeAboveOne(ArrayRef<OpFoldResult> v) {
+    for (size_t i = 0; i < v.size(); i++) {
+      std::optional<int64_t> maybe = getConstantIntValue(v[i]);
+      if (!maybe.has_value() || maybe.value() > 1) {
+        return i;
+      }
+    }
+    return v.size();
+  }
+
+  static size_t getHighestIndexMaybeZero(ArrayRef<OpFoldResult> v) {
+    for (size_t i = v.size(); i > 0; i--) {
+      std::optional<int64_t> maybe = getConstantIntValue(v[i - 1]);
+      if (!maybe.has_value() || maybe.value() == 0) {
+        return i - 1;
+      }
+    }
+    return 0;
+  }
+
+  /// Get the highest index where the stride is, or might be (non-constant), 0.
+  /// Fail if it exceeds the lowest index where the size is, or might be, > 1.
+  std::optional<uint32_t> verifyAndGetZeroStrideIndex(
+      ArrayRef<OpFoldResult> sizes, ArrayRef<OpFoldResult> strides,
+      NpuDmaCpyNdOp dmaOp) {
+    assert(strides.size() == sizes.size() && strides.size() == nbDimensions);
+
+    size_t firstNonUnitDim = getLowestIndexMaybeAboveOne(sizes);
+    size_t lastZeroStrideDim = getHighestIndexMaybeZero(strides);
+
+    if (firstNonUnitDim < lastZeroStrideDim) {
+      // Limitation until AIE-4.
+      dmaOp.emitOpError("might have stride=0 in dimension ")
+          << lastZeroStrideDim << ", and size>1 in dimension "
+          << firstNonUnitDim << ". As " << firstNonUnitDim << " < "
+          << lastZeroStrideDim
+          << ", this cannot be supported -- the zero stride cannot be moved "
+             "to the outer-most (slowest) dimension, as required by current "
+             "AIE architecture.";
+      return {};
+    }
+    return lastZeroStrideDim;
+  }
+
+  // Example, for swapIndex = 2.
+  // Input
+  //                 [0 1 7 13]
+  // is mutated to
+  //                 [7 0 1 13]
+  static void bubble(MutableArrayRef<OpFoldResult> arr, size_t swapIndex) {
+    if (swapIndex > 0) {
+      std::rotate(arr.begin(), arr.begin() + swapIndex,
+                  arr.begin() + swapIndex + 1);
+    }
+  }
+};
+
+std::unique_ptr<Pass> createAMDAIECanonicalizeNpuDmaCpyNdPass() {
+  return std::make_unique<AMDAIECanonicalizeNpuDmaCpyNdPass>();
+}
+
+}  // namespace mlir::iree_compiler::AMDAIE
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELowerToAIE.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELowerToAIE.cpp
index 284b297c9..a317bce5d 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELowerToAIE.cpp
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELowerToAIE.cpp
@@ -11,6 +11,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include <memory>
 #include <numeric>
 
 #include "aie/AIEDialect.h"
@@ -19,10 +20,12 @@
 #include "iree-amd-aie/IR/AMDAIEOps.h"
 #include "iree-amd-aie/Transforms/AMDAIEUtils.h"
 #include "iree-amd-aie/Transforms/Passes.h"
-#include "mlir/Dialect/Linalg/IR/Linalg.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Support/Debug.h"
+#include "mlir/Dialect/Utils/StaticValueUtils.h"
 #include "mlir/IR/IRMapping.h"
 #include "mlir/IR/Iterators.h"
-#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
+#include "mlir/Pass/PassManager.h"
 
 #define DEBUG_TYPE "iree-amdaie-lower-to-aie"
 
@@ -59,9 +62,6 @@ void eraseOp(IRRewriter &rewriter, IRMapping &mapper, Operation *op) {
 // Convert amdaie.core operation to aie.core
 //===----------------------------------------------------------------------===//
 
-namespace {
-
-
 /// Utility to convert vectors of `size` and `stride` into an
 /// `AIE::BDDimLayoutArrayAttr`.
 AIE::BDDimLayoutArrayAttr convertSizeStrideToBDDimLayoutArrayAttr(
@@ -190,22 +190,22 @@ LogicalResult accessOpToAIE(IRRewriter &rewriter,
               "`aie.objectfifo.acquire` + subview operation";
   }
 
-  memref::ReinterpretCastOp oldReinterpretOp;
+  SmallVector<memref::ReinterpretCastOp> oldReinterpretOps;
   for (Operation *user : accessOp->getUsers()) {
     if (isa<memref::ReinterpretCastOp>(user)) {
-      oldReinterpretOp = cast<memref::ReinterpretCastOp>(user);
-      break;
+      oldReinterpretOps.push_back(cast<memref::ReinterpretCastOp>(user));
     }
   }
-  if (!oldReinterpretOp) {
+  if (oldReinterpretOps.empty()) {
     return accessOp.emitError() << "reinterpret-cast op has not been generated";
   }
+  assert(oldReinterpretOps.size() == 1 &&
+         "expected a single reinterpret-cast op");
+  auto oldReinterpretOp = oldReinterpretOps[0];
 
   auto type = cast<MemRefType>(oldReinterpretOp.getResult().getType());
-
   MemRefType newType = MemRefType::Builder(type);
-
-  llvm::ArrayRef<int64_t> sizes = newType.getShape();
+  ArrayRef<int64_t> sizes = newType.getShape();
   auto [strides, baseOffset] = getStridesAndOffset(newType);
   auto reinterpretOp = rewriter.create<memref::ReinterpretCastOp>(
       rewriter.getUnknownLoc(), newType, subviewOp.getOutput(), baseOffset,
@@ -259,7 +259,7 @@ LogicalResult acquireOpToAIE(IRRewriter &rewriter,
 
   auto subviewOp = rewriter.create<AIE::ObjectFifoSubviewAccessOp>(
       rewriter.getUnknownLoc(), elementType, objFifoAquireOp.getSubview(),
-      rewriter.getIntegerAttr(rewriter.getI32Type(), 0));
+      /* index = */ rewriter.getIntegerAttr(rewriter.getI32Type(), 0));
 
   // Map acquire op to new acquire + subview op.
   mapper.map(acquireOp.getOperation(), subviewOp.getOperation());
@@ -268,17 +268,6 @@ LogicalResult acquireOpToAIE(IRRewriter &rewriter,
   return success();
 }
 
-LogicalResult coreLinalgOpToAIE(IRRewriter &rewriter, linalg::LinalgOp linalgOp,
-                                IRMapping &mapper,
-                                SmallVector<Operation *> &toBeErased) {
-  LLVM_DEBUG(llvm::dbgs() << "Convert [linalg.LinalgOp]\n");
-  OpBuilder::InsertionGuard guard(rewriter);
-  rewriter.setInsertionPoint(linalgOp);
-  rewriter.clone(*(linalgOp.getOperation()), mapper);
-  eraseOp(rewriter, mapper, linalgOp);
-  return success();
-}
-
 LogicalResult coreMemrefExtractStridedMetadataToAIE(
     IRRewriter &rewriter,
     memref::ExtractStridedMetadataOp extractStridedMetadataOp,
@@ -387,7 +376,7 @@ LogicalResult coreToAIE(IRRewriter &rewriter, AMDAIE::CoreOp coreOp,
   auto aieCoreOp =
       rewriter.create<AIE::CoreOp>(rewriter.getUnknownLoc(), tileOp);
   Region &aieCoreRegion = aieCoreOp.getBody();
-  auto aieCoreBlock = rewriter.createBlock(&aieCoreRegion);
+  Block *aieCoreBlock = rewriter.createBlock(&aieCoreRegion);
   auto insertIt = aieCoreBlock->begin();
   auto coreBlockBegin = coreBlock->begin();
   auto coreBlockEnd = coreBlock->getTerminator()->getIterator();
@@ -399,7 +388,7 @@ LogicalResult coreToAIE(IRRewriter &rewriter, AMDAIE::CoreOp coreOp,
   rewriter.create<AIE::EndOp>(rewriter.getUnknownLoc());
 
   SmallVector<Operation *> toBeErased;
-  auto walkResult = aieCoreOp.walk([&](Operation *op) {
+  WalkResult walkResult = aieCoreOp.walk([&](Operation *op) {
     rewriter.setInsertionPoint(op);
     if (TypeSwitch<Operation *, LogicalResult>(op)
             .Case<AMDAIE::LogicalObjectFifoAccessOp>([&](auto accessOp) {
@@ -412,9 +401,6 @@ LogicalResult coreToAIE(IRRewriter &rewriter, AMDAIE::CoreOp coreOp,
               return coreReleaseOpToAIE(rewriter, releaseOp, mapper,
                                         toBeErased);
             })
-            .Case<linalg::LinalgOp>([&](auto linalgOp) {
-              return coreLinalgOpToAIE(rewriter, linalgOp, mapper, toBeErased);
-            })
             .Case<memref::ExtractStridedMetadataOp>(
                 [&](auto extractStridedMetadataOp) {
                   return coreMemrefExtractStridedMetadataToAIE(
@@ -437,9 +423,7 @@ LogicalResult coreToAIE(IRRewriter &rewriter, AMDAIE::CoreOp coreOp,
     coreOp.emitError("could not convert to AIEDialect ops");
     return failure();
   }
-  for (auto *op : toBeErased) {
-    eraseOp(rewriter, mapper, op);
-  }
+  for (Operation *op : toBeErased) eraseOp(rewriter, mapper, op);
 
   mapper.map(coreOp.getResult(), aieCoreOp.getResult());
   mapper.map(coreOp.getOperation(), aieCoreOp.getOperation());
@@ -460,11 +444,13 @@ LogicalResult circularDmaToAIE(IRRewriter &rewriter,
                                int &dmaId) {
   LLVM_DEBUG(llvm::dbgs() << "Convert [AMDAIE::CircularDmaCpyNdOp]\n");
   rewriter.setInsertionPointToEnd(deviceBlock);
+
   if (!dmaOp.getSource()) return dmaOp.emitOpError() << "expected a source";
   auto sourceLogicalObjFifo = dyn_cast<AMDAIE::LogicalObjFifoOpInterface>(
       dmaOp.getSource().getDefiningOp());
   if (!sourceLogicalObjFifo)
     return dmaOp.emitOpError() << "expected a logical objectFifo source";
+
   SmallVector<Value> newSourceTiles =
       llvm::map_to_vector(sourceLogicalObjFifo.getTiles(),
                           [&](Value tile) { return mapper.lookup(tile); });
@@ -480,12 +466,13 @@ LogicalResult circularDmaToAIE(IRRewriter &rewriter,
       dmaOp.getTarget().getDefiningOp());
   if (!targetLogicalObjFifo)
     return dmaOp.emitOpError() << "expected a logical objectFifo source";
+
   SmallVector<Value> newTargetTiles =
       llvm::map_to_vector(targetLogicalObjFifo.getTiles(),
                           [&](Value tile) { return mapper.lookup(tile); });
 
   auto symName = "obj" + std::to_string(dmaId++);
-  auto symAttr = rewriter.getStringAttr(symName);
+  StringAttr symAttr = rewriter.getStringAttr(symName);
   FailureOr<AIE::ObjectFifoCreateOp> objFifo =
       createObjectFifo(rewriter, dmaOp, newSourceTile, newTargetTiles, symAttr);
   if (failed(objFifo)) return failure();
@@ -497,175 +484,93 @@ LogicalResult circularDmaToAIE(IRRewriter &rewriter,
 // Convert amdaie.controlcode operation to NPU instruction func
 //===----------------------------------------------------------------------===//
 
-namespace {
-
-/// Utility to get the static offsets, sizes and strides for
-/// `AIEX::NpuDmaMemcpyNdOp` with explicit addressing.
-LogicalResult getStaticDimsForExplicitAddressing(
-    Operation *op, const SmallVector<OpFoldResult> &offsets,
-    const SmallVector<OpFoldResult> &sizes,
-    const SmallVector<OpFoldResult> &strides,
-    SmallVectorImpl<int64_t> &staticOffsets,
-    SmallVectorImpl<int64_t> &staticSizes,
-    SmallVectorImpl<int64_t> &staticStrides) {
-  if (offsets.size() > staticOffsets.size()) {
-    return op->emitError() << "size of `offsets` should be smaller or equal to "
-                              "size of `staticOffsets`";
-  }
-  if (sizes.size() > staticSizes.size()) {
-    return op->emitError() << "size of `sizes` should be smaller or equal to "
-                              "size of `staticSizes`";
-  }
-  if (strides.size() > staticStrides.size()) {
-    return op->emitError() << "size of `strides` should be smaller or equal to "
-                              "size of `staticStrides`";
-  }
-  if (getConstantIntValue(strides[strides.size() - 1]).value() != 1) {
-    return op->emitError() << "invalid last stride, should be 1";
-  }
-  for (int i = 0; i < offsets.size(); ++i)
-    staticOffsets[staticOffsets.size() - offsets.size() + i] =
-        getConstantIntValue(offsets[i]).value();
-  for (int i = 0; i < sizes.size(); ++i)
-    staticSizes[staticSizes.size() - sizes.size() + i] =
-        getConstantIntValue(sizes[i]).value();
-  for (int i = 0; i < strides.size(); ++i)
-    staticStrides[staticStrides.size() - strides.size() + i] =
-        getConstantIntValue(strides[i]).value();
-  return success();
-}
-
-/// Utility to move 'repeat dimension' with stride 0 and size > 1 to outermost
-/// dimension as only that one can support a stride with value 0 in AIE2(+)
-/// hardware. But first check that such a dimension is actually the first 'real
-/// dimension' in the access pattern.
-LogicalResult canonicalizeNpuStridedPatternForAIE(
-    SmallVectorImpl<int64_t> &offsets, SmallVectorImpl<int64_t> &sizes,
-    SmallVectorImpl<int64_t> &strides) {
-  bool foundNonUnitDim{false};
-  for (size_t i = 0; i < offsets.size(); i++) {
-    if (strides[i] == 0 && sizes[i] == 1) {
-      continue;
-    } else if (strides[i] == 0) {
-      assert(sizes[i] > 0 && "size should be positive");
-      if (foundNonUnitDim) return failure();
-      foundNonUnitDim = true;
-    } else {
-      foundNonUnitDim = true;
-    }
-  }
-  // Either dim 0 is a 'repeat dimension' or if the repeat is on a different
-  // dimension, it guaranteed to be preceded by unit dimensions based on the
-  // former check.
-  for (size_t i = 1; i < offsets.size(); i++) {
-    if (strides[i] == 0 && sizes[i] > 1) {
-      strides[0] = 0;
-      sizes[0] = sizes[i];
-      sizes[i] = 1;
-    }
-  }
-  return success();
-}
-
 /// Convert the `amdaie.npu.dma_cpy_nd` operation to `aiex.npu.dma_memcpy_nd`.
 LogicalResult npuDmaCpyNdOpToAIE(IRRewriter &rewriter,
                                  AMDAIE::NpuDmaCpyNdOp dmaOp,
                                  SmallVector<Operation *> &toBeErased,
                                  IRMapping &mapper, IRMapping &bindingsMapper) {
-  rewriter.setInsertionPoint(dmaOp);
+  AMDAIE::CircularDmaCpyNdOp dmaCpyNd = dmaOp.getDmaCpyNdOp();
+
+  SmallVector<Value> offsets, sizes, strides;
+  ArrayRef<int64_t> staticOffsets, staticSizes, staticStrides;
+  AMDAIE::BdIdOp bdIdOp;
+  LogicalObjectFifoFromMemrefOp logicalObjFifo;
+
   // Convert bidirectional `amdaie.npu.dma_cpy_nd` op into two halves.
   if (dmaOp.getSource()) {
-    auto sourceLogicalObjFifo = dyn_cast<AMDAIE::LogicalObjectFifoFromMemrefOp>(
+    offsets = dmaOp.getSourceOffsets();
+    sizes = dmaOp.getSourceSizes();
+    strides = dmaOp.getSourceStrides();
+    staticOffsets = dmaOp.getSourceStaticOffsets();
+    staticSizes = dmaOp.getSourceStaticSizes();
+    staticStrides = dmaOp.getSourceStaticStrides();
+    bdIdOp = dmaOp.getSourceBdIdOp();
+    if (!bdIdOp) {
+      return dmaOp.emitOpError()
+             << "must have a source BD ID op to lower to the AIE dialect.";
+    }
+    logicalObjFifo = dyn_cast<AMDAIE::LogicalObjectFifoFromMemrefOp>(
         dmaOp.getSource().getDefiningOp());
-    if (!sourceLogicalObjFifo) {
+    if (!logicalObjFifo) {
       return dmaOp.emitOpError() << "expected source to be an "
                                     "`amdaie.logicalobjectfifo.from_memref`";
     }
-    if (!dmaOp.hasSourceAddressing()) {
-      return dmaOp.emitOpError()
-             << "expected source addressing for DMA with source on L3";
-    }
-    AMDAIE::BdIdOp bdIdOp = dmaOp.getSourceBdIdOp();
-    if (!bdIdOp)
-      return dmaOp.emitOpError() << "expected to have a source BD ID op";
-
-    // DmaOp either has explicit source addressing OR the defining op of its
-    // source has its source on L3.
-    SmallVector<Value> empty;
-    SmallVector<int64_t, 4> staticOffsets(4, 0);
-    SmallVector<int64_t, 4> staticSizes(4, 1);
-    SmallVector<int64_t, 4> staticStrides(4, 0);
-    if (failed(getStaticDimsForExplicitAddressing(
-            dmaOp, dmaOp.getSourceMixedOffsets(), dmaOp.getSourceMixedSizes(),
-            dmaOp.getSourceMixedStrides(), staticOffsets, staticSizes,
-            staticStrides))) {
-      return failure();
-    }
-    if (failed(canonicalizeNpuStridedPatternForAIE(staticOffsets, staticSizes,
-                                                   staticStrides))) {
-      return dmaOp.emitError() << "could not canonicalize for AIE";
-    }
+  }
 
-    AMDAIE::CircularDmaCpyNdOp dmaCpyNd = dmaOp.getDmaCpyNdOp();
-    Value memref = bindingsMapper.lookup(sourceLogicalObjFifo.getMemref());
-    auto objFifo = dyn_cast<xilinx::AIE::ObjectFifoCreateOp>(
-        mapper.lookup(dmaCpyNd.getOperation()));
-    if (!objFifo) {
-      return dmaOp.emitError()
-             << "input isn't mapped to an `aie.objectifo` operation";
+  else if (dmaOp.getTarget()) {
+    offsets = dmaOp.getTargetOffsets();
+    sizes = dmaOp.getTargetSizes();
+    strides = dmaOp.getTargetStrides();
+    staticOffsets = dmaOp.getTargetStaticOffsets();
+    staticSizes = dmaOp.getTargetStaticSizes();
+    staticStrides = dmaOp.getTargetStaticStrides();
+    bdIdOp = dmaOp.getTargetBdIdOp();
+    if (!bdIdOp) {
+      return dmaOp.emitOpError()
+             << "must have a target BD ID op to lower to the AIE dialect.";
     }
-    bool issueToken = dmaOp.hasDmaWaitOpUser();
-    rewriter.create<AIEX::NpuDmaMemcpyNdOp>(
-        rewriter.getUnknownLoc(), SmallVector<Type, 1>{}, 0, 0, memref, empty,
-        empty, empty, staticOffsets, staticSizes, staticStrides,
-        objFifo.getName(), bdIdOp.getValue(), issueToken);
-  }
-  if (dmaOp.getTarget()) {
-    auto targetLogicalObjFifo = dyn_cast<AMDAIE::LogicalObjectFifoFromMemrefOp>(
+    logicalObjFifo = dyn_cast<AMDAIE::LogicalObjectFifoFromMemrefOp>(
         dmaOp.getTarget().getDefiningOp());
-    if (!targetLogicalObjFifo) {
+    if (!logicalObjFifo) {
       return dmaOp.emitOpError() << "expected target to be an "
                                     "`amdaie.logicalobjectfifo.from_memref`";
     }
-    if (!dmaOp.hasTargetAddressing()) {
-      return dmaOp.emitOpError()
-             << "expected target addressing for DMA with target on L3";
-    }
-    AMDAIE::BdIdOp bdIdOp = dmaOp.getTargetBdIdOp();
-    if (!bdIdOp)
-      return dmaOp.emitOpError() << "expected to have a target BD ID op";
-
-    // DmaOp either has explicit target addressing OR the defining op of its
-    // source has its target on L3.
-    SmallVector<Value> empty;
-    SmallVector<int64_t, 4> staticOffsets(4, 0);
-    SmallVector<int64_t, 4> staticSizes(4, 1);
-    SmallVector<int64_t, 4> staticStrides(4, 0);
-    if (failed(getStaticDimsForExplicitAddressing(
-            dmaOp, dmaOp.getTargetMixedOffsets(), dmaOp.getTargetMixedSizes(),
-            dmaOp.getTargetMixedStrides(), staticOffsets, staticSizes,
-            staticStrides))) {
-      return failure();
-    }
-    if (failed(canonicalizeNpuStridedPatternForAIE(staticOffsets, staticSizes,
-                                                   staticStrides))) {
-      return dmaOp.emitError() << "could not canonicalize for AIE";
-    }
+  }
 
-    AMDAIE::CircularDmaCpyNdOp dmaCpyNd = dmaOp.getDmaCpyNdOp();
-    Value memref = bindingsMapper.lookup(targetLogicalObjFifo.getMemref());
-    auto objFifo = dyn_cast<xilinx::AIE::ObjectFifoCreateOp>(
-        mapper.lookup(dmaCpyNd.getOperation()));
-    if (!objFifo) {
-      return dmaOp.emitError()
-             << "input isn't mapped to an `aie.objectifo` operation";
-    }
-    bool issueToken = dmaOp.hasDmaWaitOpUser();
-    rewriter.create<AIEX::NpuDmaMemcpyNdOp>(
-        rewriter.getUnknownLoc(), SmallVector<Type, 1>{}, 0, 0, memref, empty,
-        empty, empty, staticOffsets, staticSizes, staticStrides,
-        objFifo.getName(), bdIdOp.getValue(), issueToken);
+  else {
+    return dmaOp.emitOpError()
+           << "has neither source nor target memory space as L3.";
+  }
+
+  Value memref = bindingsMapper.lookup(logicalObjFifo.getMemref());
+
+  auto objFifo =
+      dyn_cast<AIE::ObjectFifoCreateOp>(mapper.lookup(dmaCpyNd.getOperation()));
+
+  uint32_t bdId = bdIdOp.getValue();
+
+  if (!objFifo)
+    return dmaOp.emitError()
+           << "input isn't mapped to an `aie.objectfifo` operation";
+
+  if (!offsets.empty() || !sizes.empty() || !strides.empty()) {
+    // Not doing now as better to just eliminate use of aiex dialect
+    // altogether.
+    return dmaOp.emitError()
+           << "Expect all source offsets, sizes, and strides to be static at "
+              "this point. Dynamic values can be supported, just need to "
+              "cast from 'index' to 64-bit signless integer for "
+              "aiex.npu.dma_memcpy_nd.";
   }
+
+  bool issueToken = dmaOp.hasDmaWaitOpUser();
+
+  rewriter.setInsertionPoint(dmaOp);
+  rewriter.create<AIEX::NpuDmaMemcpyNdOp>(
+      dmaOp.getLoc(), SmallVector<Type, 1>{}, 0, 0, memref, offsets, sizes,
+      strides, staticOffsets, staticSizes, staticStrides, objFifo.getName(),
+      bdId, issueToken);
+
   toBeErased.push_back(dmaOp);
   return success();
 }
@@ -690,8 +595,8 @@ LogicalResult npuDmaWaitToAIE(IRRewriter &rewriter, AMDAIE::NpuDmaWaitOp waitOp,
 
 /// Insert the control code operations into the NPU instruction function.
 LogicalResult controlCodeToAie(IRRewriter &rewriter,
-                               AMDAIE::ControlCodeOp &controlCodeOp,
-                               xilinx::AIEX::RuntimeSequenceOp &funcOp,
+                               AMDAIE::ControlCodeOp controlCodeOp,
+                               xilinx::AIEX::RuntimeSequenceOp funcOp,
                                IRMapping &mapper, IRMapping &bindingsMapper) {
   LLVM_DEBUG(llvm::dbgs() << "Convert [AMDAIE::ControlCodeOp]\n");
   Block *funcBlock = &funcOp.getBody().front();
@@ -736,14 +641,10 @@ LogicalResult controlCodeToAie(IRRewriter &rewriter,
         return WalkResult::advance();
       });
   if (res.wasInterrupted()) return failure();
-  for (auto *op : toBeErased) {
-    eraseOp(rewriter, mapper, op);
-  }
+  for (Operation *op : toBeErased) eraseOp(rewriter, mapper, op);
   return success();
 }
 
-}  // namespace
-
 //===----------------------------------------------------------------------===//
 // Convert amdaie.logicalobjectfifo.link operation to `aie.objectfifo.link`
 //===----------------------------------------------------------------------===//
@@ -898,16 +799,19 @@ LogicalResult lowerToAIE(ModuleOp moduleOp) {
     if (funcOp.isPrivate()) {
       return WalkResult::advance();
     }
-    // Insert AIE DeviceOp
+
+    // Create aie.device.
     rewriter.setInsertionPoint(moduleBlock, moduleBlock->begin());
     auto deviceOp = rewriter.create<xilinx::AIE::DeviceOp>(
         rewriter.getUnknownLoc(),
         xilinx::AIE::AIEDeviceAttr::get(rewriter.getContext(), aieDevice));
-    deviceOp.getRegion().emplaceBlock();
-    Block *deviceBlock = &deviceOp.getRegion().front();
+    Block *deviceBlock = &deviceOp.getRegion().emplaceBlock();
 
-    // Create the signature of the NPU instruction sequence function. The HAL
-    // interface bindings are used to order the function parameters correctly.
+    // The amdaie.controlcode operation has no operands, but the
+    // aiex.runtime_sequence that it lowers to, does. Create the signature
+    // of the aiex.runtime_sequence operation that replaces the
+    // amdaie.controlcode. The HAL interface bindings are used to
+    // order the function parameters correctly.
     IRMapping bindingsMapper;
     SmallVector<IREE::HAL::InterfaceBindingSubspanOp> subspanOps;
     funcOp->walk([&](IREE::HAL::InterfaceBindingSubspanOp subspanOp) {
@@ -918,13 +822,16 @@ LogicalResult lowerToAIE(ModuleOp moduleOp) {
       return a.getBinding().getZExtValue() < b.getBinding().getZExtValue();
     });
     rewriter.setInsertionPoint(deviceBlock, deviceBlock->begin());
+
+    // Create aiex.runtime_sequence inside aie.device
     auto npuFuncOp = rewriter.create<xilinx::AIEX::RuntimeSequenceOp>(
         rewriter.getUnknownLoc(), rewriter.getStringAttr(funcOp.getSymName()));
-    npuFuncOp.getBody().push_back(new Block);
-    for (int i = 0, e = subspanOps.size(); i < e; i++) {
-      auto a = subspanOps[i].getResult();
-      npuFuncOp.getBody().addArgument(a.getType(), a.getLoc());
-      bindingsMapper.map(a, npuFuncOp.getBody().getArgument(i));
+    Region &body = npuFuncOp.getBody();
+    body.emplaceBlock();
+
+    for (auto &&a : llvm::enumerate(subspanOps)) {
+      body.addArgument(a.value().getType(), a.value().getLoc());
+      bindingsMapper.map(a.value(), body.getArgument(a.index()));
     }
 
     // Walk the AIE regions ops and convert ops into pure AIEDialect ops.
@@ -934,6 +841,7 @@ LogicalResult lowerToAIE(ModuleOp moduleOp) {
       if (isa<func::FuncOp, func::ReturnOp>(op)) {
         return WalkResult::advance();
       } else if (auto workgroupOp = dyn_cast<AMDAIE::WorkgroupOp>(op)) {
+        // TODO(newling) should be 1 device op per workgroup op, surely?
         if (failed(workgroupToAIE(rewriter, workgroupOp, deviceOp, npuFuncOp,
                                   mapper, bindingsMapper))) {
           return WalkResult::interrupt();
@@ -957,12 +865,16 @@ LogicalResult lowerToAIE(ModuleOp moduleOp) {
     return WalkResult::advance();
   });
   if (funcRes.wasInterrupted()) return failure();
-  return success();
-}
 
-/// Utility to erase all HAL bindings and dependent operations.
-LogicalResult eraseHALBindings(ModuleOp moduleOp) {
-  IRRewriter rewriter(moduleOp.getContext());
+  // All Ukernel related function declarations will be within aie.device, so
+  // delete the ones outside from the SymbolTable.
+  SymbolTable symbolTable(moduleOp);
+  moduleOp->walk([&](func::FuncOp funcOp) {
+    if (funcOp.isPrivate() && !funcOp->getParentOfType<AIE::DeviceOp>()) {
+      symbolTable.erase(funcOp);
+    }
+  });
+
   SmallVector<Operation *> opsToBeErased;
   moduleOp.walk([&](IREE::HAL::InterfaceBindingSubspanOp subspanOp) {
     opsToBeErased.push_back(subspanOp.getOperation());
@@ -980,49 +892,6 @@ LogicalResult eraseHALBindings(ModuleOp moduleOp) {
   return success();
 }
 
-/// Utility to move dependencies outside an operation into that operation. This
-/// is for example needed for `aie.core` operations as MLIR-AIE expects all
-/// dependencies, like constants, inside those core operations.
-template <typename OpTy>
-class MoveAllDependenciesIntoOp : public OpRewritePattern<OpTy> {
-  using OpRewritePattern<OpTy>::OpRewritePattern;
-
-  LogicalResult matchAndRewrite(OpTy parentOp,
-                                PatternRewriter &rewriter) const override {
-    bool addedDependency = false;
-    parentOp->walk([&](Operation *op) {
-      // Skip operations of type 'OpTy'.
-      if (isa<OpTy>(op)) {
-        return WalkResult::advance();
-      }
-      // Check all operands and whether their defining operations are located
-      // outside the parentOp.
-      for (Value operand : op->getOperands()) {
-        if (!operand || !operand.getDefiningOp()) {
-          continue;
-        }
-        Operation *dependencyOp = operand.getDefiningOp();
-        if (isa_and_nonnull<xilinx::AIE::AIEDialect, xilinx::AIEX::AIEXDialect>(
-                op->getDialect())) {
-          // Skip AIE dialect operations.
-          continue;
-        } else if (!dependencyOp->getParentOfType<OpTy>()) {
-          // Clone the dependency operation into the parent operation's block
-          // and replace all uses.
-          rewriter.setInsertionPointToStart(&parentOp->getRegion(0).front());
-          Operation *newOp = rewriter.clone(*dependencyOp);
-          dependencyOp->replaceUsesWithIf(newOp, [&](OpOperand &use) {
-            return use.getOwner()->getParentOfType<OpTy>() == parentOp;
-          });
-          addedDependency = true;
-        }
-      }
-      return WalkResult::advance();
-    });
-    return success(addedDependency);
-  }
-};
-
 class AMDAIELowerToAIEPass
     : public impl::AMDAIELowerToAIEBase<AMDAIELowerToAIEPass> {
  public:
@@ -1033,44 +902,12 @@ class AMDAIELowerToAIEPass
 
   AMDAIELowerToAIEPass() = default;
   AMDAIELowerToAIEPass(const AMDAIELowerToAIEPass &pass){};
-  void runOnOperation() override;
-};
-
-void AMDAIELowerToAIEPass::runOnOperation() {
-  // Main function call to convert all operations into AIE dialect operations
-  // inside an AIE device.
-  if (failed(lowerToAIE(getOperation()))) {
-    return signalPassFailure();
-  }
-  LLVM_DEBUG(llvm::dbgs() << "Module after lowerToAIE: " << getOperation());
-
-  // Clean up the HAL bindings and it's uses as they are not needed anymore.
-  if (failed(eraseHALBindings(getOperation()))) {
-    return signalPassFailure();
-  }
-
-  // Move all dependencies, like for example constants, that are residing
-  // outside core operations into those core operations. This is required by
-  // the AIE dialect.
-  MLIRContext *context = &getContext();
-  RewritePatternSet patterns(context);
-  patterns.insert<MoveAllDependenciesIntoOp<xilinx::AIE::CoreOp>>(context);
-  if (failed(
-          applyPatternsAndFoldGreedily(getOperation(), std::move(patterns)))) {
-    return signalPassFailure();
+  void runOnOperation() override {
+    // Main function call to convert all operations into AIE dialect
+    // operations inside an AIE device.
+    if (failed(lowerToAIE(getOperation()))) return signalPassFailure();
   }
-
-  // All Ukernel related function declarations will be within aie.device, so
-  // delete the ones outside from the SymbolTable.
-  SymbolTable symbolTable(getOperation());
-  getOperation()->walk([&](func::FuncOp funcOp) {
-    if (funcOp.isPrivate() && !funcOp->getParentOfType<AIE::DeviceOp>()) {
-      symbolTable.erase(funcOp);
-    }
-  });
-}
-
-}  // namespace
+};
 
 std::unique_ptr<Pass> createAMDAIELowerToAIEPass() {
   return std::make_unique<AMDAIELowerToAIEPass>();
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESinkIntoCore.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESinkIntoCore.cpp
new file mode 100644
index 000000000..ef62e800f
--- /dev/null
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESinkIntoCore.cpp
@@ -0,0 +1,137 @@
+// Copyright 2024 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "aie/AIEDialect.h"
+#include "iree-amd-aie/IR/AMDAIEDialect.h"
+#include "iree-amd-aie/IR/AMDAIEOps.h"
+#include "iree-amd-aie/Transforms/Passes.h"
+#include "mlir/Dialect/Linalg/IR/Linalg.h"
+#include "mlir/Dialect/Tensor/IR/Tensor.h"
+#include "mlir/Pass/Pass.h"
+
+#define DEBUG_TYPE "iree-amdaie-sink-into-core"
+
+namespace mlir::iree_compiler::AMDAIE {
+
+namespace {
+
+/// Runs a single sinking sweep over `coreOp`. For every operand of an
+/// operation nested in `coreOp` whose defining op lives outside of `coreOp`,
+/// the defining op is cloned to the start of the core's block and all of its
+/// uses inside the core are redirected to the clone. Defining ops from the
+/// amdaie dialect are left where they are.
+///
+/// Clones created during a sweep are not themselves visited in that sweep, so
+/// callers should invoke this repeatedly until it returns false.
+///
+/// Returns true if at least one op was cloned into the core.
+bool sinkInto(AMDAIE::CoreOp coreOp, IRRewriter &rewriter) {
+  bool changed = false;
+
+  // Snapshot the ops nested in the amdaie.core op before any clone is made.
+  SmallVector<Operation *> opsInCore;
+  coreOp->walk([&](Operation *op) {
+    if (op != coreOp.getOperation()) opsInCore.push_back(op);
+  });
+
+  for (Operation *opInCore : opsInCore) {
+    for (Value operand : opInCore->getOperands()) {
+      // Operands without a defining op (e.g. block arguments) cannot be sunk.
+      if (!operand || !operand.getDefiningOp()) continue;
+      Operation *dependencyOp = operand.getDefiningOp();
+
+      // Skip if the dependency is already in the core.
+      if (coreOp->isAncestor(dependencyOp)) continue;
+
+      // Ops in the amdaie dialect are probably related to data movement
+      // and should not be sunk into the core. This might need adjustment
+      // later. `getDialect()` is null for ops of unregistered dialects, hence
+      // the null-tolerant check.
+      if (isa_and_nonnull<AMDAIE::AMDAIEDialect>(dependencyOp->getDialect())) {
+        continue;
+      }
+
+      // Create a clone of the dependency op in the core region.
+      Region &coreRegion = coreOp->getRegion(0);
+      assert(coreRegion.hasOneBlock() && "expected single block region");
+      rewriter.setInsertionPointToStart(&coreRegion.front());
+      Operation *sunkOp = rewriter.clone(*dependencyOp);
+
+      // Replace uses of the dependency op inside the core.
+      dependencyOp->replaceUsesWithIf(sunkOp, [&](OpOperand &use) {
+        return coreOp->isAncestor(use.getOwner());
+      });
+
+      // If the dependency op has no uses left, erase it. Note: relying
+      // on canonicalization to do this is risky, because sometimes constants
+      // are cse'd and the original constant which we've been trying to
+      // sink is the only one retained (undoing all of our work).
+      //
+      // Ideally we would have some sort of formal isolation of amdaie.core ops
+      // to prevent this.
+      //
+      // This is still not a 100% robust solution, as there might be a constant
+      // which we've sunk into core ops that still has a use outside of a core
+      // op and then we end up slipping down the same slope to square one.
+      if (dependencyOp->use_empty()) rewriter.eraseOp(dependencyOp);
+      changed = true;
+    }
+  }
+  return changed;
+}
+
+/// Pass that clones ops which are defined outside of an amdaie.core op, but
+/// used inside of it, into that core. Ops of the amdaie dialect are skipped.
+class AMDAIESinkIntoCorePass
+    : public impl::AMDAIESinkIntoCoreBase<AMDAIESinkIntoCorePass> {
+ public:
+  void getDependentDialects(DialectRegistry &registry) const override {
+    registry.insert<tensor::TensorDialect, linalg::LinalgDialect,
+                    AMDAIE::AMDAIEDialect, xilinx::AIE::AIEDialect>();
+  }
+
+  AMDAIESinkIntoCorePass() = default;
+  AMDAIESinkIntoCorePass(const AMDAIESinkIntoCorePass &pass) = default;
+
+  void runOnOperation() override {
+    MLIRContext *context = &getContext();
+    IRRewriter rewriter(context);
+    ModuleOp moduleOp = getOperation();
+    SmallVector<AMDAIE::CoreOp> coreOps;
+    moduleOp.walk([&](AMDAIE::CoreOp coreOp) { coreOps.push_back(coreOp); });
+
+    // We do our own fixed point convergence, because we don't want any
+    // canonicalization to happen, which cannot be avoided using
+    // PatternRewriter. Canonicalization can undo the work we've done
+    // sinking ops into the core.
+    for (AMDAIE::CoreOp coreOp : coreOps) {
+      bool changed = true;
+      for (uint64_t iter = 0; changed && iter < maxIterations; ++iter) {
+        changed = sinkInto(coreOp, rewriter);
+      }
+      if (!changed) continue;
+
+      // The final sweep still sank an op, so the iteration budget ran out
+      // before a fixed point was reached. Fail the pass along with the
+      // diagnostic so that the pipeline does not continue on unconverged IR.
+      coreOp->emitError("failed to converge in ")
+          << maxIterations << " iterations.";
+      return signalPassFailure();
+    }
+  }
+
+ private:
+  /// Maximum number of sinking sweeps attempted per amdaie.core op.
+  uint64_t maxIterations = 100;
+};
+
+}  // namespace
+
+std::unique_ptr<Pass> createAMDAIESinkIntoCorePass() {
+  return std::make_unique<AMDAIESinkIntoCorePass>();
+}
+
+}  // namespace mlir::iree_compiler::AMDAIE
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/CMakeLists.txt b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/CMakeLists.txt
index 938171a48..ac6e86fce 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/CMakeLists.txt
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/CMakeLists.txt
@@ -50,6 +50,7 @@ iree_cc_library(
     "AMDAIEAssignNpuDmaBdIds.cpp"
     "AMDAIEBufferizeToAllocation.cpp"
     "AMDAIECanonicalizeDma.cpp"
     "AMDAIECanonicalizeDoublyStridedOp.cpp"
+    "AMDAIECanonicalizeNpuDmaCpyNd.cpp"
     "AMDAIECombineStridedOps.cpp"
     "AMDAIEControlCodeLoopUnroll.cpp"
@@ -85,6 +86,7 @@ iree_cc_library(
     "AMDAIEPad.cpp"
     "AMDAIEPeelForLoop.cpp"
     "AMDAIEPropagateDataLayout.cpp"
+    "AMDAIESinkIntoCore.cpp"
     "AMDAIETile.cpp"
     "AMDAIETileAndFuse.cpp"
     "AMDAIEUtils.cpp"
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h
index 5bd44c2e7..abc75e0f4 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h
@@ -28,6 +28,7 @@ namespace mlir::iree_compiler::AMDAIE {
 #define GEN_PASS_DEF_AMDAIEBUFFERIZETOALLOCATION
 #define GEN_PASS_DEF_AMDAIECANONICALIZEDMA
 #define GEN_PASS_DEF_AMDAIECANONICALIZEDOUBLYSTRIDEDOP
+#define GEN_PASS_DEF_AMDAIECANONICALIZENPUDMACPYND
 #define GEN_PASS_DEF_AMDAIECLEANUP
 #define GEN_PASS_DEF_AMDAIECOMBINESTRIDEDOPS
 #define GEN_PASS_DEF_AMDAIECONTROLCODELOOPUNROLL
@@ -63,11 +64,12 @@ namespace mlir::iree_compiler::AMDAIE {
 #define GEN_PASS_DEF_AMDAIEPACKANDTRANSPOSE
 #define GEN_PASS_DEF_AMDAIEPACKTODMA
 #define GEN_PASS_DEF_AMDAIEPAD
-#define GEN_PASS_DEF_AMDAIEVECTORIZATION
 #define GEN_PASS_DEF_AMDAIEPEELFORLOOP
 #define GEN_PASS_DEF_AMDAIEPROPAGATEDATALAYOUT
+#define GEN_PASS_DEF_AMDAIESINKINTOCORE
 #define GEN_PASS_DEF_AMDAIETILE
 #define GEN_PASS_DEF_AMDAIETILEANDFUSE
+#define GEN_PASS_DEF_AMDAIEVECTORIZATION
 #include "iree-amd-aie/Transforms/Passes.h.inc"
 
 }  // namespace mlir::iree_compiler::AMDAIE
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp
index 81c54b413..cb3d87425 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp
@@ -8,12 +8,20 @@
 
 #include "aie/Passes.h"
 #include "aievec/Passes.h"
-#include "air/Conversion/Passes.h"
-#include "air/Transform/Passes.h"
+#include "air/Conversion/AIRLoweringPass.h"
+#include "air/Conversion/AIRRtToNpuPass.h"
+#include "air/Conversion/AIRToAIEPass.h"
+#include "air/Conversion/ConvertToAIRPass.h"
+#include "air/Transform/AIRDependency.h"
+#include "air/Transform/AIRDependencyCanonicalize.h"
+#include "air/Transform/AIRDependencyScheduleOpt.h"
+#include "air/Transform/AIRDmaToChannel.h"
+#include "air/Transform/AIRHerdPlacementPass.h"
+#include "air/Transform/AIRMiscPasses.h"
+#include "air/Transform/AffineLoopOptPass.h"
 #include "iree-amd-aie/IR/AMDAIEAttrs.h"
 #include "iree-dialects/Dialect/LinalgTransform/Passes.h"
 #include "iree/compiler/Codegen/Common/Passes.h"
-#include "iree/compiler/Utils/PassUtils.h"
 #include "iree/compiler/Utils/ToolUtils.h"
 #include "mlir/Conversion/AffineToStandard/AffineToStandard.h"
 #include "mlir/Conversion/ArithToLLVM/ArithToLLVM.h"
@@ -137,6 +145,15 @@ static void addAMDAIEBufferizePasses(OpPassManager &pm) {
   addIREEComprehensiveBufferizePasses(pm, allocationFn, memCpyFn);
 }
 
+void addAMDAIEToAIEPasses(OpPassManager &passManager) {
+  passManager.addPass(createAMDAIECanonicalizeNpuDmaCpyNdPass());
+  passManager.addPass(createCanonicalizerPass());
+  passManager.addPass(createAMDAIESinkIntoCorePass());
+  passManager.addPass(createCanonicalizerPass());
+  passManager.addPass(createAMDAIELowerToAIEPass());
+  passManager.addPass(createCanonicalizerPass());
+}
+
 void addPackPeelBasedPassPipeline(OpPassManager &funcPassManager,
                                   TilingConfig &tilingConfig) {
   // First level tiling using scf.forall
@@ -619,7 +636,9 @@ void addAMDAIEObjectFifoLoweringPasses(OpPassManager &passManager) {
   passManager.addPass(createAMDAIEConvertCoreForallToForPass());
   passManager.addPass(createCanonicalizerPass());
   passManager.addPass(createAMDAIECoreLoopUnrollPass());
-  passManager.addPass(createAMDAIELowerToAIEPass());
+
+  addAMDAIEToAIEPasses(passManager);
+
   passManager.addPass(createCanonicalizerPass());
 
   // Now lower using the AIE passes from MLIR-AIE.
@@ -634,6 +653,7 @@ void addMLIRAIRLoweringPasses(OpPassManager &passManager, AMDAIEDevice device) {
   passManager.addPass(createEraseHALDescriptorTypeFromMemRefPass());
   passManager.addPass(memref::createFoldMemRefAliasOpsPass());
   passManager.addPass(createAMDAIEBridgeToAIRPass());
+
   // TODO (Erwei): Figure out a way to work with AMDAIEPackToDmaPass.
   if (clUseTilePipeline == TilePassPipeline::PackPeelPipeline)
     passManager.addPass(createAMDAIEDecomposeLinalgExtPackUnPackToAIRPass());
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h
index e01890f90..995ef04b2 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h
@@ -77,6 +77,9 @@ std::unique_ptr<Pass> createAMDAIEBufferizeToAllocationPass(
 /// Create pass to apply canonicalization to air.dma_memcpy_nd op's.
 std::unique_ptr<Pass> createAMDAIECanonicalizeDmaPass();
 
+/// Create pass to canonicalize `amdaie.npu.dma_cpy_nd` operations.
+std::unique_ptr<Pass> createAMDAIECanonicalizeNpuDmaCpyNdPass();
+
 /// Create pass to canonicalize doubly strided operations.
 std::unique_ptr<Pass> createAMDAIECanonicalizeDoublyStridedOpPass(
     AMDAIECanonicalizeDoublyStridedOpOptions options = {});
@@ -176,6 +179,7 @@ std::unique_ptr<OperationPass<ModuleOp>> createAMDAIELoweringStrategyPass(
 std::unique_ptr<Pass> createAMDAIELowerFuncArgsPass();
 
 /// Create pass to lower from the AMDAIE dialect to the AIE/AIEX dialects.
+void addAMDAIEToAIEPasses(OpPassManager &);
 std::unique_ptr<Pass> createAMDAIELowerToAIEPass();
 
 /// Create pass to lower a sequence of operation(s) to a iree_codegen.ukernel.*
@@ -211,6 +215,8 @@ std::unique_ptr<Pass> createAMDAIEPadPass(AMDAIEPadOptions options = {});
 std::unique_ptr<Pass> createAMDAIEPeelForLoopPass(
     AMDAIEPeelForLoopOptions options = {});
 
+std::unique_ptr<Pass> createAMDAIESinkIntoCorePass();
+
 /// Create pass to tile TilingInterface operations.
 std::unique_ptr<Pass> createAMDAIETilePass(AMDAIETileOptions options = {});
 
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td
index 2dec5f951..4687542ad 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td
@@ -26,7 +26,7 @@ def AMDAIEAIRDmaToAMDAIEDma :
 def AMDAIEAssignLogicalObjectFifoDepth :
     Pass<"iree-amdaie-assign-logical-objectfifo-depth", ""> {
   let summary = "Assign a buffer depth of the logical objectfifos.";
-  let constructor = 
+  let constructor =
     "mlir::iree_compiler::AMDAIE::createAMDAIEAssignLogicalObjectFifoDepthPass()";
   let options = [
     Option<"l3BufferDepth", "l3-buffer-depth", "int64_t", /*default=*/"1",
@@ -92,6 +92,23 @@ def AMDAIECanonicalizeDoublyStridedOp :
   ];
 }
 
+
+def AMDAIECanonicalizeNpuDmaCpyNd :
+  Pass<"iree-amdaie-canonicalize-npu-dma-cpy-nd", "ModuleOp"> {
+  let summary = "Canonicalize npu.dma_cpy_nd operations.";
+  let description = [{
+    Canonicalize the offsets/sizes/strides of npu.dma_cpy_nd operations on the L3
+    side of the data movement, to make them more representative of the DMA hardware.
+    This pass ensures the offsets/sizes/strides are of size 'nb-dimensions' (default 4),
+    and that no dimensions inside dimensions of size>1 have stride=0.
+  }];
+  let constructor = "mlir::iree_compiler::AMDAIE::createAMDAIECanonicalizeNpuDmaCpyNdPass()";
+  let options = [
+    Option<"nbDimensions", "nb-dimensions", "uint64_t", /*default=*/"4",
+      "The number of dimensions the canonicalized offsets/sizes/strides must have.">
+  ];
+}
+
 def AMDAIECleanup :
     InterfacePass<"iree-amdaie-cleanup", "mlir::FunctionOpInterface"> {
   let summary = "Pass to invoke several cleanup and canonicalization patterns.";
@@ -430,6 +447,23 @@ def AMDAIEPropagateDataLayout :
   let constructor = "mlir::iree_compiler::AMDAIE::createAMDAIEPropagateDataLayoutPass()";
 }
 
+def AMDAIESinkIntoCore :
+  Pass<"iree-amdaie-sink-into-core", "ModuleOp"> {
+  let summary = "Clone constants and other ops into amdaie.cores";
+  let description = [{
+   The amdaie.core operation should be isolated from above for code generation.
+   This pass finds operations outside of cores, whose values are used inside of
+   cores, and creates clones of them inside of cores. Operations in the amdaie
+   dialect are not sunk into cores, as they are assumed to be data movement
+   related ops which should be kept outside of cores.
+
+   In the future, we should formalize the isolation of cores using upstream
+   MLIR attributes/ideas.
+  }];
+  let constructor = "mlir::iree_compiler::AMDAIE::createAMDAIESinkIntoCorePass()";
+}
+
+
 def AMDAIETile :
     InterfacePass<"iree-amdaie-tile", "mlir::FunctionOpInterface"> {
   let summary = "Pass to tile TilingInterface operations.";
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt
index cf96a5383..affb368c6 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt
@@ -17,6 +17,7 @@ iree_lit_test_suite(
     "bufferize_to_allocation.mlir"
     "canonicalize_dma.mlir"
     "canonicalize_doubly_strided_op.mlir"
+    "canonicalize_npu_dma_cpy_nd.mlir"
     "combine_strided_ops.mlir"
     "controlcode_loop_unrolling.mlir"
     "convert_core_forall_to_for.mlir"
@@ -57,6 +58,7 @@ iree_lit_test_suite(
     "pad.mlir"
     "peel_for_loop.mlir"
     "propagate_data_layout.mlir"
+    "sink_into_core.mlir"
     "tile_and_fuse_using_scf_for.mlir"
     "tile_and_fuse_using_scf_forall.mlir"
     "tile_copy_using_scf_for.mlir"
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/canonicalize_npu_dma_cpy_nd.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/canonicalize_npu_dma_cpy_nd.mlir
new file mode 100644
index 000000000..1c5ffc27a
--- /dev/null
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/canonicalize_npu_dma_cpy_nd.mlir
@@ -0,0 +1,126 @@
+// RUN: iree-opt --split-input-file --pass-pipeline="builtin.module(iree-amdaie-canonicalize-npu-dma-cpy-nd)" --verify-diagnostics %s | FileCheck %s
+
+module {
+  func.func @npu_dma_cpy_nd_with_invalid_repeat(
+     %arg0: index,
+     %arg1: !amdaie.logicalobjectfifo<memref<2048xi32>>,
+     %arg2: !amdaie.logicalobjectfifo<memref<1024xi32, 1>>) {
+    amdaie.workgroup {
+      %0 = amdaie.circular_dma_cpy_nd(%arg1[] [] [], %arg2[] [] []) : (!amdaie.logicalobjectfifo<memref<2048xi32>>, !amdaie.logicalobjectfifo<memref<1024xi32, 1>>)
+      amdaie.controlcode {
+         // expected-error @+1 {{'amdaie.npu.dma_cpy_nd' op might have stride=0 in dimension 2, and size>1 in dimension 1. As 1 < 2, this cannot be supported -- the zero stride cannot be moved to the outer-most (slowest) dimension, as required by current AIE architecture.}}
+        %1 = amdaie.npu.dma_cpy_nd %0([0, 0, 0, 32] [1, 32, 2, 32] [128, 64, 0, 1] bd_id = %arg0, [] [] [])
+        amdaie.end
+      }
+    }
+    return
+  }
+}
+
+// -----
+
+module {
+  func.func @npu_dma_cpy_nd_with_multiple_repeats(
+     %arg0: index,
+     %arg1: !amdaie.logicalobjectfifo<memref<2048xi32>>,
+     %arg2: !amdaie.logicalobjectfifo<memref<1024xi32, 1>>) {
+    amdaie.workgroup {
+      %0 = amdaie.circular_dma_cpy_nd(%arg1[] [] [], %arg2[] [] []) : (!amdaie.logicalobjectfifo<memref<2048xi32>>, !amdaie.logicalobjectfifo<memref<1024xi32, 1>>)
+      amdaie.controlcode {
+         // expected-error @+1 {{'amdaie.npu.dma_cpy_nd' op might have stride=0 in dimension 1, and size>1 in dimension 0. As 0 < 1, this cannot be supported -- the zero stride cannot be moved to the outer-most (slowest) dimension, as required by current AIE architecture.}}
+        %1 = amdaie.npu.dma_cpy_nd %0([0, 0, 0, 32] [2, 8, 2, 32] [0, 0, 64, 1] bd_id = %arg0, [] [] [])
+        amdaie.end
+      }
+    }
+    return
+  }
+}
+
+// -----
+
+module {
+  func.func @controlcode_invalid_implicit_l3_memref(
+     %arg0: index,
+     %arg1: !amdaie.logicalobjectfifo<memref<2048xi32>>,
+     %arg2: !amdaie.logicalobjectfifo<memref<1024xi32, 1>>) {
+    amdaie.workgroup {
+      %0 = amdaie.circular_dma_cpy_nd(%arg1[] [] [], %arg2[] [] []) : (!amdaie.logicalobjectfifo<memref<2048xi32>>, !amdaie.logicalobjectfifo<memref<1024xi32, 1>>)
+      amdaie.controlcode {
+        // expected-error @+1 {{'amdaie.npu.dma_cpy_nd' op has target in L3, but does not have target addressing. Target addressing is required to canonicalize}}
+        %1 = amdaie.npu.dma_cpy_nd %0([] [] [] bd_id = %arg0, [] [] [])
+        amdaie.end
+      }
+    }
+    return
+  }
+}
+
+// -----
+
+module {
+  // CHECK-LABEL: func @controlcode_rank_4_destination
+  func.func @controlcode_rank_4_destination(
+     %arg0: index,
+     %arg1: !amdaie.logicalobjectfifo<memref<2048xi32>>,
+     %arg2: !amdaie.logicalobjectfifo<memref<1024xi32, 1>>) {
+    amdaie.workgroup {
+      %0 = amdaie.circular_dma_cpy_nd(%arg1[] [] [], %arg2[] [] []) : (!amdaie.logicalobjectfifo<memref<2048xi32>>, !amdaie.logicalobjectfifo<memref<1024xi32, 1>>)
+      // CHECK: controlcode
+      amdaie.controlcode {
+        // CHECK: amdaie.npu.dma_cpy_nd
+        // CHECK-SAME: [0, 0, 0, 0] [1, 1, 1, 10] [0, 0, 0, 1]
+        %1 = amdaie.npu.dma_cpy_nd %0([0] [10] [1] bd_id = %arg0, [] [] [])
+        amdaie.end
+      }
+    }
+    return
+  }
+}
+
+// -----
+
+module {
+  // CHECK-LABEL: func @controlcode_rank_4_source
+  func.func @controlcode_rank_4_source(
+     %arg0: index,
+     %arg1: !amdaie.logicalobjectfifo<memref<2048xi32>>,
+     %arg2: !amdaie.logicalobjectfifo<memref<1024xi32, 1>>) {
+    amdaie.workgroup {
+      %0 = amdaie.circular_dma_cpy_nd(%arg2[] [] [], %arg1[] [] []) : (
+      !amdaie.logicalobjectfifo<memref<1024xi32, 1>>,
+      !amdaie.logicalobjectfifo<memref<2048xi32>>)
+      // CHECK: controlcode
+      amdaie.controlcode {
+        // CHECK: amdaie.npu.dma_cpy_nd
+        // CHECK-SAME: [0, 0, 0, 0] [1, 1, 1, 10] [0, 0, 0, 1]
+        %1 = amdaie.npu.dma_cpy_nd %0([] [] [] bd_id = %arg0, [0] [10] [1])
+        amdaie.end
+      }
+    }
+    return
+  }
+}
+
+// -----
+
+module {
+  // CHECK-LABEL: func @stride_zero_front
+  func.func @stride_zero_front(
+     %arg0: index,
+     %arg1: !amdaie.logicalobjectfifo<memref<2048xi32>>,
+     %arg2: !amdaie.logicalobjectfifo<memref<1024xi32, 1>>) {
+    amdaie.workgroup {
+      %0 = amdaie.circular_dma_cpy_nd(%arg2[] [] [], %arg1[] [] []) : (
+      !amdaie.logicalobjectfifo<memref<1024xi32, 1>>,
+      !amdaie.logicalobjectfifo<memref<2048xi32>>)
+      // CHECK: controlcode
+      amdaie.controlcode {
+        // CHECK: amdaie.npu.dma_cpy_nd
+        // CHECK-SAME: [3, 1, 2, 4] [10, 1, 1, 12] [0, 100, 200, 300]
+        %1 = amdaie.npu.dma_cpy_nd %0([] [] [] bd_id = %arg0, [1, 2, 3, 4] [1, 1, 10, 12] [100, 200, 0, 300])
+        amdaie.end
+      }
+    }
+    return
+  }
+}
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/lower_to_aie.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/lower_to_aie.mlir
index 9f5b2f6aa..72963a427 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/lower_to_aie.mlir
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/lower_to_aie.mlir
@@ -223,8 +223,8 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb}
 // CHECK-DAG:   func.func private @ukernel_A(memref<i32, 2>, index) attributes {llvm.bareptr = true}
 // CHECK-DAG:   func.func private @ukernel_B(memref<i32, 2>, index, memref<f32, 2>, index) attributes {llvm.bareptr = true}
 // CHECK-DAG:   %[[TILE_0_2:.+]] = aie.tile(0, 2)
+// CHECK-DAG:   %[[C0:.*]] = arith.constant 0 : index
 // CHECK:       aie.core(%[[TILE_0_2]])
-// CHECK-DAG:     %[[C0:.*]] = arith.constant 0 : index
 // CHECK:         %[[ACQUIRE:.+]] = aie.objectfifo.acquire
 // CHECK-SAME:    Produce
 // CHECK:         %[[ACCESS:.+]] = aie.objectfifo.subview.access %[[ACQUIRE]]
@@ -295,6 +295,7 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb}
 
 // -----
 
+
 // NOTE: Due to an AIE check that verifies whether AIE operations exist inside a
 // core, it's hard to create a very small minimal test.
 //
@@ -431,7 +432,7 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb}
       // expected-error @+1 {{could not convert to AIEDialect ops}}
       amdaie.controlcode {
         %obj0 = amdaie.logicalobjectfifo.from_memref %2, {%tile_0_0} : memref<32x64xi32> -> !amdaie.logicalobjectfifo<memref<2048xi32>>
-        // expected-error @+1 {{op expected to have a target BD ID op}}
+        // expected-error @+1 {{'amdaie.npu.dma_cpy_nd' op must have a target BD ID op to lower to the AIE dialect}}
         %npu_dma_0 = amdaie.npu.dma_cpy_nd %dma_target_l3(%obj0[%c0, %c32] [%c32, %c32] [%c64, %c1], [] [] []) : target_type = !amdaie.logicalobjectfifo<memref<2048xi32>>
         amdaie.npu.dma_wait(%npu_dma_0, S2MM)
         amdaie.end
@@ -443,178 +444,6 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb}
 
 // -----
 
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
-  #hal.descriptor_set.layout<0, bindings = [
-    #hal.descriptor_set.binding<2, storage_buffer>
-  ]>
-]>
-#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}>
-module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} {
-  func.func @npu_dma_cpy_nd_invalid_addressing() {
-    amdaie.workgroup {
-      %c0 = arith.constant 0 : index
-      %c1 = arith.constant 1 : index
-      %c2 = arith.constant 2 : index
-      %c32 = arith.constant 32 : index
-      %c64 = arith.constant 64 : index
-      %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : memref<32x64xi32>
-      memref.assume_alignment %2, 64 : memref<32x64xi32>
-      %tile_0_0 = amdaie.tile(%c0, %c0)
-      %tile_0_1 = amdaie.tile(%c0, %c1)
-      %tile_0_2 = amdaie.tile(%c0, %c2)
-      %bd_id_0 = amdaie.bd_id(%tile_0_0, 0)
-      %alloc_1 = memref.alloc() : memref<32x32xi32, 1>
-      %alloc_2 = memref.alloc() : memref<4x8x4x8xi32, 2>
-      %placeholder = amdaie.logicalobjectfifo.placeholder{%tile_0_0} : !amdaie.logicalobjectfifo<memref<2048xi32>>
-      %obj1 = amdaie.logicalobjectfifo.from_memref %alloc_1, {%tile_0_1} : memref<32x32xi32, 1> -> !amdaie.logicalobjectfifo<memref<1024xi32, 1>>
-      %obj2 = amdaie.logicalobjectfifo.from_memref %alloc_2, {%tile_0_2} : memref<4x8x4x8xi32, 2> -> !amdaie.logicalobjectfifo<memref<1024xi32, 2>>
-      %dma0 = amdaie.circular_dma_cpy_nd(%obj1[] [] [], %obj2[] [] []) : (!amdaie.logicalobjectfifo<memref<1024xi32, 1>>, !amdaie.logicalobjectfifo<memref<1024xi32, 2>>)
-      %dma_target_l3 = amdaie.circular_dma_cpy_nd(%placeholder[] [] [], %obj1[] [] []) : (!amdaie.logicalobjectfifo<memref<2048xi32>>, !amdaie.logicalobjectfifo<memref<1024xi32, 1>>)
-      amdaie.logicalobjectfifo.link[%dma0] -> [%dma_target_l3] ()
-      memref.dealloc %alloc_2 : memref<4x8x4x8xi32, 2>
-      memref.dealloc %alloc_1 : memref<32x32xi32, 1>
-      // expected-error @+1 {{could not convert to AIEDialect ops}}
-      amdaie.controlcode {
-        %obj0 = amdaie.logicalobjectfifo.from_memref %2, {%tile_0_0} : memref<32x64xi32> -> !amdaie.logicalobjectfifo<memref<2048xi32>>
-        // expected-error @+1 {{op expected target addressing for DMA with target on L3}}
-        %npu_dma_0 = amdaie.npu.dma_cpy_nd %dma_target_l3(%obj0[] [] [] bd_id = %bd_id_0, [] [] []) : target_type = !amdaie.logicalobjectfifo<memref<2048xi32>>
-        amdaie.npu.dma_wait(%npu_dma_0, S2MM)
-        amdaie.end
-      }
-    }
-    return
-  }
-}
-
-// -----
-
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
-  #hal.descriptor_set.layout<0, bindings = [
-    #hal.descriptor_set.binding<2, storage_buffer>
-  ]>
-]>
-#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}>
-module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} {
-  func.func @npu_dma_cpy_nd_with_invalid_repeat() {
-    amdaie.workgroup {
-      %c0 = arith.constant 0 : index
-      %c1 = arith.constant 1 : index
-      %c2 = arith.constant 2 : index
-      %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : memref<32x64xi32>
-      memref.assume_alignment %2, 64 : memref<32x64xi32>
-      %tile_0_0 = amdaie.tile(%c0, %c0)
-      %tile_0_1 = amdaie.tile(%c0, %c1)
-      %tile_0_2 = amdaie.tile(%c0, %c2)
-      %bd_id_0 = amdaie.bd_id(%tile_0_0, 0)
-      %alloc_1 = memref.alloc() : memref<32x32xi32, 1>
-      %alloc_2 = memref.alloc() : memref<4x8x4x8xi32, 2>
-      %placeholder = amdaie.logicalobjectfifo.placeholder{%tile_0_0} : !amdaie.logicalobjectfifo<memref<2048xi32>>
-      %obj1 = amdaie.logicalobjectfifo.from_memref %alloc_1, {%tile_0_1} : memref<32x32xi32, 1> -> !amdaie.logicalobjectfifo<memref<1024xi32, 1>>
-      %obj2 = amdaie.logicalobjectfifo.from_memref %alloc_2, {%tile_0_2} : memref<4x8x4x8xi32, 2> -> !amdaie.logicalobjectfifo<memref<1024xi32, 2>>
-      %dma0 = amdaie.circular_dma_cpy_nd(%obj1[] [] [], %obj2[] [] []) : (!amdaie.logicalobjectfifo<memref<1024xi32, 1>>, !amdaie.logicalobjectfifo<memref<1024xi32, 2>>)
-      %dma_target_l3 = amdaie.circular_dma_cpy_nd(%placeholder[] [] [], %obj1[] [] []) : (!amdaie.logicalobjectfifo<memref<2048xi32>>, !amdaie.logicalobjectfifo<memref<1024xi32, 1>>)
-      amdaie.logicalobjectfifo.link[%dma0] -> [%dma_target_l3] ()
-      memref.dealloc %alloc_2 : memref<4x8x4x8xi32, 2>
-      memref.dealloc %alloc_1 : memref<32x32xi32, 1>
-      // expected-error @+1 {{could not convert to AIEDialect ops}}
-      amdaie.controlcode {
-        %obj0 = amdaie.logicalobjectfifo.from_memref %2, {%tile_0_0} : memref<32x64xi32> -> !amdaie.logicalobjectfifo<memref<2048xi32>>
-        // expected-error @+1 {{could not canonicalize for AIE}}
-        %npu_dma_0 = amdaie.npu.dma_cpy_nd %dma_target_l3(%obj0[0, 0, 0, 32] [1, 32, 2, 32] [0, 64, 0, 1] bd_id = %bd_id_0, [] [] []) : target_type = !amdaie.logicalobjectfifo<memref<2048xi32>>
-        amdaie.end
-      }
-    }
-    return
-  }
-}
-
-// -----
-
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
-  #hal.descriptor_set.layout<0, bindings = [
-    #hal.descriptor_set.binding<2, storage_buffer>
-  ]>
-]>
-#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}>
-module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} {
-  func.func @npu_dma_cpy_nd_with_multiple_repeat() {
-    amdaie.workgroup {
-      %c0 = arith.constant 0 : index
-      %c1 = arith.constant 1 : index
-      %c2 = arith.constant 2 : index
-      %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : memref<32x64xi32>
-      memref.assume_alignment %2, 64 : memref<32x64xi32>
-      %tile_0_0 = amdaie.tile(%c0, %c0)
-      %tile_0_1 = amdaie.tile(%c0, %c1)
-      %tile_0_2 = amdaie.tile(%c0, %c2)
-      %bd_id_0 = amdaie.bd_id(%tile_0_0, 0)
-      %alloc_1 = memref.alloc() : memref<32x32xi32, 1>
-      %alloc_2 = memref.alloc() : memref<4x8x4x8xi32, 2>
-      %placeholder = amdaie.logicalobjectfifo.placeholder{%tile_0_0} : !amdaie.logicalobjectfifo<memref<2048xi32>>
-      %obj1 = amdaie.logicalobjectfifo.from_memref %alloc_1, {%tile_0_1} : memref<32x32xi32, 1> -> !amdaie.logicalobjectfifo<memref<1024xi32, 1>>
-      %obj2 = amdaie.logicalobjectfifo.from_memref %alloc_2, {%tile_0_2} : memref<4x8x4x8xi32, 2> -> !amdaie.logicalobjectfifo<memref<1024xi32, 2>>
-      %dma0 = amdaie.circular_dma_cpy_nd(%obj1[] [] [], %obj2[] [] []) : (!amdaie.logicalobjectfifo<memref<1024xi32, 1>>, !amdaie.logicalobjectfifo<memref<1024xi32, 2>>)
-      %dma_target_l3 = amdaie.circular_dma_cpy_nd(%placeholder[] [] [], %obj1[] [] []) : (!amdaie.logicalobjectfifo<memref<2048xi32>>, !amdaie.logicalobjectfifo<memref<1024xi32, 1>>)
-      amdaie.logicalobjectfifo.link[%dma0] -> [%dma_target_l3] ()
-      memref.dealloc %alloc_2 : memref<4x8x4x8xi32, 2>
-      memref.dealloc %alloc_1 : memref<32x32xi32, 1>
-      // expected-error @+1 {{could not convert to AIEDialect ops}}
-      amdaie.controlcode {
-        %obj0 = amdaie.logicalobjectfifo.from_memref %2, {%tile_0_0} : memref<32x64xi32> -> !amdaie.logicalobjectfifo<memref<2048xi32>>
-        // expected-error @+1 {{could not canonicalize for AIE}}
-        %npu_dma_0 = amdaie.npu.dma_cpy_nd %dma_target_l3(%obj0[0, 0, 0, 32] [2, 8, 2, 32] [0, 0, 64, 1] bd_id = %bd_id_0, [] [] []) : target_type = !amdaie.logicalobjectfifo<memref<2048xi32>>
-        amdaie.end
-      }
-    }
-    return
-  }
-}
-
-// -----
-
-// CHECK:       aie.device
-// CHECK:       aiex.runtime_sequence @npu_dma_cpy_nd_with_repeat(%[[ARG0:.+]]: memref<32x64xi32>
-// CHECK:       aiex.npu.dma_memcpy_nd(0, 0, %[[ARG0]][0, 0, 0, 32][2, 1, 1, 32][0, 0, 0, 1])
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
-  #hal.descriptor_set.layout<0, bindings = [
-    #hal.descriptor_set.binding<2, storage_buffer>
-  ]>
-]>
-#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}>
-module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} {
-  func.func @npu_dma_cpy_nd_with_repeat() {
-    amdaie.workgroup {
-      %c0 = arith.constant 0 : index
-      %c1 = arith.constant 1 : index
-      %c2 = arith.constant 2 : index
-      %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : memref<32x64xi32>
-      memref.assume_alignment %2, 64 : memref<32x64xi32>
-      %tile_0_0 = amdaie.tile(%c0, %c0)
-      %tile_0_1 = amdaie.tile(%c0, %c1)
-      %tile_0_2 = amdaie.tile(%c0, %c2)
-      %bd_id_0 = amdaie.bd_id(%tile_0_0, 0)
-      %alloc_1 = memref.alloc() : memref<32x32xi32, 1>
-      %alloc_2 = memref.alloc() : memref<4x8x4x8xi32, 2>
-      %placeholder = amdaie.logicalobjectfifo.placeholder{%tile_0_0} : !amdaie.logicalobjectfifo<memref<2048xi32>>
-      %obj1 = amdaie.logicalobjectfifo.from_memref %alloc_1, {%tile_0_1} : memref<32x32xi32, 1> -> !amdaie.logicalobjectfifo<memref<1024xi32, 1>>
-      %obj2 = amdaie.logicalobjectfifo.from_memref %alloc_2, {%tile_0_2} : memref<4x8x4x8xi32, 2> -> !amdaie.logicalobjectfifo<memref<1024xi32, 2>>
-      %dma0 = amdaie.circular_dma_cpy_nd(%obj1[] [] [], %obj2[] [] []) : (!amdaie.logicalobjectfifo<memref<1024xi32, 1>>, !amdaie.logicalobjectfifo<memref<1024xi32, 2>>)
-      %dma_target_l3 = amdaie.circular_dma_cpy_nd(%placeholder[] [] [], %obj1[] [] []) : (!amdaie.logicalobjectfifo<memref<2048xi32>>, !amdaie.logicalobjectfifo<memref<1024xi32, 1>>)
-      amdaie.logicalobjectfifo.link[%dma0] -> [%dma_target_l3] ()
-      memref.dealloc %alloc_2 : memref<4x8x4x8xi32, 2>
-      memref.dealloc %alloc_1 : memref<32x32xi32, 1>
-      amdaie.controlcode {
-        %obj0 = amdaie.logicalobjectfifo.from_memref %2, {%tile_0_0} : memref<32x64xi32> -> !amdaie.logicalobjectfifo<memref<2048xi32>>
-        %npu_dma_0 = amdaie.npu.dma_cpy_nd %dma_target_l3(%obj0[0, 0, 32] [1, 2, 32] [0, 0, 1] bd_id = %bd_id_0, [] [] []) : target_type = !amdaie.logicalobjectfifo<memref<2048xi32>>
-        amdaie.end
-      }
-    }
-    return
-  }
-}
-
-// -----
-
 // CHECK:       aie.device
 // CHECK:       aiex.runtime_sequence @npu_dma_cpy_nd_with_repeat_already_on_outer_dim(%[[ARG0:.+]]: memref<32x64xi32>
 // CHECK:       aiex.npu.dma_memcpy_nd(0, 0, %[[ARG0]][0, 0, 0, 32][2, 1, 2, 32][2, 0, 16, 1])
@@ -689,50 +518,41 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb}
 // CHECK-SAME:            issue_token = true
 // CHECK-SAME:            metadata = @[[OBJ2]]
 // CHECK-NEXT:    aiex.npu.dma_wait {symbol = @[[OBJ2]]}
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
-  #hal.descriptor_set.layout<0, bindings = [
-    #hal.descriptor_set.binding<2, storage_buffer>
-  ]>
-]>
 #executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}>
+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<2, storage_buffer>]>]>
 module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} {
   func.func @controlcode() {
+    %c2 = arith.constant 2 : index
+    %c1 = arith.constant 1 : index
+    %c0 = arith.constant 0 : index
     amdaie.workgroup {
-      %c0 = arith.constant 0 : index
-      %c1 = arith.constant 1 : index
-      %c2 = arith.constant 2 : index
-      %c32 = arith.constant 32 : index
-      %c64 = arith.constant 64 : index
-      %c1024 = arith.constant 1024 : index
-      %c2048 = arith.constant 2048 : index
-      %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : memref<32x64xi32>
-      memref.assume_alignment %2, 64 : memref<32x64xi32>
-      %tile_0_0 = amdaie.tile(%c0, %c0)
-      %tile_0_1 = amdaie.tile(%c0, %c1)
-      %tile_0_2 = amdaie.tile(%c0, %c2)
-      %bd_id_0 = amdaie.bd_id(%tile_0_0, 0)
-      %alloc_1 = memref.alloc() : memref<32x32xi32, 1>
+      %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : memref<32x64xi32>
+      memref.assume_alignment %0, 64 : memref<32x64xi32>
+      %tile = amdaie.tile(%c0, %c0)
+      %tile_0 = amdaie.tile(%c0, %c1)
+      %tile_1 = amdaie.tile(%c0, %c2)
+      %bd_id = amdaie.bd_id(%tile, 0)
+      %alloc = memref.alloc() : memref<32x32xi32, 1>
       %alloc_2 = memref.alloc() : memref<4x8x4x8xi32, 2>
-      %placeholder = amdaie.logicalobjectfifo.placeholder{%tile_0_0} : !amdaie.logicalobjectfifo<memref<2048xi32>>
-      %obj1 = amdaie.logicalobjectfifo.from_memref %alloc_1, {%tile_0_1} : memref<32x32xi32, 1> -> !amdaie.logicalobjectfifo<memref<1024xi32, 1>>
-      %obj2 = amdaie.logicalobjectfifo.from_memref %alloc_2, {%tile_0_2} : memref<4x8x4x8xi32, 2> -> !amdaie.logicalobjectfifo<memref<1024xi32, 2>>
-      %dma0 = amdaie.circular_dma_cpy_nd(%obj1[] [] [], %obj2[] [] []) : (!amdaie.logicalobjectfifo<memref<1024xi32, 1>>, !amdaie.logicalobjectfifo<memref<1024xi32, 2>>)
-      %dma_target_l3 = amdaie.circular_dma_cpy_nd(%placeholder[] [] [], %obj1[] [] []) : (!amdaie.logicalobjectfifo<memref<2048xi32>>, !amdaie.logicalobjectfifo<memref<1024xi32, 1>>)
-      %dma_source_l3 = amdaie.circular_dma_cpy_nd(%obj1[] [] [], %placeholder[] [] []) : (!amdaie.logicalobjectfifo<memref<1024xi32, 1>>, !amdaie.logicalobjectfifo<memref<2048xi32>>)
-      %dma1 = amdaie.circular_dma_cpy_nd(%placeholder[] [] [], %obj1[] [] []) : (!amdaie.logicalobjectfifo<memref<2048xi32>>, !amdaie.logicalobjectfifo<memref<1024xi32, 1>>)
-      amdaie.logicalobjectfifo.link[%dma0] -> [%dma_target_l3] ()
+      %1 = amdaie.logicalobjectfifo.placeholder{%tile} : !amdaie.logicalobjectfifo<memref<2048xi32>>
+      %2 = amdaie.logicalobjectfifo.from_memref %alloc, {%tile_0} : memref<32x32xi32, 1> -> !amdaie.logicalobjectfifo<memref<1024xi32, 1>>
+      %3 = amdaie.logicalobjectfifo.from_memref %alloc_2, {%tile_1} : memref<4x8x4x8xi32, 2> -> !amdaie.logicalobjectfifo<memref<1024xi32, 2>>
+      %4 = amdaie.circular_dma_cpy_nd(%2[] [] [], %3[] [] []) : (!amdaie.logicalobjectfifo<memref<1024xi32, 1>>, !amdaie.logicalobjectfifo<memref<1024xi32, 2>>)
+      %5 = amdaie.circular_dma_cpy_nd(%1[] [] [], %2[] [] []) : (!amdaie.logicalobjectfifo<memref<2048xi32>>, !amdaie.logicalobjectfifo<memref<1024xi32, 1>>)
+      %6 = amdaie.circular_dma_cpy_nd(%2[] [] [], %1[] [] []) : (!amdaie.logicalobjectfifo<memref<1024xi32, 1>>, !amdaie.logicalobjectfifo<memref<2048xi32>>)
+      amdaie.logicalobjectfifo.link[%4] -> [%5] ()
       memref.dealloc %alloc_2 : memref<4x8x4x8xi32, 2>
-      memref.dealloc %alloc_1 : memref<32x32xi32, 1>
+      memref.dealloc %alloc : memref<32x32xi32, 1>
       amdaie.controlcode {
-        %obj0 = amdaie.logicalobjectfifo.from_memref %2, {%tile_0_0} : memref<32x64xi32> -> !amdaie.logicalobjectfifo<memref<2048xi32>>
-        %npu_dma_0 = amdaie.npu.dma_cpy_nd %dma_target_l3(%obj0[%c0, %c32] [%c32, %c32] [%c64, %c1] bd_id = %bd_id_0, [] [] []) : target_type = !amdaie.logicalobjectfifo<memref<2048xi32>>
-        amdaie.npu.dma_wait(%npu_dma_0, S2MM)
-        %npu_dma_1 = amdaie.npu.dma_cpy_nd %dma_target_l3(%obj0[%c0] [%c1024] [%c1] bd_id = %bd_id_0, [] [] []) : target_type = !amdaie.logicalobjectfifo<memref<2048xi32>>
-        amdaie.npu.dma_wait(%npu_dma_1, S2MM)
-        %npu_dma_2 = amdaie.npu.dma_cpy_nd %dma_source_l3([] [] [], %obj0[%c0, %c32] [%c32, %c32] [%c64, %c1] bd_id = %bd_id_0) : source_type = !amdaie.logicalobjectfifo<memref<2048xi32>>
-        amdaie.npu.dma_wait(%npu_dma_2, MM2S)
-        %npu_dma_3 = amdaie.npu.dma_cpy_nd %dma_source_l3([] [] [], %obj0[%c0] [%c2048] [%c1] bd_id = %bd_id_0) : source_type = !amdaie.logicalobjectfifo<memref<2048xi32>>
-        amdaie.npu.dma_wait(%npu_dma_3, MM2S)
+        %7 = amdaie.logicalobjectfifo.from_memref %0, {%tile} : memref<32x64xi32> -> !amdaie.logicalobjectfifo<memref<2048xi32>>
+        %8 = amdaie.npu.dma_cpy_nd %5(%7[0, 0, 0, 32] [1, 1, 32, 32] [0, 0, 64, 1] bd_id = %bd_id, [] [] []) : target_type = !amdaie.logicalobjectfifo<memref<2048xi32>>
+        amdaie.npu.dma_wait(%8, S2MM)
+        %9 = amdaie.npu.dma_cpy_nd %5(%7[0, 0, 0, 0] [1, 1, 1, 1024] [0, 0, 0, 1] bd_id = %bd_id, [] [] []) : target_type = !amdaie.logicalobjectfifo<memref<2048xi32>>
+        amdaie.npu.dma_wait(%9, S2MM)
+        %10 = amdaie.npu.dma_cpy_nd %6([] [] [], %7[0, 0, 0, 32] [1, 1, 32, 32] [0, 0, 64, 1] bd_id = %bd_id) : source_type = !amdaie.logicalobjectfifo<memref<2048xi32>>
+        amdaie.npu.dma_wait(%10, MM2S)
+        %11 = amdaie.npu.dma_cpy_nd %6([] [] [], %7[0, 0, 0, 0] [1, 1, 1, 2048] [0, 0, 0, 1] bd_id = %bd_id) : source_type = !amdaie.logicalobjectfifo<memref<2048xi32>>
+        amdaie.npu.dma_wait(%11, MM2S)
         amdaie.end
       }
     }
@@ -765,25 +585,13 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb}
 // CHECK-SAME:          %[[LHS]][0, 0, 0, 0][1, 1, 1, 1024][0, 0, 0, 1]
 // CHECK-SAME:          metadata = @[[OBJ0]]
 // CHECK-SAME:          memref<32x32xbf16>
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
-  #hal.descriptor_set.layout<0, bindings = [
-    #hal.descriptor_set.binding<0, storage_buffer>,
-    #hal.descriptor_set.binding<1, storage_buffer>,
-    #hal.descriptor_set.binding<2, storage_buffer>
-  ]>
-]>
 #executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}>
+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer>, <1, storage_buffer>, <2, storage_buffer>]>]>
 module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} {
   func.func @bf16_f32_lit_test() {
+    %c1 = arith.constant 1 : index
+    %c0 = arith.constant 0 : index
     amdaie.workgroup {
-      %c0 = arith.constant 0 : index
-      %c1 = arith.constant 1 : index
-      %c2 = arith.constant 2 : index
-      %c32 = arith.constant 32 : index
-      %c16 = arith.constant 16 : index
-      %c512 = arith.constant 512 : index
-      %c256 = arith.constant 256 : index
-      %c1024 = arith.constant 1024 : index
       %alloc = memref.alloc() : memref<2x2x16x16xf32, 1 : i32>
       %alloc_0 = memref.alloc() : memref<1x2x32x16xbf16, 1 : i32>
       %tile = amdaie.tile(%c0, %c1)
@@ -794,28 +602,28 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb}
       %tile_1 = amdaie.tile(%c0, %c0)
       %tile_2 = amdaie.tile(%c1, %c0)
       %bd_id = amdaie.bd_id(%tile_1, 2)
-      %bd_id_2 = amdaie.bd_id(%tile_1, 1)
-      %bd_id_3 = amdaie.bd_id(%tile_1, 0)
-      %placeholder0 = amdaie.logicalobjectfifo.placeholder{%tile_1} : !amdaie.logicalobjectfifo<memref<32x32xbf16>>
+      %bd_id_3 = amdaie.bd_id(%tile_1, 1)
+      %bd_id_4 = amdaie.bd_id(%tile_1, 0)
+      %4 = amdaie.logicalobjectfifo.placeholder{%tile_1} : !amdaie.logicalobjectfifo<memref<32x32xbf16>>
       memref.assume_alignment %3, 64 : memref<32x32xbf16>
       %5 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : memref<32x32xbf16>
-      %placeholder1 = amdaie.logicalobjectfifo.placeholder{%tile_2} : !amdaie.logicalobjectfifo<memref<32x32xbf16>>
+      %6 = amdaie.logicalobjectfifo.placeholder{%tile_2} : !amdaie.logicalobjectfifo<memref<32x32xbf16>>
       memref.assume_alignment %5, 64 : memref<32x32xbf16>
       %7 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : memref<32x32xf32>
-      %placeholder2 = amdaie.logicalobjectfifo.placeholder{%tile_2} : !amdaie.logicalobjectfifo<memref<1024xf32>>
-      %9 = amdaie.circular_dma_cpy_nd(%2[] [] [], %placeholder0[] [] []) : (!amdaie.logicalobjectfifo<memref<2x1x16x32xbf16, 1 : i32>, 2>, !amdaie.logicalobjectfifo<memref<32x32xbf16>>)
-      %10 = amdaie.circular_dma_cpy_nd(%1[] [] [], %placeholder1[] [] []) : (!amdaie.logicalobjectfifo<memref<1x2x32x16xbf16, 1 : i32>, 2>, !amdaie.logicalobjectfifo<memref<32x32xbf16>>)
-      %11 = amdaie.circular_dma_cpy_nd(%placeholder2[] [] [], %0[%c0, %c0, %c0, %c0] [%c2, %c16, %c2, %c16] [%c512, %c16, %c256, %c1]) : (!amdaie.logicalobjectfifo<memref<1024xf32>>, !amdaie.logicalobjectfifo<memref<2x2x16x16xf32, 1 : i32>, 2>)
+      %8 = amdaie.logicalobjectfifo.placeholder{%tile_2} : !amdaie.logicalobjectfifo<memref<1024xf32>>
+      %9 = amdaie.circular_dma_cpy_nd(%2[] [] [], %4[] [] []) : (!amdaie.logicalobjectfifo<memref<2x1x16x32xbf16, 1 : i32>, 2>, !amdaie.logicalobjectfifo<memref<32x32xbf16>>)
+      %10 = amdaie.circular_dma_cpy_nd(%1[] [] [], %6[] [] []) : (!amdaie.logicalobjectfifo<memref<1x2x32x16xbf16, 1 : i32>, 2>, !amdaie.logicalobjectfifo<memref<32x32xbf16>>)
+      %11 = amdaie.circular_dma_cpy_nd(%8[] [] [], %0[0, 0, 0, 0] [2, 16, 2, 16] [512, 16, 256, 1]) : (!amdaie.logicalobjectfifo<memref<1024xf32>>, !amdaie.logicalobjectfifo<memref<2x2x16x16xf32, 1 : i32>, 2>)
       amdaie.controlcode {
-        %obj0 = amdaie.logicalobjectfifo.from_memref %3, {%tile_1} : memref<32x32xbf16> -> !amdaie.logicalobjectfifo<memref<32x32xbf16>>
-        %obj1 = amdaie.logicalobjectfifo.from_memref %5, {%tile_1} : memref<32x32xbf16> -> !amdaie.logicalobjectfifo<memref<32x32xbf16>>
-        %obj2 = amdaie.logicalobjectfifo.from_memref %7, {%tile_1} : memref<32x32xf32> -> !amdaie.logicalobjectfifo<memref<1024xf32>>
-        %12 = amdaie.npu.dma_cpy_nd %11(%obj2[%c0] [%c1024] [%c1] bd_id = %bd_id_3, [] [] []) : target_type = !amdaie.logicalobjectfifo<memref<1024xf32>>
-        %13 = amdaie.npu.dma_cpy_nd %10([] [] [], %obj1[%c0, %c1, %c2] [%c2, %c32, %c16] [%c16, %c32, %c1] bd_id = %bd_id_2) : source_type = !amdaie.logicalobjectfifo<memref<32x32xbf16>>
-        %14 = amdaie.npu.dma_cpy_nd %9([] [] [], %obj0[%c0] [%c1024] [%c1] bd_id = %bd_id) : source_type = !amdaie.logicalobjectfifo<memref<32x32xbf16>>
-        amdaie.npu.dma_wait(%12, S2MM)
-        amdaie.npu.dma_wait(%13, MM2S)
-        amdaie.npu.dma_wait(%14, MM2S)
+        %12 = amdaie.logicalobjectfifo.from_memref %3, {%tile_1} : memref<32x32xbf16> -> !amdaie.logicalobjectfifo<memref<32x32xbf16>>
+        %13 = amdaie.logicalobjectfifo.from_memref %5, {%tile_1} : memref<32x32xbf16> -> !amdaie.logicalobjectfifo<memref<32x32xbf16>>
+        %14 = amdaie.logicalobjectfifo.from_memref %7, {%tile_1} : memref<32x32xf32> -> !amdaie.logicalobjectfifo<memref<1024xf32>>
+        %15 = amdaie.npu.dma_cpy_nd %11(%14[0, 0, 0, 0] [1, 1, 1, 1024] [0, 0, 0, 1] bd_id = %bd_id_4, [] [] []) : target_type = !amdaie.logicalobjectfifo<memref<1024xf32>>
+        %16 = amdaie.npu.dma_cpy_nd %10([] [] [], %13[0, 0, 1, 2] [1, 2, 32, 16] [0, 16, 32, 1] bd_id = %bd_id_3) : source_type = !amdaie.logicalobjectfifo<memref<32x32xbf16>>
+        %17 = amdaie.npu.dma_cpy_nd %9([] [] [], %12[0, 0, 0, 0] [1, 1, 1, 1024] [0, 0, 0, 1] bd_id = %bd_id) : source_type = !amdaie.logicalobjectfifo<memref<32x32xbf16>>
+        amdaie.npu.dma_wait(%15, S2MM)
+        amdaie.npu.dma_wait(%16, MM2S)
+        amdaie.npu.dma_wait(%17, MM2S)
         amdaie.end
       }
     }
@@ -823,52 +631,8 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb}
   }
 }
 
-// -----
 
-// Test to demonstrate invalid implicit L3 memref type that has rank greater than that
-// expected for static offsets/sizes/strides.
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
-  #hal.descriptor_set.layout<0, bindings = [
-    #hal.descriptor_set.binding<2, storage_buffer>
-  ]>
-]>
-#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}>
-module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} {
-  func.func @controlcode_invalid_implicit_l3_memref() {
-    amdaie.workgroup {
-      %c0 = arith.constant 0 : index
-      %c1 = arith.constant 1 : index
-      %c2 = arith.constant 2 : index
-      %c32 = arith.constant 32 : index
-      %c64 = arith.constant 64 : index
-      %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : memref<32x16x64x128x32xi32>
-      memref.assume_alignment %2, 64 : memref<32x16x64x128x32xi32>
-      %tile_0_0 = amdaie.tile(%c0, %c0)
-      %tile_0_1 = amdaie.tile(%c0, %c1)
-      %tile_0_2 = amdaie.tile(%c0, %c2)
-      %bd_id_0 = amdaie.bd_id(%tile_0_0, 0)
-      %alloc_1 = memref.alloc() : memref<32x32xi32, 1>
-      %alloc_2 = memref.alloc() : memref<4x8x4x8xi32, 2>
-      %placeholder = amdaie.logicalobjectfifo.placeholder{%tile_0_0} : !amdaie.logicalobjectfifo<memref<32x16x64x128x32xi32>>
-      %obj1 = amdaie.logicalobjectfifo.from_memref %alloc_1, {%tile_0_1} : memref<32x32xi32, 1> -> !amdaie.logicalobjectfifo<memref<1024xi32, 1>>
-      %obj2 = amdaie.logicalobjectfifo.from_memref %alloc_2, {%tile_0_2} : memref<4x8x4x8xi32, 2> -> !amdaie.logicalobjectfifo<memref<1024xi32, 2>>
-      %dma0 = amdaie.circular_dma_cpy_nd(%obj1[] [] [], %obj2[] [] []) : (!amdaie.logicalobjectfifo<memref<1024xi32, 1>>, !amdaie.logicalobjectfifo<memref<1024xi32, 2>>)
-      %dma_target_l3 = amdaie.circular_dma_cpy_nd(%placeholder[] [] [], %obj1[] [] []) : (!amdaie.logicalobjectfifo<memref<32x16x64x128x32xi32>>, !amdaie.logicalobjectfifo<memref<1024xi32, 1>>)
-      amdaie.logicalobjectfifo.link[%dma0] -> [%dma_target_l3] ()
-      memref.dealloc %alloc_2 : memref<4x8x4x8xi32, 2>
-      memref.dealloc %alloc_1 : memref<32x32xi32, 1>
-      // expected-error @+1 {{could not convert to AIEDialect ops}}
-      amdaie.controlcode {
-        %obj0 = amdaie.logicalobjectfifo.from_memref %2, {%tile_0_0} : memref<32x16x64x128x32xi32> -> !amdaie.logicalobjectfifo<memref<32x16x64x128x32xi32>>
-        // expected-error @+1 {{op expected target addressing for DMA with target on L3}}
-        %npu_dma_1 = amdaie.npu.dma_cpy_nd %dma_target_l3(%obj0[] [] [] bd_id = %bd_id_0, [] [] []) : target_type = !amdaie.logicalobjectfifo<memref<32x16x64x128x32xi32>>
-        amdaie.npu.dma_wait(%npu_dma_1, S2MM)
-        amdaie.end
-      }
-    }
-    return
-  }
-}
+
 
 // -----
 
@@ -877,15 +641,15 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb}
 // CHECK-DAG:   %[[TILE_0_2:.+]] = aie.tile(0, 2)
 // CHECK-DAG:   %[[TILE_0_1:.+]] = aie.tile(0, 1)
 // CHECK-DAG:   %[[TILE_0_0:.+]] = aie.tile(0, 0)
+// CHECK-DAG:   %[[C0:.+]] = arith.constant 0 : index
+// CHECK-DAG:   %[[C1:.+]] = arith.constant 1 : index
+// CHECK-DAG:   %[[C8:.+]] = arith.constant 8 : index
 // CHECK:       aie.objectfifo @[[OBJ0:.+]](%[[TILE_0_0]], {%[[TILE_0_1]]}
 // CHECK-NEXT:  aie.objectfifo @[[OBJ1:.+]](%[[TILE_0_1]], {%[[TILE_0_2]], %[[TILE_1_2]]}
-// CHECK-NEXT:  aie.objectfifo.link 
+// CHECK-NEXT:  aie.objectfifo.link
 // CHECK-SAME:  @[[OBJ0]]
 // CHECK-SAME:  @[[OBJ1]]
 // CHECK:       aie.core(%[[TILE_0_2]])
-// CHECK-DAG:     %[[C0:.+]] = arith.constant 0 : index
-// CHECK-DAG:     %[[C1:.+]] = arith.constant 1 : index
-// CHECK-DAG:     %[[C8:.+]] = arith.constant 8 : index
 // CHECK:         %[[ACQUIRE_0:.+]] = aie.objectfifo.acquire @[[OBJ1]](Consume, 1)
 // CHECK:         %[[ACCESS_0:.+]] = aie.objectfifo.subview.access %[[ACQUIRE_0]]
 // CHECK:         %[[REINTERPRET_0:.+]] = memref.reinterpret_cast %[[ACCESS_0]]
@@ -896,9 +660,6 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb}
 // CHECK:         aie.objectfifo.release
 // CHECK-SAME:    @[[OBJ1]]
 // CHECK:       aie.core(%[[TILE_1_2]])
-// CHECK-DAG:     %[[C0:.+]] = arith.constant 0 : index
-// CHECK-DAG:     %[[C1:.+]] = arith.constant 1 : index
-// CHECK-DAG:     %[[C8:.+]] = arith.constant 8 : index
 // CHECK:         %[[ACQUIRE_1:.+]] = aie.objectfifo.acquire @[[OBJ1]](Consume, 1)
 // CHECK:         %[[ACCESS_1:.+]] = aie.objectfifo.subview.access %[[ACQUIRE_1]]
 // CHECK:         %[[REINTERPRET_1:.+]] = memref.reinterpret_cast %[[ACCESS_1]]
@@ -919,66 +680,61 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb}
 // CHECK-SAME:    @[[OBJ0]]
 // CHECK-NEXT:    aiex.npu.dma_wait
 // CHECK-SAME:    @[[OBJ0]]
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
-  #hal.descriptor_set.layout<0, bindings = [
-    #hal.descriptor_set.binding<2, storage_buffer>
-  ]>
-]>
 #executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}>
+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<2, storage_buffer>]>]>
 module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} {
   func.func @large_example() {
+    %c8 = arith.constant 8 : index
+    %c2 = arith.constant 2 : index
+    %c1 = arith.constant 1 : index
+    %c0_i32 = arith.constant 0 : i32
+    %c0 = arith.constant 0 : index
     amdaie.workgroup {
-      %c0 = arith.constant 0 : index
-      %c0_i32 = arith.constant 0 : i32
-      %c1 = arith.constant 1 : index
-      %c2 = arith.constant 2 : index
-      %c8 = arith.constant 8 : index
-      %c32 = arith.constant 32 : index
-      %c64 = arith.constant 64 : index
-      %tile_0_0 = amdaie.tile(%c0, %c0)
-      %tile_0_1 = amdaie.tile(%c0, %c1)
-      %tile_0_2 = amdaie.tile(%c0, %c2)
-      %tile_1_2 = amdaie.tile(%c1, %c2)
-      %bd_id_0 = amdaie.bd_id(%tile_0_0, 0)
+      %tile = amdaie.tile(%c0, %c0)
+      %tile_0 = amdaie.tile(%c0, %c1)
+      %tile_1 = amdaie.tile(%c0, %c2)
+      %tile_2 = amdaie.tile(%c1, %c2)
+      %bd_id = amdaie.bd_id(%tile, 0)
       %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : memref<32x64xi32>
       memref.assume_alignment %0, 64 : memref<32x64xi32>
-      %alloc_1 = memref.alloc() : memref<32x32xi32, 1>
-      %alloc_2 = memref.alloc() : memref<4x8x4x8xi32, 2>
-      %placeholder = amdaie.logicalobjectfifo.placeholder{%tile_0_0} : !amdaie.logicalobjectfifo<memref<2048xi32>>
-      %obj1 = amdaie.logicalobjectfifo.from_memref %alloc_1, {%tile_0_1} : memref<32x32xi32, 1> -> !amdaie.logicalobjectfifo<memref<1024xi32, 1>>
-      %obj2 = amdaie.logicalobjectfifo.from_memref %alloc_2, {%tile_0_2, %tile_1_2} : memref<4x8x4x8xi32, 2> -> !amdaie.logicalobjectfifo<memref<1024xi32, 2>>
-      %dma0 = amdaie.circular_dma_cpy_nd(%obj1[] [] [], %placeholder[] [] []) : (!amdaie.logicalobjectfifo<memref<1024xi32, 1>>, !amdaie.logicalobjectfifo<memref<2048xi32>>)
-      %dma1 = amdaie.circular_dma_cpy_nd(%obj2[] [] [], %obj1[] [] []) : (!amdaie.logicalobjectfifo<memref<1024xi32, 2>>, !amdaie.logicalobjectfifo<memref<1024xi32, 1>>)
-      amdaie.logicalobjectfifo.link[%dma0] -> [%dma1] ()
-      %core_0_2 = amdaie.core(%tile_0_2, in : [%dma1], out : []) {
-        %1 = amdaie.logicalobjectfifo.acquire(%dma1, Consume) {size = 1 : i32} -> !amdaie.logicalobjectfifo<memref<1024xi32, 2>>
-        %2 = amdaie.logicalobjectfifo.access(%1, Read) : !amdaie.logicalobjectfifo<memref<1024xi32, 2>> -> memref<1024xi32, 2>
-        %3 = memref.reinterpret_cast %2 to offset: [0], sizes: [4, 8, 4, 8], strides: [256, 32, 8, 1] : memref<1024xi32, 2> to memref<4x8x4x8xi32, 2>
-        scf.for %arg2 = %c0 to %c8 step %c1  {
-          linalg.fill ins(%c0_i32 : i32) outs(%3 : memref<4x8x4x8xi32, 2>)
+      %alloc = memref.alloc() : memref<32x32xi32, 1>
+      %alloc_3 = memref.alloc() : memref<4x8x4x8xi32, 2>
+      %1 = amdaie.logicalobjectfifo.placeholder{%tile} : !amdaie.logicalobjectfifo<memref<2048xi32>>
+      %2 = amdaie.logicalobjectfifo.from_memref %alloc, {%tile_0} : memref<32x32xi32, 1> -> !amdaie.logicalobjectfifo<memref<1024xi32, 1>>
+      %3 = amdaie.logicalobjectfifo.from_memref %alloc_3, {%tile_1, %tile_2} : memref<4x8x4x8xi32, 2> -> !amdaie.logicalobjectfifo<memref<1024xi32, 2>>
+      %4 = amdaie.circular_dma_cpy_nd(%2[] [] [], %1[] [] []) : (!amdaie.logicalobjectfifo<memref<1024xi32, 1>>, !amdaie.logicalobjectfifo<memref<2048xi32>>)
+      %5 = amdaie.circular_dma_cpy_nd(%3[] [] [], %2[] [] []) : (!amdaie.logicalobjectfifo<memref<1024xi32, 2>>, !amdaie.logicalobjectfifo<memref<1024xi32, 1>>)
+      amdaie.logicalobjectfifo.link[%4] -> [%5] ()
+      %6 = amdaie.core(%tile_1, in : [%5], out : []) {
+        %8 = amdaie.logicalobjectfifo.acquire(%5, Consume) {size = 1 : i32} -> !amdaie.logicalobjectfifo<memref<1024xi32, 2>>
+        %9 = amdaie.logicalobjectfifo.access(%8, Read) : !amdaie.logicalobjectfifo<memref<1024xi32, 2>> -> memref<1024xi32, 2>
+        %reinterpret_cast = memref.reinterpret_cast %9 to offset: [0], sizes: [4, 8, 4, 8], strides: [256, 32, 8, 1] : memref<1024xi32, 2> to memref<4x8x4x8xi32, 2>
+        scf.for %arg0 = %c0 to %c8 step %c1 {
+          linalg.fill ins(%c0_i32 : i32) outs(%reinterpret_cast : memref<4x8x4x8xi32, 2>)
         }
-        amdaie.logicalobjectfifo.release(%dma1, Consume) {size = 1 : i32}
+        amdaie.logicalobjectfifo.release(%5, Consume) {size = 1 : i32}
         amdaie.end
       }
-      %core_1_2 = amdaie.core(%tile_1_2, in : [%dma1], out : []) {
-        %1 = amdaie.logicalobjectfifo.acquire(%dma1, Consume) {size = 1 : i32} -> !amdaie.logicalobjectfifo<memref<1024xi32, 2>>
-        %2 = amdaie.logicalobjectfifo.access(%1, Read) : !amdaie.logicalobjectfifo<memref<1024xi32, 2>> -> memref<1024xi32, 2>
-        %3 = memref.reinterpret_cast %2 to offset: [0], sizes: [4, 8, 4, 8], strides: [256, 32, 8, 1] : memref<1024xi32, 2> to memref<4x8x4x8xi32, 2>
-        scf.for %arg2 = %c0 to %c8 step %c1  {
-          linalg.fill ins(%c0_i32 : i32) outs(%3: memref<4x8x4x8xi32, 2>)
+      %7 = amdaie.core(%tile_2, in : [%5], out : []) {
+        %8 = amdaie.logicalobjectfifo.acquire(%5, Consume) {size = 1 : i32} -> !amdaie.logicalobjectfifo<memref<1024xi32, 2>>
+        %9 = amdaie.logicalobjectfifo.access(%8, Read) : !amdaie.logicalobjectfifo<memref<1024xi32, 2>> -> memref<1024xi32, 2>
+        %reinterpret_cast = memref.reinterpret_cast %9 to offset: [0], sizes: [4, 8, 4, 8], strides: [256, 32, 8, 1] : memref<1024xi32, 2> to memref<4x8x4x8xi32, 2>
+        scf.for %arg0 = %c0 to %c8 step %c1 {
+          linalg.fill ins(%c0_i32 : i32) outs(%reinterpret_cast : memref<4x8x4x8xi32, 2>)
         }
-        amdaie.logicalobjectfifo.release(%dma1, Consume) {size = 1 : i32}
+        amdaie.logicalobjectfifo.release(%5, Consume) {size = 1 : i32}
         amdaie.end
       }
-      memref.dealloc %alloc_2 : memref<4x8x4x8xi32, 2>
-      memref.dealloc %alloc_1 : memref<32x32xi32, 1>
+      memref.dealloc %alloc_3 : memref<4x8x4x8xi32, 2>
+      memref.dealloc %alloc : memref<32x32xi32, 1>
       amdaie.controlcode {
-        %obj0 = amdaie.logicalobjectfifo.from_memref %0, {%tile_0_0} : memref<32x64xi32> -> !amdaie.logicalobjectfifo<memref<2048xi32>>
-        %npu_dma = amdaie.npu.dma_cpy_nd %dma0([] [] [], %obj0[%c0, %c32] [%c32, %c32] [%c64, %c1] bd_id = %bd_id_0) : source_type = !amdaie.logicalobjectfifo<memref<2048xi32>>
-        amdaie.npu.dma_wait(%npu_dma, MM2S)
+        %8 = amdaie.logicalobjectfifo.from_memref %0, {%tile} : memref<32x64xi32> -> !amdaie.logicalobjectfifo<memref<2048xi32>>
+        %9 = amdaie.npu.dma_cpy_nd %4([] [] [], %8[0, 0, 0, 32] [1, 1, 32, 32] [0, 0, 64, 1] bd_id = %bd_id) : source_type = !amdaie.logicalobjectfifo<memref<2048xi32>>
+        amdaie.npu.dma_wait(%9, MM2S)
         amdaie.end
       }
     }
     return
   }
 }
+
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/sink_into_core.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/sink_into_core.mlir
new file mode 100644
index 000000000..78edbc23e
--- /dev/null
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/sink_into_core.mlir
@@ -0,0 +1,25 @@
+// RUN: iree-opt --split-input-file --pass-pipeline="builtin.module(iree-amdaie-sink-into-core)" %s | FileCheck %s
+
+module {
+  // @f0: %0 (%arg0 + 3) is defined outside the core but consumed by the fill inside it; the checks below expect the constant and the addi to be printed after the core op.
+  // CHECK-LABEL: func @f0
+  func.func @f0(%arg0: index) {
+    %c3 = arith.constant 3 : index
+    %c0 = arith.constant 0 : index
+    %c2 = arith.constant 2 : index
+    %0 = arith.addi %arg0, %c3 : index
+    %tile = amdaie.tile(%c0, %c2)
+
+    // CHECK: amdaie.core
+    %1 = amdaie.core(%tile, in : [], out : []) {
+
+      // CHECK: arith.constant 3 : index
+      // CHECK: arith.addi
+      // CHECK: linalg.fill
+      %alloc = memref.alloc() : memref<2x2xindex>
+      linalg.fill ins(%0 : index) outs(%alloc : memref<2x2xindex>)
+      amdaie.end
+    }
+    return
+  }
+}
diff --git a/tests/samples/matmul_peeled_objectfifo.mlir b/tests/samples/matmul_peeled_objectfifo.mlir
index 013bc863d..c0a3dbf2a 100644
--- a/tests/samples/matmul_peeled_objectfifo.mlir
+++ b/tests/samples/matmul_peeled_objectfifo.mlir
@@ -1,4 +1,4 @@
-// RUN: iree-opt --pass-pipeline="builtin.module(fold-memref-alias-ops,iree-amdaie-pack-to-dma,air-copy-to-dma,iree-amdaie-air-dma-to-amdaie-dma,iree-amdaie-insert-cores,cse,iree-amdaie-localize-logicalobjectfifo,iree-amdaie-distribute-cores-and-objectfifos,cse,canonicalize,iree-amdaie-dma-to-circular-dma,func.func(iree-amdaie-create-aie-workgroup),cse,iree-amdaie-hoist-logical-objectfifo,iree-amdaie-canonicalize-doubly-strided-op,iree-amdaie-flatten-logicalobjectfifo,iree-amdaie-access-to-acquire-release,cse,canonicalize,iree-amdaie-dma-loop-subsumption,cse,canonicalize,iree-amdaie-assign-npu-dma-bd-ids,iree-amdaie-controlcode-loop-unroll,cse,canonicalize,iree-amdaie-create-logical-objectfifo-link,iree-amdaie-canonicalize-doubly-strided-op,iree-amdaie-lower-to-aie,canonicalize)" --split-input-file %s | FileCheck %s
+// RUN: iree-opt --pass-pipeline="builtin.module(fold-memref-alias-ops,iree-amdaie-pack-to-dma,air-copy-to-dma,iree-amdaie-air-dma-to-amdaie-dma,iree-amdaie-insert-cores,cse,iree-amdaie-localize-logicalobjectfifo,iree-amdaie-distribute-cores-and-objectfifos,cse,canonicalize,iree-amdaie-dma-to-circular-dma,func.func(iree-amdaie-create-aie-workgroup),cse,iree-amdaie-canonicalize-doubly-strided-op,iree-amdaie-flatten-logicalobjectfifo,iree-amdaie-access-to-acquire-release,cse,canonicalize,iree-amdaie-dma-loop-subsumption,cse,canonicalize,iree-amdaie-assign-npu-dma-bd-ids,iree-amdaie-controlcode-loop-unroll,cse,canonicalize,iree-amdaie-create-logical-objectfifo-link,iree-amdaie-canonicalize-doubly-strided-op,canonicalize,iree-amdaie-canonicalize-npu-dma-cpy-nd,canonicalize,iree-amdaie-sink-into-core,canonicalize,iree-amdaie-lower-to-aie,canonicalize)" --split-input-file %s | FileCheck %s
 
 // CHECK:       aie.device(npu1_4col)
 // CHECK-DAG:   %[[TILE_0_2:.+]] = aie.tile(0, 2)
diff --git a/tests/samples/matmul_peeled_objectfifo_e2e.mlir b/tests/samples/matmul_peeled_objectfifo_e2e.mlir
index 484494045..06cf9de04 100644
--- a/tests/samples/matmul_peeled_objectfifo_e2e.mlir
+++ b/tests/samples/matmul_peeled_objectfifo_e2e.mlir
@@ -1,4 +1,4 @@
-// RUN: iree-compile --iree-hal-target-backends=amd-aie --compile-to=executable-sources --iree-amdaie-target-device=npu1_4col %s | iree-opt --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(iree-hal-translate-target-executable-variants{target=amd-aie})))" --iree-amdaie-lower-to-aie-pipeline=objectFifo --iree-amdaie-tile-pipeline=pack-peel --split-input-file | FileCheck %s
+// RUN: iree-compile --iree-hal-target-backends=amd-aie --compile-to=executable-sources --iree-amdaie-target-device=npu1_4col %s | iree-opt --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(iree-hal-translate-target-executable-variants{target=amd-aie})))" --iree-amdaie-lower-to-aie-pipeline=objectFifo --iree-amdaie-tile-pipeline=pack-peel --split-input-file | FileCheck %s
 
 // CHECK-LABEL: hal.executable.export public @matmul_i32_dispatch_0_matmul_128x128x256_i32