From 0ee892ff4e22a64bd527100ca93c999a8d616db6 Mon Sep 17 00:00:00 2001
From: Jorn Tuyls
Date: Tue, 9 Jul 2024 19:55:45 +0200
Subject: [PATCH] Add DMA loop subsumption transformation (#512)

Addresses https://github.com/nod-ai/iree-amd-aie/issues/495.

DMA loop iteration subsumption tries to move scf.for loops inside the
DMA operations by updating the DMA access patterns and hoisting the DMA
operations out of the loop. See the issue above for more details.
---
 .../AMD-AIE/iree-amd-aie/IR/AMDAIEOps.td      |  65 +-
 .../iree-amd-aie/IR/AMDAIETargetModel.h       |  63 ++
 .../AMD-AIE/iree-amd-aie/IR/CMakeLists.txt    |   1 +
 .../Transforms/AMDAIEDmaLoopSubsumption.cpp   | 433 +++++++++
 .../iree-amd-aie/Transforms/CMakeLists.txt    |   1 +
 .../iree-amd-aie/Transforms/PassDetail.h      |   1 +
 .../AMD-AIE/iree-amd-aie/Transforms/Passes.h  |   4 +
 .../AMD-AIE/iree-amd-aie/Transforms/Passes.td |   6 +
 .../Transforms/test/CMakeLists.txt            |   1 +
 .../Transforms/test/dma_loop_subsumption.mlir | 850 ++++++++++++++++++
 tests/samples/matmul_peeled_objectfifo.mlir   |   2 +-
 11 files changed, 1403 insertions(+), 24 deletions(-)
 create mode 100644 compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIETargetModel.h
 create mode 100644 compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDmaLoopSubsumption.cpp
 create mode 100644 compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/dma_loop_subsumption.mlir

diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.td b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.td
index 1bdccedc0..15c363603 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.td
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.td
@@ -241,28 +241,6 @@ def AMDAIE_NpuDmaCpyNdOp: AMDAIE_Op<"npu.dma_cpy_nd",
   let extraClassDeclaration = [{
     // Check whether this dma operation has a wait user.
     bool hasDmaWaitOpUser();
-
-    // Helper method to return the memory space of source as an integer. If
-    // no memory space attribute, this indicates a global memory space and
-    // we return 0. Else cast the memory space attribute to an integer.
-    uint64_t getSourceMemorySpaceAsUInt() {
-      MemRefType sourceMemrefType =
-          cast<LogicalObjectFifoType>(getDmaCpyNdOp().getSourceType())
-              .getElementType();
-      Attribute memSpace = sourceMemrefType.getMemorySpace();
-      return memSpace ? dyn_cast<IntegerAttr>(memSpace).getInt() : 0;
-    }
-
-    // Helper method to return the memory space of target as an integer. If
-    // no memory space attribute, this indicates a global memory space and
-    // we return 0. Else cast the memory space attribute to an integer.
-    uint64_t getTargetMemorySpaceAsUInt() {
-      MemRefType targetMemrefType =
-          cast<LogicalObjectFifoType>(getDmaCpyNdOp().getTargetType())
-              .getElementType();
-      Attribute memSpace = targetMemrefType.getMemorySpace();
-      return memSpace ? dyn_cast<IntegerAttr>(memSpace).getInt() : 0;
-    }

     // Check whether this operation has addressing on the source side.
     bool hasSourceAddressing() {
@@ -278,6 +256,47 @@ def AMDAIE_NpuDmaCpyNdOp: AMDAIE_Op<"npu.dma_cpy_nd",
     CircularDmaCpyNdOp getDmaCpyNdOp() {
       return dyn_cast<CircularDmaCpyNdOp>(getDma().getDefiningOp());
     }
+
+    // Return the source memref type. This is retrieved using information from
+    // the input DMA operation.
+    MemRefType getSourceMemrefType() {
+      return cast<LogicalObjectFifoType>(getDmaCpyNdOp().getSourceType())
+          .getElementType();
+    }
+
+    // Return the source memory space as an attribute.
+    Attribute getSourceMemorySpace() {
+      return getSourceMemrefType().getMemorySpace();
+    }
+
+    // Helper method to return the source memory space as an integer. If no
+    // memory space attribute, this indicates a global memory space and we
+    // return 0. Else cast the memory space attribute to an integer.
+    uint64_t getSourceMemorySpaceAsUInt() {
+      Attribute memSpace = getSourceMemorySpace();
+      return memSpace ? cast<IntegerAttr>(memSpace).getInt() : 0;
+    }
+
+    // Return the target memref type. This is retrieved using information from
+    // the input DMA operation.
+    MemRefType getTargetMemrefType() {
+      return cast<LogicalObjectFifoType>(getDmaCpyNdOp().getTargetType())
+          .getElementType();
+    }
+
+    // Return the target memory space as an attribute.
+    Attribute getTargetMemorySpace() {
+      return getTargetMemrefType().getMemorySpace();
+    }
+
+    // Helper method to return the target memory space as an integer. If no
+    // memory space attribute, this indicates a global memory space and we
+    // return 0. Else cast the memory space attribute to an integer.
+    uint64_t getTargetMemorySpaceAsUInt() {
+      Attribute memSpace = getTargetMemorySpace();
+      return memSpace ? cast<IntegerAttr>(memSpace).getInt() : 0;
+    }
+
     // A utility to create a new doubly strided operation from this one with a
     // new set of source and target offsets, sizes and strides.
     DoublyStridedOpInterface createDoublyStridedOp(
@@ -519,7 +538,7 @@ def AMDAIE_LogicalObjectFifoFromMemrefOp
     // Else cast the memory space attribute to an integer.
     uint64_t getMemorySpaceAsUInt() {
       Attribute memSpace = getMemorySpace();
-      return memSpace ? dyn_cast<IntegerAttr>(memSpace).getInt() : 0;
+      return memSpace ? cast<IntegerAttr>(memSpace).getInt() : 0;
     }
 
     // Return the source memref type.
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIETargetModel.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIETargetModel.h
new file mode 100644
index 000000000..a50c8d0f8
--- /dev/null
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIETargetModel.h
@@ -0,0 +1,63 @@
+// Copyright 2024 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// In the absence of a complete hardware model interface, this file contains
+// some constants to describe hardware-related parameters used in
+// transformations. This is meant to be temporary.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef IREE_COMPILER_AMDAIE_TARGET_MODEL_H_
+#define IREE_COMPILER_AMDAIE_TARGET_MODEL_H_
+
+#include <cstdint>
+
+namespace mlir::iree_compiler::AMDAIE {
+
+//===----------------------------------------------------------------------===//
+//
+// DMA iteration dimensions
+//
+// DMAs support multi-dimensional addressing through buffer descriptors in two
+// ways:
+// 1. Intra-iteration access pattern. Specified via 'strides' ('steps' in
+// buffer descriptor lingo), 'sizes' ('wraps' in buffer descriptor lingo) and
+// 'padding'. When a DMA executes a buffer descriptor, it will access the data
+// (read/write) as specified by the intra-iteration access pattern.
+// 2. Inter-iteration access pattern. Specified via an iteration 'stride',
+// 'size' and 'current_iteration' ('stride' is the same as 'stepsize' and
+// 'size' is the same as 'wrap' in buffer descriptor lingo). Here,
+// 'current_iteration' keeps track of the current execution iteration of the
+// buffer descriptor and is incremented after buffer descriptor execution. The
+// 'stride' is the offset to be used for each execution of the buffer
+// descriptor, relative to the previous one. When 'current_iteration' is equal
+// to 'size', 'current_iteration' is reset to zero.
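+//
+// As a small illustration (the numbers are chosen purely for exposition): an
+// intra-iteration access pattern with 'sizes' [8, 16] and 'strides' [16, 1],
+// combined with an inter-iteration 'stride' of 128 and 'size' of 4, executes
+// the same 8x16 strided access four times, with each execution offset by an
+// additional 128 elements relative to the previous one.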
+//
+// Although all DMAs use the same buffer descriptor format to describe the
+// execution configuration, the intra-iteration and inter-iteration dimensions
+// are typically used for different purposes on different DMAs. See for
+// example the usage of these constants inside the DMA loop subsumption pass.
+//
+//===----------------------------------------------------------------------===//
+
+/// Shim DMAs support 3 intra-iteration dimensions + 1 inter-iteration
+/// dimension.
+static const int64_t kAMDAIEShimDmaNbIntraDims = 3;
+static const int64_t kAMDAIEShimDmaNbInterDims = 1;
+
+/// MemTile DMAs support 4 intra-iteration dimensions + 1 inter-iteration
+/// dimension.
+static const int64_t kAMDAIEMemTileDmaNbIntraDims = 4;
+static const int64_t kAMDAIEMemTileDmaNbInterDims = 1;
+
+/// Core DMAs support 3 intra-iteration dimensions + 1 inter-iteration
+/// dimension.
+static const int64_t kAMDAIECoreDmaNbIntraDims = 3;
+static const int64_t kAMDAIECoreDmaNbInterDims = 1;
+
+}  // namespace mlir::iree_compiler::AMDAIE
+
+#endif  // IREE_COMPILER_AMDAIE_TARGET_MODEL_H_
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/CMakeLists.txt b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/CMakeLists.txt
index c669deaca..6cc4b4c05 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/CMakeLists.txt
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/CMakeLists.txt
@@ -15,6 +15,7 @@ iree_cc_library(
     "AMDAIEAttrs.h"
     "AMDAIEDialect.h"
     "AMDAIEOps.h"
+    "AMDAIETargetModel.h"
     "AMDAIETypes.h"
   TEXTUAL_HDRS
     "AMDAIEAttrs.cpp.inc"
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDmaLoopSubsumption.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDmaLoopSubsumption.cpp
new file mode 100644
index 000000000..6c774b593
--- /dev/null
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDmaLoopSubsumption.cpp
@@ -0,0 +1,433 @@
+// Copyright 2024 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the transformation that subsumes a loop iteration into a
+// DMA access pattern if possible. This adds an additional dimension to the
+// DMA's access pattern and hoists the DMA operation out of the loop. This
+// transformation is possible if:
+//
+// - The loop's bounds and step size are all constants.
+// - The DMA is only operated on once within the loop's scope. Otherwise,
+//   subsumption of the loop iteration into the DMA can change the temporal
+//   behaviour of the program.
+// - The DMA has additional available access pattern dimensions. This
+//   information is retrieved from a target hardware model.
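+//
+// As an illustrative sketch (types and access pattern values simplified for
+// exposition), a copy inside a loop such as:
+//
+//   scf.for %i = %c0 to %c6 step %c1 {
+//     %o = affine.apply affine_map<(d0) -> (d0 * 16)>(%i)
+//     amdaie.npu.dma_cpy_nd %dma([%o] [16] [1], [] [] [])
+//   }
+//
+// can be rewritten into a single hoisted copy with one additional access
+// pattern dimension:
+//
+//   amdaie.npu.dma_cpy_nd %dma([0, 0] [6, 16] [16, 1], [] [] [])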
+//
+//===----------------------------------------------------------------------===//
+
+#include "iree-amd-aie/IR/AMDAIEDialect.h"
+#include "iree-amd-aie/IR/AMDAIEOps.h"
+#include "iree-amd-aie/IR/AMDAIETargetModel.h"
+#include "iree-amd-aie/Transforms/Passes.h"
+#include "mlir/Dialect/Affine/IR/AffineOps.h"
+#include "mlir/Dialect/SCF/IR/SCF.h"
+#include "mlir/IR/AffineExprVisitor.h"
+#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
+
+#define DEBUG_TYPE "iree-amdaie-dma-loop-subsumption"
+
+namespace mlir::iree_compiler::AMDAIE {
+
+namespace {
+
+/// Return an ancestor of 'op' in 'block', or nullptr if no such ancestor.
+Operation *getAncestorInBlock(Operation *op, Block *block) {
+  if (!op || !block) return nullptr;
+  auto parent = op;
+  while (parent && (parent->getBlock() != block))
+    parent = parent->getParentOp();
+  return parent;
+}
+
+/// Utility affine expression visitor to retrieve the stride from the
+/// expression.
+struct RetrieveStrideSize : public AffineExprVisitor<RetrieveStrideSize> {
+  std::optional<int64_t> stride;
+  void visitMulExpr(AffineBinaryOpExpr expr) {
+    if (auto rhsSize = dyn_cast<AffineConstantExpr>(expr.getRHS());
+        rhsSize && isa<AffineDimExpr>(expr.getLHS())) {
+      stride = rhsSize.getValue();
+    } else if (auto lhsSize = dyn_cast<AffineConstantExpr>(expr.getLHS());
+               lhsSize && isa<AffineDimExpr>(expr.getRHS())) {
+      stride = lhsSize.getValue();
+    }
+  }
+};
+
+/// Utility to clean up the DMA users after loop subsumption + hoisting. This
+/// will hoist `amdaie.npu.dma_cpy_nd`'s users like `npu.dma_wait` as well.
+LogicalResult moveUsersToHoistedDMAScope(Operation *parentOp) {
+  IRRewriter rewriter(parentOp->getContext());
+  // Move `amdaie.npu.dma_wait` operation after the parent op in the same
+  // block as the input `amdaie.npu.dma_cpy_nd` operation. This parent op will
+  // typically be a loop out of which the DMA operation has been hoisted.
+  // Moving the wait operation after this loop is important to avoid a
+  // deadlock with whatever operations are still remaining inside the loop's
+  // scope.
+  WalkResult res = parentOp->walk([&](AMDAIE::NpuDmaWaitOp npuDmaWaitOp) {
+    Operation *dmaOp = npuDmaWaitOp.getDma().getDefiningOp();
+    Operation *ancestorInSameBlock =
+        getAncestorInBlock(npuDmaWaitOp, dmaOp->getBlock());
+    if (!ancestorInSameBlock) {
+      npuDmaWaitOp->emitOpError(
+          "doesn't have an ancestor in the same scope as the source DMA op");
+      return WalkResult::interrupt();
+    }
+    rewriter.moveOpAfter(npuDmaWaitOp, ancestorInSameBlock);
+    return WalkResult::advance();
+  });
+  if (res.wasInterrupted()) return failure();
+  return success();
+}
+
+class SubsumeLoopIntoDMA
+    : public OpInterfaceRewritePattern<AMDAIE::DoublyStridedOpInterface> {
+  using OpInterfaceRewritePattern::OpInterfaceRewritePattern;
+
+  /// Utility to add a loop iteration to an offsets/sizes/strides access
+  /// pattern.
+  LogicalResult addIterationToAccessPattern(
+      RewriterBase &rewriter, int64_t lowerBound, int64_t upperBound,
+      int64_t step, const DenseSet<Value> &inductionValues,
+      SmallVector<OpFoldResult> &newOffsets,
+      SmallVector<OpFoldResult> &newSizes,
+      SmallVector<OpFoldResult> &newStrides) const {
+    SmallVector<OpFoldResult> insertOffsets;
+    SmallVector<OpFoldResult> insertSizes;
+    SmallVector<OpFoldResult> insertStrides;
+    for (auto &&[i, offset] : llvm::enumerate(newOffsets)) {
+      Value offsetValue = getValueOrCreateConstantIndexOp(
+          rewriter, rewriter.getUnknownLoc(), offset);
+      if (inductionValues.contains(offsetValue)) {
+        // Initialize the offsetStride to 1. This handles the case where an
+        // induction variable is directly used as an offset inside a strided
+        // operation.
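+        // As a worked example (numbers chosen purely for illustration): for
+        // a loop with lower bound 1, upper bound 6 and step 2, and an offset
+        // produced by affine_map<(d0) -> (d0 * 16)>, the offset stride
+        // becomes 16, so for an existing stride of 1 the inserted stride
+        // becomes 1 * 16 * 2 = 32, the updated offset becomes 1 * 16 = 16 and
+        // the inserted size becomes ceilDiv(6 - 1, 2) = 3.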
+        int64_t offsetStride = 1;
+        // If the offset value is determined by an affine expression, retrieve
+        // the affine expression's stride multiplier and calculate the actual
+        // offset stride.
+        if (offsetValue.getDefiningOp() &&
+            isa<affine::AffineApplyOp>(offsetValue.getDefiningOp())) {
+          auto applyOp =
+              cast<affine::AffineApplyOp>(offsetValue.getDefiningOp());
+          // Retrieve the stride from the affine map using an affine
+          // expression visitor. This is the place where invalid maps are
+          // filtered out. Invalid cases will have
+          // `retriever.stride == nullopt` after visiting.
+          AffineMap affineMap = applyOp.getAffineMap();
+          RetrieveStrideSize retriever;
+          retriever.visit(affineMap.getResult(0));
+          if (!retriever.stride) return failure();
+          offsetStride *= retriever.stride.value();
+        }
+
+        // Multiplying by step size handles the non-normalized case.
+        int64_t stride =
+            getConstantIntValue(newStrides[i]).value() * offsetStride * step;
+
+        newOffsets[i] = getAsIndexOpFoldResult(rewriter.getContext(),
+                                               lowerBound * offsetStride);
+        insertOffsets.push_back(
+            getAsIndexOpFoldResult(rewriter.getContext(), 0));
+
+        // The size of the new dimension is equal to the number of loop
+        // iterations (ceilDiv(upperBound - lowerBound, step)).
+        int64_t diff = upperBound - lowerBound;
+        assert(diff > 0 &&
+               "expected positive difference between upper bound and lower "
+               "bound");
+        assert(step > 0 && "expected positive step");
+        int64_t newSize = 1 + ((diff - 1) / step);
+        insertSizes.push_back(
+            getAsIndexOpFoldResult(rewriter.getContext(), newSize));
+
+        insertStrides.push_back(
+            getAsIndexOpFoldResult(rewriter.getContext(), stride));
+      }
+    }
+    newOffsets.insert(newOffsets.begin(), insertOffsets.begin(),
+                      insertOffsets.end());
+    newSizes.insert(newSizes.begin(), insertSizes.begin(), insertSizes.end());
+    newStrides.insert(newStrides.begin(), insertStrides.begin(),
+                      insertStrides.end());
+    return success();
+  }
+
+  /// Rewrite function for a doubly strided operation with any loop-like
+  /// parent operation.
+  LogicalResult rewriteWithLoopLikeOpParent(
+      AMDAIE::DoublyStridedOpInterface op, PatternRewriter &rewriter,
+      size_t sourceMaxNbDims, size_t targetMaxNbDims,
+      const SmallVector<int64_t> &lowerBounds,
+      const SmallVector<int64_t> &upperBounds,
+      const SmallVector<int64_t> &steps,
+      const SmallVector<DenseSet<Value>> &inductionValues,
+      const DenseSet<Value> &allInductionValues) const {
+    auto loopOp = dyn_cast<LoopLikeOpInterface>(op->getParentOp());
+    if (!loopOp) return failure();
+
+    // Initialize new access pattern offsets/sizes/strides with current
+    // values.
+    SmallVector<OpFoldResult> newSourceOffsets = op.getSourceMixedOffsets();
+    SmallVector<OpFoldResult> newSourceSizes = op.getSourceMixedSizes();
+    SmallVector<OpFoldResult> newSourceStrides = op.getSourceMixedStrides();
+    SmallVector<OpFoldResult> newTargetOffsets = op.getTargetMixedOffsets();
+    SmallVector<OpFoldResult> newTargetSizes = op.getTargetMixedSizes();
+    SmallVector<OpFoldResult> newTargetStrides = op.getTargetMixedStrides();
+
+    // Use source/target maxNbDims to check whether there are sufficient
+    // source and target dimensions. Otherwise, abort.
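+    // For example (illustrative): a shim-side access pattern that already
+    // uses 4 dimensions (3 intra-iteration + 1 inter-iteration) has no
+    // dimension left to absorb a loop iteration, so the rewrite is skipped.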
+    auto verifyNbDimsNeeded = [&](const SmallVector<Value> &dynamicOffsets,
+                                  size_t nbOffsets,
+                                  size_t maxNbDims) -> LogicalResult {
+      size_t counter = 0;
+      for (Value offset : dynamicOffsets)
+        if (allInductionValues.contains(offset)) counter++;
+      if (nbOffsets + counter > maxNbDims) return failure();
+      return success();
+    };
+    SmallVector<Value> dynamicSourceOffsets = op.getSourceOffsets();
+    SmallVector<Value> dynamicTargetOffsets = op.getTargetOffsets();
+    if (failed(verifyNbDimsNeeded(dynamicSourceOffsets,
+                                  newSourceOffsets.size(), sourceMaxNbDims)))
+      return failure();
+    if (failed(verifyNbDimsNeeded(dynamicTargetOffsets,
+                                  newTargetOffsets.size(), targetMaxNbDims)))
+      return failure();
+
+    // Add the loop iterations to the DMA access patterns.
+    for (auto &&[lb, ub, step, iterationIvValues] : llvm::reverse(
+             llvm::zip(lowerBounds, upperBounds, steps, inductionValues))) {
+      // Add loop iteration to the access pattern on the source side.
+      if (failed(addIterationToAccessPattern(
+              rewriter, lb, ub, step, iterationIvValues, newSourceOffsets,
+              newSourceSizes, newSourceStrides))) {
+        return failure();
+      }
+      // Add loop iteration to the access pattern on the target side.
+      if (failed(addIterationToAccessPattern(
+              rewriter, lb, ub, step, iterationIvValues, newTargetOffsets,
+              newTargetSizes, newTargetStrides))) {
+        return failure();
+      }
+    }
+
+    assert(newSourceOffsets.size() == newSourceSizes.size() &&
+           "expected same number of source offsets and sizes");
+    assert(newSourceOffsets.size() == newSourceStrides.size() &&
+           "expected same number of source offsets and strides");
+    assert(newTargetOffsets.size() == newTargetSizes.size() &&
+           "expected same number of target offsets and sizes");
+    assert(newTargetOffsets.size() == newTargetStrides.size() &&
+           "expected same number of target offsets and strides");
+
+    // Create new doubly strided operation with the updated access pattern and
+    // move it before the loop.
+    rewriter.setInsertionPoint(loopOp);
+    auto newDoublyStridedOp = op.createDoublyStridedOp(
+        rewriter, newTargetOffsets, newTargetSizes, newTargetStrides,
+        newSourceOffsets, newSourceSizes, newSourceStrides);
+    rewriter.replaceOp(op, newDoublyStridedOp.getOperation());
+    return success();
+  }
+
+  /// Main rewrite function for a doubly strided operation with a `scf.for`
+  /// parent operation. Only handle a loop induction variable with an
+  /// optional `affine.apply` user for now.
+  LogicalResult rewriteWithForOpParent(AMDAIE::DoublyStridedOpInterface op,
+                                       PatternRewriter &rewriter,
+                                       size_t sourceMaxNbDims,
+                                       size_t targetMaxNbDims) const {
+    auto forOp = dyn_cast<scf::ForOp>(op->getParentOp());
+    if (!forOp) return failure();
+
+    // Dynamic bounds or step are not supported.
+    std::optional<int64_t> lowerBound =
+        getConstantIntValue(forOp.getLowerBound());
+    std::optional<int64_t> upperBound =
+        getConstantIntValue(forOp.getUpperBound());
+    std::optional<int64_t> step = getConstantIntValue(forOp.getStep());
+    if (!lowerBound || !upperBound || !step) return failure();
+
+    // Only handle loop induction variable with an optional `affine.apply`
+    // user for now.
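+    // That is, patterns such as the following are supported (sketch for
+    // illustration only):
+    //
+    //   scf.for %i = %c0 to %c6 step %c1 {
+    //     %0 = affine.apply affine_map<(d0) -> (d0 * 16)>(%i)
+    //     ... doubly strided op using %i and/or %0 as an offset ...
+    //   }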
+    Value iv = forOp.getInductionVar();
+    DenseSet<Value> curIvValues = {iv};
+    for (OpOperand &use : iv.getUses()) {
+      if (!use.getOwner()) continue;
+      if (auto userApplyOp =
+              dyn_cast<affine::AffineApplyOp>(use.getOwner())) {
+        curIvValues.insert(userApplyOp.getResult());
+      }
+    }
+    if (!llvm::any_of(op->getOperands(), [&](Value operand) {
+          return curIvValues.contains(operand);
+        })) {
+      return failure();
+    }
+
+    SmallVector<int64_t> lowerBounds = {lowerBound.value()};
+    SmallVector<int64_t> upperBounds = {upperBound.value()};
+    SmallVector<int64_t> steps = {step.value()};
+    SmallVector<DenseSet<Value>> inductionValues = {curIvValues};
+    return rewriteWithLoopLikeOpParent(
+        op, rewriter, sourceMaxNbDims, targetMaxNbDims, lowerBounds,
+        upperBounds, steps, inductionValues, curIvValues);
+  }
+
+  /// Main rewrite function for a doubly strided operation with a `scf.forall`
+  /// parent operation. Only handle loop induction variables with an
+  /// optional `affine.apply` user for now.
+  LogicalResult rewriteWithForallOpParent(AMDAIE::DoublyStridedOpInterface op,
+                                          PatternRewriter &rewriter,
+                                          size_t sourceMaxNbDims,
+                                          size_t targetMaxNbDims) const {
+    auto forallOp = dyn_cast<scf::ForallOp>(op->getParentOp());
+    if (!forallOp) return failure();
+
+    // Dynamic bounds or step are not supported.
+    std::optional<SmallVector<int64_t>> lowerBounds =
+        getConstantIntValues(forallOp.getMixedLowerBound());
+    std::optional<SmallVector<int64_t>> upperBounds =
+        getConstantIntValues(forallOp.getMixedUpperBound());
+    std::optional<SmallVector<int64_t>> steps =
+        getConstantIntValues(forallOp.getMixedStep());
+    if (!lowerBounds || !upperBounds || !steps) return failure();
+
+    // A set of all induction variables and optional `affine.apply` user
+    // values for easy verification whether any of the induction variables or
+    // `affine.apply` values are being used.
+    DenseSet<Value> allInductionValues;
+    // A vector of all induction variable dependent values for each induction
+    // var. Includes the induction variable itself and any `affine.apply`
+    // users.
+    SmallVector<DenseSet<Value>> inductionValues;
+    for (Value iv : forallOp.getInductionVars()) {
+      DenseSet<Value> curIvValues = {iv};
+      allInductionValues.insert(iv);
+      for (Operation *userOp : iv.getUsers()) {
+        if (auto userApplyOp = dyn_cast<affine::AffineApplyOp>(userOp)) {
+          curIvValues.insert(userApplyOp.getResult());
+          allInductionValues.insert(userApplyOp.getResult());
+        }
+      }
+      inductionValues.push_back(curIvValues);
+    }
+    // Return early if the strided operation doesn't use any of the
+    // induction variable dependent values.
+    if (!llvm::any_of(op->getOperands(), [&](Value operand) {
+          return allInductionValues.contains(operand);
+        })) {
+      return failure();
+    }
+    return rewriteWithLoopLikeOpParent(op, rewriter, sourceMaxNbDims,
+                                       targetMaxNbDims, lowerBounds.value(),
+                                       upperBounds.value(), steps.value(),
+                                       inductionValues, allInductionValues);
+  }
+
+  LogicalResult matchAndRewrite(AMDAIE::DoublyStridedOpInterface op,
+                                PatternRewriter &rewriter) const override {
+    // Depending on the DMA being targeted, there can be a different number of
+    // max dimensions supported by the hardware. Consider the different cases
+    // for Shim, MemTile and core DMAs:
+    // - Shim DMAs: As the shim DMA typically isn't synchronized with other
+    //   DMAs (through semaphore locks), the inter-iteration access pattern is
+    //   typically used as an additional intra-iteration access pattern,
+    //   resulting in 4 DMA dimensions which can be used to address global
+    //   memory data.
+    // - MemTile DMAs: As the MemTile DMAs are typically synchronized with
+    //   other DMAs for stream-through, double-buffering purposes, the
+    //   inter-iteration dimension can't typically be used in the same way as
+    //   the intra-iteration dimensions. Therefore, for now, only the
+    //   intra-iteration dimensions can be used for DMA access patterns.
+    // - Core DMAs: As the core DMAs are typically synchronized with the core
+    //   processor for data access purposes (read/write), the inter-iteration
+    //   dimension can't typically be used in the same way as the
+    //   intra-iteration dimensions. Therefore, for now, only the
+    //   intra-iteration dimensions can be used for DMA access patterns.
+    size_t sourceMaxNbDims{0};
+    size_t targetMaxNbDims{0};
+    if (auto npuDmaOp = dyn_cast<AMDAIE::NpuDmaCpyNdOp>(op.getOperation())) {
+      uint64_t sourceMemspaceInt = npuDmaOp.getSourceMemorySpaceAsUInt();
+      uint64_t targetMemspaceInt = npuDmaOp.getTargetMemorySpaceAsUInt();
+      if (sourceMemspaceInt == 0) {
+        sourceMaxNbDims =
+            kAMDAIEShimDmaNbIntraDims + kAMDAIEShimDmaNbInterDims;
+      } else if (sourceMemspaceInt == 1) {
+        sourceMaxNbDims = kAMDAIEMemTileDmaNbIntraDims;
+      } else if (sourceMemspaceInt == 2) {
+        sourceMaxNbDims = kAMDAIECoreDmaNbIntraDims;
+      }
+      if (targetMemspaceInt == 0) {
+        targetMaxNbDims =
+            kAMDAIEShimDmaNbIntraDims + kAMDAIEShimDmaNbInterDims;
+      } else if (targetMemspaceInt == 1) {
+        targetMaxNbDims = kAMDAIEMemTileDmaNbIntraDims;
+      } else if (targetMemspaceInt == 2) {
+        targetMaxNbDims = kAMDAIECoreDmaNbIntraDims;
+      }
+
+      // Check that the DMA this `amdaie.npu.dma_cpy_nd` operation is
+      // operating on is not being touched within the same scope. Otherwise,
+      // the rewrite is not valid in general as it would be changing the
+      // temporal usage of the source DMA.
+      Operation *parentOp = op->getParentOp();
+      if (!parentOp) return failure();
+      Value dma = npuDmaOp.getDma();
+      for (Operation *userOp : dma.getUsers()) {
+        if (userOp != op.getOperation() &&
+            parentOp->isProperAncestor(userOp)) {
+          return failure();
+        }
+      }
+    } else {
+      return failure();
+    }
+
+    if (isa<scf::ForOp>(op->getParentOp())) {
+      return rewriteWithForOpParent(op, rewriter, sourceMaxNbDims,
+                                    targetMaxNbDims);
+    } else if (isa<scf::ForallOp>(op->getParentOp())) {
+      return rewriteWithForallOpParent(op, rewriter, sourceMaxNbDims,
+                                       targetMaxNbDims);
+    } else {
+      return failure();
+    }
+  }
+};
+
+class AMDAIEDmaLoopSubsumptionPass
+    : public impl::AMDAIEDmaLoopSubsumptionBase<AMDAIEDmaLoopSubsumptionPass> {
+ public:
+  void getDependentDialects(DialectRegistry &registry) const override {
+    registry.insert<AMDAIEDialect>();
+  }
+
+  AMDAIEDmaLoopSubsumptionPass() = default;
+  AMDAIEDmaLoopSubsumptionPass(const AMDAIEDmaLoopSubsumptionPass &pass){};
+  void runOnOperation() override;
+};
+
+void AMDAIEDmaLoopSubsumptionPass::runOnOperation() {
+  Operation *parentOp = getOperation();
+  MLIRContext *context = &getContext();
+  RewritePatternSet patterns(context);
+  patterns.insert<SubsumeLoopIntoDMA>(context);
+  if (failed(applyPatternsAndFoldGreedily(parentOp, std::move(patterns)))) {
+    parentOp->emitOpError("failed to subsume some loops into DMA operations");
+    return signalPassFailure();
+  }
+
+  if (failed(moveUsersToHoistedDMAScope(parentOp))) {
+    parentOp->emitOpError(
+        "failed to move DMA users to correct scope after loop subsumption");
+    return signalPassFailure();
+  }
+}
+
+}  // namespace
+
+std::unique_ptr<Pass> createAMDAIEDmaLoopSubsumptionPass() {
+  return std::make_unique<AMDAIEDmaLoopSubsumptionPass>();
+}
+
+}  // namespace mlir::iree_compiler::AMDAIE
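+
+// As a usage sketch (mirroring the RUN line of the lit test added below),
+// the pass can be exercised standalone with:
+//
+//   iree-opt --pass-pipeline="builtin.module(func.func(iree-amdaie-dma-loop-subsumption,canonicalize))" input.mlir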
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/CMakeLists.txt b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/CMakeLists.txt
index 189ea33b3..48015859d 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/CMakeLists.txt
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/CMakeLists.txt
@@ -55,6 +55,7 @@ iree_cc_library(
     "AMDAIECreateLogicalObjectFifoLink.cpp"
     "AMDAIECreateReferenceToAllocation.cpp"
     "AMDAIEDistributeCoresAndObjectFifos.cpp"
+    "AMDAIEDmaLoopSubsumption.cpp"
     "AMDAIEDmaToCircularDma.cpp"
     "AMDAIEDmaUtils.cpp"
     "AMDAIEFuseConsumerIntoLoop.cpp"
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h
index 4dda21cca..1b1d3276f 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h
@@ -34,6 +34,7 @@ namespace mlir::iree_compiler::AMDAIE {
 #define GEN_PASS_DEF_AMDAIECREATEREFERENCETOALLOCATION
 #define GEN_PASS_DEF_AMDAIEDECOMPOSELINALGEXTPACKUNPACKTOAIR
 #define GEN_PASS_DEF_AMDAIEDISTRIBUTECORESANDOBJECTFIFOS
+#define GEN_PASS_DEF_AMDAIEDMALOOPSUBSUMPTION
 #define GEN_PASS_DEF_AMDAIEDMATOCIRCULARDMA
 #define GEN_PASS_DEF_AMDAIEFUSECONSUMERINTOLOOP
 #define GEN_PASS_DEF_AMDAIEFUSEFILLINTOFORALL
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h
index a5536df39..615b412b5 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h
@@ -101,6 +101,10 @@ std::unique_ptr<Pass> createAMDAIEDecomposeLinalgExtPackUnPackToAIRPass();
 /// operations and distribute the logical objectFifos.
 std::unique_ptr<Pass> createAMDAIEDistributeCoresAndObjectFifosPass();
 
+/// Create a pass to subsume loop iterations into DMA operations' access
+/// patterns.
+std::unique_ptr<Pass> createAMDAIEDmaLoopSubsumptionPass();
+
 /// Create a pass to convert dma operations to circular dma operations.
 std::unique_ptr<Pass> createAMDAIEDmaToCircularDmaPass();
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td
index fad708a6e..1c2fd2faa 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td
@@ -117,6 +117,12 @@ def AMDAIEDistributeCoresAndObjectFifos :
   let constructor = "mlir::iree_compiler::AMDAIE::createAMDAIEDistributeCoresAndObjectFifosPass()";
 }
 
+def AMDAIEDmaLoopSubsumption :
+    Pass<"iree-amdaie-dma-loop-subsumption"> {
+  let summary = "Subsume loop iterations into DMA operations' access patterns.";
+  let constructor = "mlir::iree_compiler::AMDAIE::createAMDAIEDmaLoopSubsumptionPass()";
+}
+
 def AMDAIEDmaToCircularDma :
     Pass<"iree-amdaie-dma-to-circular-dma"> {
   let summary = "Convert dma operations to circular dma operations.";
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt
index 29e00d47b..ca0ada6b3 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt
@@ -22,6 +22,7 @@ iree_lit_test_suite(
     "create_reference_to_allocation.mlir"
     "disable_vectorization.mlir"
     "distribute_cores_and_objectfifos.mlir"
+    "dma_loop_subsumption.mlir"
     "dma_to_circular_dma.mlir"
     "fuse_consumer_into_loop_scf_for.mlir"
    "fuse_consumer_into_loop_scf_forall.mlir"
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/dma_loop_subsumption.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/dma_loop_subsumption.mlir
new file mode 100644
index 000000000..a681569a3
--- /dev/null
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/dma_loop_subsumption.mlir
@@ -0,0 +1,850 @@
+// RUN: iree-opt --pass-pipeline="builtin.module(func.func(iree-amdaie-dma-loop-subsumption,canonicalize))" --split-input-file --verify-diagnostics %s | FileCheck %s
+
+//===----------------------------------------------------------------------===//
+// Sanity checks for cases where no modification should happen.
+//===----------------------------------------------------------------------===//
+
+// Sanity check: ensure no modification in case of no loop dependency.
+// CHECK-LABEL: @npu_dma_cpy_nd_without_loop_dependency
+// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index
+// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index
+// CHECK-DAG: %[[C6:.+]] = arith.constant 6 : index
+// CHECK: %[[CIRC_DMA:.+]] = amdaie.circular_dma_cpy_nd
+// CHECK: amdaie.controlcode
+// CHECK: scf.forall (%{{.+}}, %{{.+}}) in (2, 2)
+// CHECK: scf.for %{{.+}} = %[[C0]] to %[[C6]] step %[[C1]]
+// CHECK: %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([0, 0, 0] [1, 8, 16] [128, 16, 1], [] [] [])
+// CHECK: amdaie.npu.dma_wait(%[[NPU_DMA]], S2MM)
+#map = affine_map<(d0) -> (d0 * 16)>
+func.func @npu_dma_cpy_nd_without_loop_dependency(%arg0: !amdaie.logicalobjectfifo>, %arg1: !amdaie.logicalobjectfifo>) {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c6 = arith.constant 6 : index
+  amdaie.workgroup {
+    %0 = amdaie.circular_dma_cpy_nd(%arg0[] [] [], %arg1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>)
+    amdaie.controlcode {
+      scf.forall (%arg2, %arg3) in (2, 2) {
+        scf.for %arg4 = %c0 to %c6 step %c1 {
+          %1 = affine.apply #map(%arg4)
+          %2 = amdaie.npu.dma_cpy_nd %0([0, 0, 0] [1, 8, 16] [128, 16, 1], [] [] [])
+          amdaie.npu.dma_wait(%2, S2MM)
+        }
+      }
+      amdaie.end
+    }
+  }
+  return
+}
+
+// -----
+
+// Ensure no modification in case of a dynamic offset not originating from an
+// induction variable.
+// CHECK-LABEL: @dynamic_non_induction_var_offset
+// CHECK-SAME: %{{.+}}: !amdaie.logicalobjectfifo>, %{{.+}}: !amdaie.logicalobjectfifo>, %[[ARG:.+]]: index
+// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index
+// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index
+// CHECK-DAG: %[[C6:.+]] = arith.constant 6 : index
+// CHECK: %[[CIRC_DMA:.+]] = amdaie.circular_dma_cpy_nd
+// CHECK: amdaie.controlcode
+// CHECK: scf.for %{{.+}} = %[[C0]] to %[[C6]] step %[[C1]]
+// CHECK: %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([%[[ARG]]] [16] [1], [] [] [])
+// CHECK: amdaie.npu.dma_wait(%[[NPU_DMA]], S2MM)
+func.func @dynamic_non_induction_var_offset(%arg0: !amdaie.logicalobjectfifo>, %arg1: !amdaie.logicalobjectfifo>, %arg2: index) {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c6 = arith.constant 6 : index
+  amdaie.workgroup {
+    %0 = amdaie.circular_dma_cpy_nd(%arg0[] [] [], %arg1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>)
+    amdaie.controlcode {
+      scf.for %arg3 = %c0 to %c6 step %c1 {
+        %2 = amdaie.npu.dma_cpy_nd %0([%arg2] [16] [1], [] [] [])
+        amdaie.npu.dma_wait(%2, S2MM)
+      }
+      amdaie.end
+    }
+  }
+  return
+}
+
+// -----
+
+// Ensure no modification in case of invalid affine expressions, i.e.
+// expressions from which a single constant stride multiplier cannot be
+// retrieved (for example an expression with an additive constant).
+// CHECK: #[[$MAP:.+]] = affine_map<(d0) -> (d0 * 16)>
+// CHECK: #[[$MAP1:.+]] = affine_map<(d0) -> (d0 * 16 + 3)>
+// CHECK: #[[$MAP2:.+]] = affine_map<(d0) -> (d0 + 3)>
+// CHECK: #[[$MAP3:.+]] = affine_map<(d0) -> (d0 * 16 + 48)>
+// CHECK-LABEL: @invalid_affine_expr
+// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index
+// CHECK-DAG: %[[C2:.+]] = arith.constant 2 : index
+// CHECK-DAG: %[[C6:.+]] = arith.constant 6 : index
+// CHECK: %[[CIRC_DMA:.+]] = amdaie.circular_dma_cpy_nd
+// CHECK: amdaie.controlcode
+// CHECK: scf.for %[[ARG2:.+]] = %[[C1]] to %[[C6]] step %[[C2]]
+// CHECK: %[[APPLY1:.+]] = affine.apply #[[$MAP]](%[[ARG2]])
+// CHECK: %[[APPLY2:.+]] = affine.apply #[[$MAP1]](%[[ARG2]])
+// CHECK: amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([%[[APPLY2]]] [16] [1], [%[[APPLY1]]] [16] [1])
+// CHECK: %[[APPLY3:.+]] = affine.apply #[[$MAP2]](%[[ARG2]])
+// CHECK: amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([%[[APPLY3]]] [16] [1], [] [] [])
+// CHECK: %[[APPLY4:.+]] = affine.apply #[[$MAP3]](%[[ARG2]])
+// CHECK: amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([%[[APPLY4]]] [16] [1], [] [] [])
+#map = affine_map<(d0) -> (d0 * 16)>
+#map1 = affine_map<(d0) -> (d0 * 16 + 3)>
+#map2 = affine_map<(d0) -> (d0 + 3)>
+#map3 = affine_map<(d0) -> ((d0 + 3) * 16)>
+func.func @invalid_affine_expr(%arg0: !amdaie.logicalobjectfifo>, %arg1: !amdaie.logicalobjectfifo>) {
+  %c1 = arith.constant 1 : index
+  %c2 = arith.constant 2 : index
+  %c6 = arith.constant 6 : index
+  amdaie.workgroup {
+    %0 = amdaie.circular_dma_cpy_nd(%arg0[] [] [], %arg1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>)
+    amdaie.controlcode {
+      scf.for %arg2 = %c1 to %c6 step %c2 {
+        %1 = affine.apply #map(%arg2)
+        %2 = affine.apply #map1(%arg2)
+        %3 = amdaie.npu.dma_cpy_nd %0([%2] [16] [1], [%1] [16] [1])
+        %4 = affine.apply #map2(%arg2)
+        %5 = amdaie.npu.dma_cpy_nd %0([%4] [16] [1], [] [] [])
+        %6 = affine.apply #map3(%arg2)
+        %7 = amdaie.npu.dma_cpy_nd %0([%6] [16] [1], [] [] [])
+      }
+      amdaie.end
+    }
+  }
+  return
+}
+
+// -----
+
+// Ensure no modification in case of too many dimensions, i.e. 4 existing
+// dimensions in the case of an `amdaie.npu.dma_cpy_nd` with target on L3.
+// CHECK: #[[$MAP:.+]] = affine_map<(d0) -> (d0 * 16)>
+// CHECK-LABEL: @npu_dma_cpy_nd_too_many_dims_target
+// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index
+// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index
+// CHECK-DAG: %[[C6:.+]] = arith.constant 6 : index
+// CHECK: %[[CIRC_DMA:.+]] = amdaie.circular_dma_cpy_nd
+// CHECK: amdaie.controlcode
+// CHECK: scf.for %[[ARG2:.+]] = %[[C0]] to %[[C6]] step %[[C1]]
+// CHECK: %[[APPLY:.+]] = affine.apply #[[$MAP]](%[[ARG2]])
+// CHECK: %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([0, 0, 0, %[[APPLY]]] [1, 1, 8, 16] [128, 128, 16, 1], [] [] [])
+// CHECK: amdaie.npu.dma_wait(%[[NPU_DMA]], S2MM)
+#map = affine_map<(d0) -> (d0 * 16)>
+func.func @npu_dma_cpy_nd_too_many_dims_target(%arg0: !amdaie.logicalobjectfifo>, %arg1: !amdaie.logicalobjectfifo>) {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c6 = arith.constant 6 : index
+  amdaie.workgroup {
+    %0 = amdaie.circular_dma_cpy_nd(%arg0[] [] [], %arg1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>)
+    amdaie.controlcode {
+      scf.for %arg2 = %c0 to %c6 step %c1 {
+        %1 = affine.apply #map(%arg2)
+        %2 = amdaie.npu.dma_cpy_nd %0([0, 0, 0, %1] [1, 1, 8, 16] [128, 128, 16, 1], [] [] [])
+        amdaie.npu.dma_wait(%2, S2MM)
+      }
+      amdaie.end
+    }
+  }
+  return
+}
+
+// -----
+
+// Ensure no modification in case of too many dimensions, i.e. 4 existing
+// dimensions in the case of an `amdaie.npu.dma_cpy_nd` with source on L3.
+// CHECK: #[[$MAP:.+]] = affine_map<(d0) -> (d0 * 16)>
+// CHECK-LABEL: @npu_dma_cpy_nd_too_many_dims_source
+// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index
+// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index
+// CHECK-DAG: %[[C6:.+]] = arith.constant 6 : index
+// CHECK: %[[CIRC_DMA:.+]] = amdaie.circular_dma_cpy_nd
+// CHECK: amdaie.controlcode
+// CHECK: scf.for %[[ARG2:.+]] = %[[C0]] to %[[C6]] step %[[C1]]
+// CHECK: %[[APPLY:.+]] = affine.apply #[[$MAP]](%[[ARG2]])
+// CHECK: %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([] [] [], [0, 0, 0, %[[APPLY]]] [1, 1, 8, 16] [128, 128, 16, 1])
+// CHECK: amdaie.npu.dma_wait(%[[NPU_DMA]], S2MM)
+#map = affine_map<(d0) -> (d0 * 16)>
+func.func @npu_dma_cpy_nd_too_many_dims_source(%arg0: !amdaie.logicalobjectfifo>, %arg1: !amdaie.logicalobjectfifo>) {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c6 = arith.constant 6 : index
+  amdaie.workgroup {
+    %0 = amdaie.circular_dma_cpy_nd(%arg0[] [] [], %arg1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>)
+    amdaie.controlcode {
+      scf.for %arg2 = %c0 to %c6 step %c1 {
+        %1 = affine.apply #map(%arg2)
+        %2 = amdaie.npu.dma_cpy_nd %0([] [] [], [0, 0, 0, %1] [1, 1, 8, 16] [128, 128, 16, 1])
+        amdaie.npu.dma_wait(%2, S2MM)
+      }
+      amdaie.end
+    }
+  }
+  return
+}
+
+// -----
+
+// Ensure no modification in case of too many dimensions, i.e. 4 existing
+// dimensions in the case of an `amdaie.npu.dma_cpy_nd` with target on L2.
+// CHECK: #[[$MAP:.+]] = affine_map<(d0) -> (d0 * 16)>
+// CHECK-LABEL: @npu_dma_cpy_nd_too_many_dims_target_on_l2
+// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index
+// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index
+// CHECK-DAG: %[[C6:.+]] = arith.constant 6 : index
+// CHECK: %[[CIRC_DMA:.+]] = amdaie.circular_dma_cpy_nd
+// CHECK: amdaie.controlcode
+// CHECK: scf.for %[[ARG2:.+]] = %[[C0]] to %[[C6]] step %[[C1]]
+// CHECK: %[[APPLY:.+]] = affine.apply #[[$MAP]](%[[ARG2]])
+// CHECK: %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([0, 0, 0, %[[APPLY]]] [1, 1, 8, 16] [128, 128, 16, 1], [] [] [])
+// CHECK: amdaie.npu.dma_wait(%[[NPU_DMA]], S2MM)
+#map = affine_map<(d0) -> (d0 * 16)>
+func.func @npu_dma_cpy_nd_too_many_dims_target_on_l2(%arg0: !amdaie.logicalobjectfifo>, %arg1: !amdaie.logicalobjectfifo>) {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c6 = arith.constant 6 : index
+  amdaie.workgroup {
+    %0 = amdaie.circular_dma_cpy_nd(%arg0[] [] [], %arg1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>)
+    amdaie.controlcode {
+      scf.for %arg2 = %c0 to %c6 step %c1 {
+        %1 = affine.apply #map(%arg2)
+        %2 = amdaie.npu.dma_cpy_nd %0([0, 0, 0, %1] [1, 1, 8, 16] [128, 128, 16, 1], [] [] [])
+        amdaie.npu.dma_wait(%2, S2MM)
+      }
+      amdaie.end
+    }
+  }
+  return
+}
+
+// -----
+
+// Ensure no modification in case of too many dimensions, i.e. 4 existing
+// dimensions in the case of an `amdaie.npu.dma_cpy_nd` with source on L2.
+// CHECK: #[[$MAP:.+]] = affine_map<(d0) -> (d0 * 16)>
+// CHECK-LABEL: @npu_dma_cpy_nd_too_many_dims_source_on_l2
+// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index
+// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index
+// CHECK-DAG: %[[C6:.+]] = arith.constant 6 : index
+// CHECK: %[[CIRC_DMA:.+]] = amdaie.circular_dma_cpy_nd
+// CHECK: amdaie.controlcode
+// CHECK: scf.for %[[ARG2:.+]] = %[[C0]] to %[[C6]] step %[[C1]]
+// CHECK: %[[APPLY:.+]] = affine.apply #[[$MAP]](%[[ARG2]])
+// CHECK: %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([] [] [], [0, 0, 0, %[[APPLY]]] [1, 1, 8, 16] [128, 128, 16, 1])
+// CHECK: amdaie.npu.dma_wait(%[[NPU_DMA]], S2MM)
+#map = affine_map<(d0) -> (d0 * 16)>
+func.func @npu_dma_cpy_nd_too_many_dims_source_on_l2(%arg0: !amdaie.logicalobjectfifo>, %arg1: !amdaie.logicalobjectfifo>) {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c6 = arith.constant 6 : index
+  amdaie.workgroup {
+    %0 = amdaie.circular_dma_cpy_nd(%arg0[] [] [], %arg1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>)
+    amdaie.controlcode {
+      scf.for %arg2 = %c0 to %c6 step %c1 {
+        %1 = affine.apply #map(%arg2)
+        %2 = amdaie.npu.dma_cpy_nd %0([] [] [], [0, 0, 0, %1] [1, 1, 8, 16] [128, 128, 16, 1])
+        amdaie.npu.dma_wait(%2, S2MM)
+      }
+      amdaie.end
+    }
+  }
+  return
+}
+
+// -----
+
+// Ensure no modification in case of too many dimensions, i.e. 3 existing
+// dimensions in the case of an `amdaie.npu.dma_cpy_nd` with target on L1.
+// CHECK: #[[$MAP:.+]] = affine_map<(d0) -> (d0 * 16)>
+// CHECK-LABEL: @npu_dma_cpy_nd_too_many_dims_target_on_l1
+// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index
+// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index
+// CHECK-DAG: %[[C6:.+]] = arith.constant 6 : index
+// CHECK: %[[CIRC_DMA:.+]] = amdaie.circular_dma_cpy_nd
+// CHECK: amdaie.controlcode
+// CHECK: scf.for %[[ARG2:.+]] = %[[C0]] to %[[C6]] step %[[C1]]
+// CHECK: %[[APPLY:.+]] = affine.apply #[[$MAP]](%[[ARG2]])
+// CHECK: %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([0, 0, %[[APPLY]]] [1, 8, 16] [128, 16, 1], [] [] [])
+// CHECK: amdaie.npu.dma_wait(%[[NPU_DMA]], S2MM)
+#map = affine_map<(d0) -> (d0 * 16)>
+func.func @npu_dma_cpy_nd_too_many_dims_target_on_l1(%arg0: !amdaie.logicalobjectfifo>, %arg1: !amdaie.logicalobjectfifo>) {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c6 = arith.constant 6 : index
+  amdaie.workgroup {
+    %0 = amdaie.circular_dma_cpy_nd(%arg0[] [] [], %arg1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>)
+    amdaie.controlcode {
+      scf.for %arg2 = %c0 to %c6 step %c1 {
+        %1 = affine.apply #map(%arg2)
+        %2 = amdaie.npu.dma_cpy_nd %0([0, 0, %1] [1, 8, 16] [128, 16, 1], [] [] [])
+        amdaie.npu.dma_wait(%2, S2MM)
+      }
+      amdaie.end
+    }
+  }
+  return
+}
+
+// -----
+
+// Ensure no modification in case of too many dimensions, i.e. 3 existing
+// dimensions in the case of an `amdaie.npu.dma_cpy_nd` with source on L1.
+// CHECK: #[[$MAP:.+]] = affine_map<(d0) -> (d0 * 16)>
+// CHECK-LABEL: @npu_dma_cpy_nd_too_many_dims_source_on_l1
+// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index
+// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index
+// CHECK-DAG: %[[C6:.+]] = arith.constant 6 : index
+// CHECK: %[[CIRC_DMA:.+]] = amdaie.circular_dma_cpy_nd
+// CHECK: amdaie.controlcode
+// CHECK: scf.for %[[ARG2:.+]] = %[[C0]] to %[[C6]] step %[[C1]]
+// CHECK: %[[APPLY:.+]] = affine.apply #[[$MAP]](%[[ARG2]])
+// CHECK: %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([] [] [], [0, 0, %[[APPLY]]] [1, 8, 16] [128, 16, 1])
+// CHECK: amdaie.npu.dma_wait(%[[NPU_DMA]], S2MM)
+#map = affine_map<(d0) -> (d0 * 16)>
+func.func @npu_dma_cpy_nd_too_many_dims_source_on_l1(%arg0: !amdaie.logicalobjectfifo>, %arg1: !amdaie.logicalobjectfifo>) {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c6 = arith.constant 6 : index
+  amdaie.workgroup {
+    %0 = amdaie.circular_dma_cpy_nd(%arg0[] [] [], %arg1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>)
+    amdaie.controlcode {
+      scf.for %arg2 = %c0 to %c6 step %c1 {
+        %1 = affine.apply #map(%arg2)
+        %2 = amdaie.npu.dma_cpy_nd %0([] [] [], [0, 0, %1] [1, 8, 16] [128, 16, 1])
+        amdaie.npu.dma_wait(%2, S2MM)
+      }
+      amdaie.end
+    }
+  }
+  return
+}
+
+// -----
+
+// Ensure no modification in case of multiple npu.dma_cpy_nd users with the
+// same source in the same scope.
+// CHECK: #[[$MAP:.+]] = affine_map<(d0) -> (d0 * 16)>
+// CHECK-LABEL: @for_with_multiple_npu_dma_cpy_nd_same_source
+// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index
+// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index
+// CHECK-DAG: %[[C6:.+]] = arith.constant 6 : index
+// CHECK: %[[CIRC_DMA:.+]] = amdaie.circular_dma_cpy_nd
+// CHECK: amdaie.controlcode
+// CHECK: scf.for %[[ARG2:.+]] = %[[C0]] to %[[C6]] step %[[C1]]
+// CHECK: %[[APPLY:.+]] = affine.apply #[[$MAP]](%[[ARG2]])
+// CHECK: %[[NPU_DMA_0:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([0, 0, %[[APPLY]]] [1, 8, 16] [128, 16, 1], [] [] [])
+// CHECK: amdaie.npu.dma_wait(%[[NPU_DMA_0]], S2MM)
+// CHECK: %[[NPU_DMA_1:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([0, 0, %[[APPLY]]] [1, 8, 16] [128, 16, 1], [] [] [])
+// CHECK: amdaie.npu.dma_wait(%[[NPU_DMA_1]], S2MM)
+#map = affine_map<(d0) -> (d0 * 16)>
+func.func @for_with_multiple_npu_dma_cpy_nd_same_source(%arg0: !amdaie.logicalobjectfifo>, %arg1: !amdaie.logicalobjectfifo>) {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c6 = arith.constant 6 : index
+  amdaie.workgroup {
+    %0 = amdaie.circular_dma_cpy_nd(%arg0[] [] [], %arg1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>)
+    amdaie.controlcode {
+      scf.for %arg2 = %c0 to %c6 step %c1 {
+        %1 = affine.apply #map(%arg2)
+        %2 = amdaie.npu.dma_cpy_nd %0([0, 0, %1] [1, 8, 16] [128, 16, 1], [] [] [])
+        amdaie.npu.dma_wait(%2, S2MM)
+        %3 = amdaie.npu.dma_cpy_nd %0([0, 0, %1] [1, 8, 16] [128, 16, 1], [] [] [])
+        amdaie.npu.dma_wait(%3, S2MM)
+      }
+      amdaie.end
+    }
+  }
+  return
+}
+
+// -----
+
+// Ensure no modification in case of multiple npu.dma_cpy_nd users with the
+// same source in the same scope.
+// CHECK: #[[$MAP:.+]] = affine_map<(d0) -> (d0 * 16)>
+// CHECK-LABEL: @forall_with_multiple_npu_dma_cpy_nd_same_source
+// CHECK: %[[CIRC_DMA:.+]] = amdaie.circular_dma_cpy_nd
+// CHECK: amdaie.controlcode
+// CHECK: scf.forall (%[[ARG2:.+]], %[[ARG3:.+]]) in (2, 6)
+// CHECK: %[[APPLY:.+]] = affine.apply #[[$MAP]](%[[ARG3]])
+// CHECK: %[[NPU_DMA_0:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([0, 0, %[[APPLY]]] [1, 8, 16] [128, 16, 1], [] [] [])
+// CHECK: amdaie.npu.dma_wait(%[[NPU_DMA_0]], S2MM)
+// CHECK: %[[NPU_DMA_1:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([0, 0, %[[APPLY]]] [1, 8, 16] [128, 16, 1], [] [] [])
+// CHECK: amdaie.npu.dma_wait(%[[NPU_DMA_1]], S2MM)
+#map = affine_map<(d0) -> (d0 * 16)>
+func.func @forall_with_multiple_npu_dma_cpy_nd_same_source(%arg0: !amdaie.logicalobjectfifo>, %arg1: !amdaie.logicalobjectfifo>) {
+  amdaie.workgroup {
+    %0 = amdaie.circular_dma_cpy_nd(%arg0[] [] [], %arg1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>)
+    amdaie.controlcode {
+      scf.forall (%arg2, %arg3) in (2, 6) {
+        %1 = affine.apply #map(%arg3)
+        %2 = amdaie.npu.dma_cpy_nd %0([0, 0, %1] [1, 8, 16] [128, 16, 1], [] [] [])
+        amdaie.npu.dma_wait(%2, S2MM)
+        %3 = amdaie.npu.dma_cpy_nd %0([0, 0, %1] [1, 8, 16] [128, 16, 1], [] [] [])
+        amdaie.npu.dma_wait(%3, S2MM)
+      }
+      amdaie.end
+    }
+  }
+  return
+}
+
+//===----------------------------------------------------------------------===//
+// Checks for dependencies via `affine.apply` on both source and target sides.
+//===----------------------------------------------------------------------===//
+
+// -----
+
+// Check that loop subsumption happens in case of an identity affine
+// expression.
+// CHECK-LABEL: @identity_affine_expr
+// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index
+// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index
+// CHECK-DAG: %[[C6:.+]] = arith.constant 6 : index
+// CHECK-DAG: %[[C16:.+]] = arith.constant 16 : index
+// CHECK: %[[CIRC_DMA:.+]] = amdaie.circular_dma_cpy_nd
+// CHECK: amdaie.controlcode
+// CHECK-NOT: scf.for
+// CHECK: amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([%[[C0]], %[[C0]]] [%[[C6]], %[[C16]]] [%[[C1]], %[[C1]]], [] [] [])
+#map = affine_map<(d0) -> (d0)>
+func.func @identity_affine_expr(%arg0: !amdaie.logicalobjectfifo>, %arg1: !amdaie.logicalobjectfifo>) {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c6 = arith.constant 6 : index
+  amdaie.workgroup {
+    %0 = amdaie.circular_dma_cpy_nd(%arg0[] [] [], %arg1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>)
+    amdaie.controlcode {
+      scf.for %arg2 = %c0 to %c6 step %c1 {
+        %1 = affine.apply #map(%arg2)
+        %3 = amdaie.npu.dma_cpy_nd %0([%1] [16] [1], [] [] [])
+      }
+      amdaie.end
+    }
+  }
+  return
+}
+
+// -----
+
+// CHECK-LABEL: @for_dependency_on_target
+// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index
+// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index
+// CHECK-DAG: %[[C6:.+]] = arith.constant 6 : index
+// CHECK-DAG: %[[C8:.+]] = arith.constant 8 : index
+// CHECK-DAG: %[[C16:.+]] = arith.constant 16 : index
+// CHECK-DAG: %[[C128:.+]] = arith.constant 128 : index
+// CHECK: %[[CIRC_DMA:.+]] = amdaie.circular_dma_cpy_nd
+// CHECK: amdaie.controlcode
+// CHECK-NOT: scf.for
+// CHECK: %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([%[[C0]], %[[C0]], %[[C0]], %[[C0]]] [%[[C6]], %[[C1]], %[[C8]], %[[C16]]] [%[[C16]], %[[C128]], %[[C16]], %[[C1]]], [] [] [])
+// CHECK: amdaie.npu.dma_wait(%[[NPU_DMA]], S2MM)
+#map = affine_map<(d0) -> (d0 * 16)>
+func.func @for_dependency_on_target(%arg0: !amdaie.logicalobjectfifo>, %arg1: !amdaie.logicalobjectfifo>) {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c6 = arith.constant 6 : index
+  amdaie.workgroup {
+    %0 = amdaie.circular_dma_cpy_nd(%arg0[] [] [], %arg1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>)
+    amdaie.controlcode {
+      scf.for %arg2 = %c0 to %c6 step %c1 {
+        %1 = affine.apply #map(%arg2)
+        %2 = amdaie.npu.dma_cpy_nd %0([0, 0, %1] [1, 8, 16] [128, 16, 1], [] [] [])
+        amdaie.npu.dma_wait(%2, S2MM)
+      }
+      amdaie.end
+    }
+  }
+  return
+}
+
+// -----
+
+// CHECK-LABEL: @forall_dependency_on_target
+// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index
+// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index
+// CHECK-DAG: %[[C6:.+]] = arith.constant 6 : index
+// CHECK-DAG: %[[C8:.+]] = arith.constant 8 : index
+// CHECK-DAG: %[[C16:.+]] = arith.constant 16 : index
+// CHECK-DAG: %[[C128:.+]] = arith.constant 128 : index
+// CHECK: %[[CIRC_DMA:.+]] = amdaie.circular_dma_cpy_nd
+// CHECK: amdaie.controlcode
+// CHECK-NOT: scf.forall
+// CHECK: %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([%[[C0]], %[[C0]], %[[C0]], %[[C0]]] [%[[C6]], %[[C1]], %[[C8]], %[[C16]]] [%[[C16]], %[[C128]], %[[C16]], %[[C1]]], [] [] [])
+// CHECK: amdaie.npu.dma_wait(%[[NPU_DMA]], S2MM)
+#map = affine_map<(d0) -> (16 * d0)>
+func.func @forall_dependency_on_target(%arg0: !amdaie.logicalobjectfifo>, %arg1: !amdaie.logicalobjectfifo>) {
+  amdaie.workgroup {
+    %0 = amdaie.circular_dma_cpy_nd(%arg0[] [] [], %arg1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>)
+    amdaie.controlcode {
+      scf.forall (%arg2, %arg3) in (2, 6) {
+        %1 = affine.apply #map(%arg3)
+        %2 = amdaie.npu.dma_cpy_nd %0([0, 0, %1] [1, 8, 16] [128, 16, 1], [] [] [])
+        amdaie.npu.dma_wait(%2, S2MM)
+      }
+      amdaie.end
+    }
+  }
+  return
+}
+
+// -----
+
+// CHECK-LABEL: @for_dependency_on_source
+// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index
+// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index
+// CHECK-DAG: %[[C6:.+]] = arith.constant 6 : index
+// CHECK-DAG: %[[C8:.+]] = arith.constant 8 : index
+// CHECK-DAG: %[[C16:.+]] = arith.constant 16 : index
+// CHECK-DAG: %[[C128:.+]] = arith.constant 128 : index
+// CHECK: %[[CIRC_DMA:.+]] = amdaie.circular_dma_cpy_nd
+// CHECK: amdaie.controlcode
+// CHECK-NOT: scf.for
+// CHECK: %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([] [] [], [%[[C0]], %[[C0]], %[[C0]], %[[C0]]] [%[[C6]], %[[C1]], %[[C8]], %[[C16]]] [%[[C16]], %[[C128]], %[[C16]], %[[C1]]])
+// CHECK: amdaie.npu.dma_wait(%[[NPU_DMA]], S2MM)
+#map = affine_map<(d0) -> (d0 * 16)>
+func.func @for_dependency_on_source(%arg0: !amdaie.logicalobjectfifo>, %arg1: !amdaie.logicalobjectfifo>) {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c6 = arith.constant 6 : index
+  amdaie.workgroup {
+    %0 = amdaie.circular_dma_cpy_nd(%arg0[] [] [], %arg1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>)
+    amdaie.controlcode {
+      scf.for %arg2 = %c0 to %c6 step %c1 {
+        %1 = affine.apply #map(%arg2)
+        %2 = amdaie.npu.dma_cpy_nd %0([] [] [], [0, 0, %1] [1, 8, 16] [128, 16, 1])
+        amdaie.npu.dma_wait(%2, S2MM)
+      }
+      amdaie.end
+    }
+  }
+  return
+}
+
+// -----
+
+// CHECK-LABEL: @forall_dependency_on_source
+// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index
+// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index
+// CHECK-DAG: %[[C6:.+]] = arith.constant 6 : index
+// CHECK-DAG: %[[C8:.+]] = arith.constant 8 : index
+// CHECK-DAG: %[[C16:.+]] = arith.constant 16 : index
+// CHECK-DAG: %[[C128:.+]] = arith.constant 128 : index
+// CHECK: %[[CIRC_DMA:.+]] = amdaie.circular_dma_cpy_nd
+// CHECK: amdaie.controlcode
+// CHECK-NOT: scf.forall
+// CHECK: %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([] [] [], [%[[C0]], %[[C0]], %[[C0]], %[[C0]]] [%[[C6]], %[[C1]], %[[C8]], %[[C16]]] [%[[C16]], %[[C128]], %[[C16]], %[[C1]]])
+// CHECK: amdaie.npu.dma_wait(%[[NPU_DMA]], S2MM)
+#map = affine_map<(d0) -> (d0 * 16)>
+func.func @forall_dependency_on_source(%arg0: !amdaie.logicalobjectfifo>, %arg1: !amdaie.logicalobjectfifo>) {
+  amdaie.workgroup {
+    %0 = amdaie.circular_dma_cpy_nd(%arg0[] [] [], %arg1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>)
+    amdaie.controlcode {
+      scf.forall (%arg2, %arg3) in (2, 6) {
+        %1 = affine.apply #map(%arg3)
+        %2 = amdaie.npu.dma_cpy_nd %0([] [] [], [0, 0, %1] [1, 8, 16] [128, 16, 1])
+        amdaie.npu.dma_wait(%2, S2MM)
+      }
+      amdaie.end
+    }
+  }
+  return
+}
+
+// -----
+
+// Check with multiple `affine.apply` usages in an `amdaie.npu.dma_cpy_nd`
+// operation.
+// CHECK-LABEL: @multiple_for_dependencies
+// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index
+// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index
+// CHECK-DAG: %[[C6:.+]] = arith.constant 6 : index
+// CHECK-DAG: %[[C8:.+]] = arith.constant 8 : index
+// CHECK-DAG: %[[C16:.+]] = arith.constant 16 : index
+// CHECK-DAG: %[[C256:.+]] = arith.constant 256 : index
+// CHECK: %[[CIRC_DMA:.+]] = amdaie.circular_dma_cpy_nd
+// CHECK: amdaie.controlcode
+// CHECK-NOT: scf.for
+// CHECK: %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([%[[C0]], %[[C0]], %[[C0]], %[[C0]]] [%[[C6]], %[[C6]], %[[C8]], %[[C16]]] [%[[C256]], %[[C16]], %[[C16]], %[[C1]]], [] [] [])
+// CHECK: amdaie.npu.dma_wait(%[[NPU_DMA]], S2MM)
+#map = affine_map<(d0) -> (d0 * 16)>
+func.func @multiple_for_dependencies(%arg0: !amdaie.logicalobjectfifo>, %arg1: !amdaie.logicalobjectfifo>) {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c6 = arith.constant 6 : index
+  amdaie.workgroup {
+    %0 = amdaie.circular_dma_cpy_nd(%arg0[] [] [], %arg1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>)
+    amdaie.controlcode {
+      scf.for %arg2 = %c0 to %c6 step %c1 {
+        %1 = affine.apply #map(%arg2)
+        %2 = amdaie.npu.dma_cpy_nd %0([%1, %1] [8, 16] [16, 1], [] [] [])
+        amdaie.npu.dma_wait(%2, S2MM)
+      }
+      amdaie.end
+    }
+  }
+  return
+}
+
+// -----
+
+// CHECK-LABEL: @multiple_forall_dependencies
+// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index
+// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index
+// CHECK-DAG: %[[C2:.+]] = arith.constant 2 : index
+// CHECK-DAG: %[[C6:.+]] = arith.constant 6 : index
+// CHECK-DAG: %[[C8:.+]] = arith.constant 8 : index
+// CHECK-DAG: %[[C16:.+]] = arith.constant 16 : index
+// CHECK-DAG: %[[C512:.+]] = arith.constant 512 : index
+// CHECK: %[[CIRC_DMA:.+]] = amdaie.circular_dma_cpy_nd
+// CHECK: amdaie.controlcode
+// CHECK-NOT: scf.forall
+// CHECK: %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([%[[C0]], %[[C0]], %[[C0]], %[[C0]]] [%[[C2]], %[[C6]], %[[C8]], %[[C16]]] [%[[C16]], %[[C512]], %[[C16]], %[[C1]]], [] [] [])
+// CHECK: amdaie.npu.dma_wait(%[[NPU_DMA]], S2MM)
+#map = affine_map<(d0) -> (d0 * 16)>
+#map1 = affine_map<(d0) -> (d0 * 32)>
+func.func @multiple_forall_dependencies(%arg0: !amdaie.logicalobjectfifo>, %arg1: !amdaie.logicalobjectfifo>) {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c6 = arith.constant 6 : index
+  amdaie.workgroup {
+    %0 = amdaie.circular_dma_cpy_nd(%arg0[] [] [], %arg1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>)
+    amdaie.controlcode {
+      scf.forall (%arg2, %arg3) in (2, 6) {
+        %1 = affine.apply #map(%arg2)
+        %2 = affine.apply #map1(%arg3)
+        %3 = amdaie.npu.dma_cpy_nd %0([%2, %1] [8, 16] [16, 1], [] [] [])
+        amdaie.npu.dma_wait(%3, S2MM)
+      }
+      amdaie.end
+    }
+  }
+  return
+}
+
+// -----
+
+// CHECK-LABEL: @for_with_affine_non_normalized
+// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index
+// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index
+// CHECK-DAG: %[[C3:.+]] = arith.constant 3 : index
+// CHECK-DAG: %[[C16:.+]] = arith.constant 16 : index
+// CHECK-DAG: %[[C32:.+]] = arith.constant 32 : index
+// CHECK: %[[CIRC_DMA:.+]] = amdaie.circular_dma_cpy_nd
+// CHECK: amdaie.controlcode
+// CHECK-NOT: scf.for
+// CHECK: %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([%[[C0]], %[[C16]]] [%[[C3]], %[[C16]]] [%[[C32]], %[[C1]]], [] [] [])
+// CHECK: amdaie.npu.dma_wait(%[[NPU_DMA]], S2MM)
+#map = affine_map<(d0) -> (d0 * 16)>
+func.func @for_with_affine_non_normalized(%arg0: !amdaie.logicalobjectfifo>, %arg1: !amdaie.logicalobjectfifo>) {
+  %c1 = arith.constant 1 : index
+  %c2 = arith.constant 2 : index
+  %c6 = arith.constant 6 : index
+  amdaie.workgroup {
+    %0 = amdaie.circular_dma_cpy_nd(%arg0[] [] [], %arg1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>)
+    amdaie.controlcode {
+      scf.for %arg2 = %c1 to %c6 step %c2 {
+        %1 = affine.apply #map(%arg2)
+        %2 = amdaie.npu.dma_cpy_nd %0([%1] [16] [1], [] [] [])
+        amdaie.npu.dma_wait(%2, S2MM)
+      }
+      amdaie.end
+    }
+  }
+  return
+}
+
+// -----
+
+// CHECK-LABEL: @forall_with_affine_non_normalized
+// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index
+// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index
+// CHECK-DAG: %[[C4:.+]] = arith.constant 4 : index
+// CHECK-DAG: %[[C5:.+]] = arith.constant 5 : index
+// CHECK-DAG: %[[C8:.+]] = arith.constant 8 : index
+// CHECK-DAG: %[[C16:.+]] = arith.constant 16 : index
+// CHECK-DAG: %[[C32:.+]] = arith.constant 32 : index
+// CHECK-DAG: %[[C48:.+]] = arith.constant 48 : index
+// CHECK-DAG: %[[C1024:.+]] = arith.constant 1024 : index
+// CHECK: %[[CIRC_DMA:.+]] = amdaie.circular_dma_cpy_nd
+// CHECK: amdaie.controlcode
+// CHECK-NOT: scf.forall
+// CHECK: %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([%[[C0]], %[[C0]], %[[C32]], %[[C32]]] [%[[C5]], %[[C4]], %[[C8]], %[[C16]]] [%[[C48]], %[[C1024]], %[[C16]], %[[C1]]], [] [] [])
+// CHECK: amdaie.npu.dma_wait(%[[NPU_DMA]], S2MM)
+#map = affine_map<(d0) -> (d0 * 16)>
+#map1 = affine_map<(d0) -> (d0 * 32)>
+func.func @forall_with_affine_non_normalized(%arg0: !amdaie.logicalobjectfifo>, %arg1: !amdaie.logicalobjectfifo>) {
+  %c1 = arith.constant 1 : index
+  %c2 = arith.constant 2 : index
+  %c6 = arith.constant 6 : index
+  amdaie.workgroup {
+    %0 = amdaie.circular_dma_cpy_nd(%arg0[] [] [], %arg1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>)
+    amdaie.controlcode {
+      scf.forall (%arg2, %arg3) = (2, 1) to (17, 8) step (3, 2) {
+        %1 = affine.apply #map(%arg2)
+        %2 = affine.apply #map1(%arg3)
+        %3 = amdaie.npu.dma_cpy_nd %0([%2, %1] [8, 16] [16, 1], [] [] [])
+        amdaie.npu.dma_wait(%3, S2MM)
+      }
+      amdaie.end
+    }
+  }
+  return
+}
+
+//===----------------------------------------------------------------------===//
+// Checks for dependencies on nested loops
+//===----------------------------------------------------------------------===//
+
+// -----
+
+// CHECK-LABEL: @nested_dependencies
+// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index
+// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index
+// CHECK-DAG: %[[C3:.+]] = arith.constant 3 : index
+// CHECK-DAG: %[[C6:.+]] = arith.constant 6 : index
+// CHECK-DAG: %[[C8:.+]] = arith.constant 8 : index
+// CHECK-DAG: %[[C16:.+]] = arith.constant 16 : index
+// CHECK-DAG: %[[C32:.+]] = arith.constant 32 : index
+// CHECK: %[[CIRC_DMA:.+]] = amdaie.circular_dma_cpy_nd
+// CHECK: amdaie.controlcode
+// CHECK-NOT: scf.forall
+// CHECK-NOT: scf.for
+// CHECK: %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([%[[C0]], %[[C0]], %[[C1]], %[[C0]]] [%[[C6]], %[[C3]], %[[C16]], %[[C8]]] [%[[C32]], %[[C32]], %[[C16]], %[[C1]]], [] [] [])
+// CHECK: amdaie.npu.dma_wait(%[[NPU_DMA]], S2MM)
+#map = affine_map<(d0) -> (d0 * 16)>
+#map1 = affine_map<(d0) -> (d0 * 32)>
+func.func @nested_dependencies(%arg0: !amdaie.logicalobjectfifo>, %arg1: !amdaie.logicalobjectfifo>) {
+  %c1 = arith.constant 1 : index
+  %c2 = arith.constant 2 : index
+  %c6 = arith.constant 6 : index
+  amdaie.workgroup {
+    %0 = amdaie.circular_dma_cpy_nd(%arg0[] [] [], %arg1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>)
+    amdaie.controlcode {
+      scf.forall (%arg2, %arg3) in (2, 6) {
+        %1 = affine.apply #map(%arg2)
+        %2 = affine.apply #map1(%arg3)
+        scf.for %arg4 = %c1 to %c6 step %c2 {
+          %3 = amdaie.npu.dma_cpy_nd %0([%arg4, %2] [16, 8] [16, 1], [] [] [])
+          amdaie.npu.dma_wait(%3, S2MM)
+        }
+      }
+      amdaie.end
+    }
+  }
+  return
+}
+
+//===----------------------------------------------------------------------===//
+// Checks for dependencies via induction variables (no affine.apply) on both
+// source and target sides.
+//===----------------------------------------------------------------------===//
+
+// -----
+
+// CHECK-LABEL: @for_with_induction_var_normalized
+// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index
+// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index
+// CHECK-DAG: %[[C6:.+]] = arith.constant 6 : index
+// CHECK-DAG: %[[C16:.+]] = arith.constant 16 : index
+// CHECK: %[[CIRC_DMA:.+]] = amdaie.circular_dma_cpy_nd
+// CHECK: amdaie.controlcode
+// CHECK-NOT: scf.for
+// CHECK: %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([%[[C0]], %[[C0]]] [%[[C6]], %[[C16]]] [%[[C1]], %[[C1]]], [] [] [])
+// CHECK: amdaie.npu.dma_wait(%[[NPU_DMA]], S2MM)
+func.func @for_with_induction_var_normalized(%arg0: !amdaie.logicalobjectfifo>, %arg1: !amdaie.logicalobjectfifo>) {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c6 = arith.constant 6 : index
+  amdaie.workgroup {
+    %0 = amdaie.circular_dma_cpy_nd(%arg0[] [] [], %arg1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>)
+    amdaie.controlcode {
+      scf.for %arg2 = %c0 to %c6 step %c1 {
+        %2 = amdaie.npu.dma_cpy_nd %0([%arg2] [16] [1], [] [] [])
+        amdaie.npu.dma_wait(%2, S2MM)
+      }
+      amdaie.end
+    }
+  }
+  return
+}
+
+// -----
+
+// CHECK-LABEL: @for_with_induction_var_non_normalized
+// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index
+// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index
+// CHECK-DAG: %[[C2:.+]] = arith.constant 2 : index
+// CHECK-DAG: %[[C3:.+]] = arith.constant 3 : index
+// CHECK-DAG: %[[C16:.+]] = arith.constant 16 : index
+// CHECK: %[[CIRC_DMA:.+]] = amdaie.circular_dma_cpy_nd
+// CHECK: amdaie.controlcode
+// CHECK-NOT: scf.for
+// CHECK: %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([%[[C0]], %[[C1]]] [%[[C3]], %[[C16]]] [%[[C2]], %[[C1]]], [] [] [])
+// CHECK: amdaie.npu.dma_wait(%[[NPU_DMA]], S2MM)
+func.func @for_with_induction_var_non_normalized(%arg0: !amdaie.logicalobjectfifo>, %arg1: !amdaie.logicalobjectfifo>) {
+  %c1 = arith.constant 1 : index
+  %c2 = arith.constant 2 : index
+  %c6 = arith.constant 6 : index
+  amdaie.workgroup {
+    %0 = amdaie.circular_dma_cpy_nd(%arg0[] [] [], %arg1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>)
+    amdaie.controlcode {
+      scf.for %arg2 = %c1 to %c6 step %c2 {
+        %2 = amdaie.npu.dma_cpy_nd %0([%arg2] [16] [1], [] [] [])
+        amdaie.npu.dma_wait(%2, S2MM)
+      }
+      amdaie.end
+    }
+  }
+  return
+}
+
+// -----
+
+// CHECK-LABEL: @forall_with_induction_var_normalized
+// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index
+// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index
+// CHECK-DAG: %[[C8:.+]] = arith.constant 8 : index
+// CHECK-DAG: %[[C16:.+]] = arith.constant 16 : index
+// CHECK-DAG: %[[C17:.+]] = arith.constant 17 : index
+// CHECK: %[[CIRC_DMA:.+]] = amdaie.circular_dma_cpy_nd
+// CHECK: amdaie.controlcode
+// CHECK-NOT: scf.forall
+// CHECK: %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([%[[C0]], %[[C0]], %[[C0]], %[[C0]]] [%[[C17]], %[[C8]], %[[C8]], %[[C16]]]
[%[[C1]], %[[C16]], %[[C16]], %[[C1]]], [] [] []) +// CHECK: amdaie.npu.dma_wait(%[[NPU_DMA]], S2MM) +func.func @forall_with_induction_var_normalized(%arg0: !amdaie.logicalobjectfifo>, %arg1: !amdaie.logicalobjectfifo>) { + amdaie.workgroup { + %0 = amdaie.circular_dma_cpy_nd(%arg0[] [] [], %arg1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + amdaie.controlcode { + scf.forall (%arg2, %arg3) in (17, 8) { + %3 = amdaie.npu.dma_cpy_nd %0([%arg3, %arg2] [8, 16] [16, 1], [] [] []) + amdaie.npu.dma_wait(%3, S2MM) + } + amdaie.end + } + } + return +} + +// ----- + +// CHECK-LABEL: @forall_with_induction_var_non_normalized +// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index +// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index +// CHECK-DAG: %[[C2:.+]] = arith.constant 2 : index +// CHECK-DAG: %[[C3:.+]] = arith.constant 3 : index +// CHECK-DAG: %[[C4:.+]] = arith.constant 4 : index +// CHECK-DAG: %[[C5:.+]] = arith.constant 5 : index +// CHECK-DAG: %[[C8:.+]] = arith.constant 8 : index +// CHECK-DAG: %[[C16:.+]] = arith.constant 16 : index +// CHECK-DAG: %[[C32:.+]] = arith.constant 32 : index +// CHECK: %[[CIRC_DMA:.+]] = amdaie.circular_dma_cpy_nd +// CHECK: amdaie.controlcode +// CHECK-NOT: scf.forall +// CHECK: %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([%[[C0]], %[[C0]], %[[C1]], %[[C2]]] [%[[C5]], %[[C4]], %[[C8]], %[[C16]]] [%[[C3]], %[[C32]], %[[C16]], %[[C1]]], [] [] []) +// CHECK: amdaie.npu.dma_wait(%[[NPU_DMA]], S2MM) +func.func @forall_with_induction_var_non_normalized(%arg0: !amdaie.logicalobjectfifo>, %arg1: !amdaie.logicalobjectfifo>) { + amdaie.workgroup { + %0 = amdaie.circular_dma_cpy_nd(%arg0[] [] [], %arg1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + amdaie.controlcode { + scf.forall (%arg2, %arg3) = (2, 1) to (17, 8) step (3, 2) { + %3 = amdaie.npu.dma_cpy_nd %0([%arg3, %arg2] [8, 16] [16, 1], [] [] []) + amdaie.npu.dma_wait(%3, S2MM) + } + amdaie.end + } + } + return +} diff --git a/tests/samples/matmul_peeled_objectfifo.mlir b/tests/samples/matmul_peeled_objectfifo.mlir index f0e33f711..70fd8b66a 100644 --- a/tests/samples/matmul_peeled_objectfifo.mlir +++ b/tests/samples/matmul_peeled_objectfifo.mlir @@ -1,4 +1,4 @@ -// RUN: iree-opt --pass-pipeline="builtin.module(fold-memref-alias-ops,iree-amdaie-pack-to-dma,air-copy-to-dma,iree-amdaie-air-dma-to-amdaie-dma,iree-amdaie-insert-cores,cse,iree-amdaie-localize-logicalobjectfifo,iree-amdaie-distribute-cores-and-objectfifos,cse,canonicalize,iree-amdaie-dma-to-circular-dma,func.func(iree-amdaie-create-aie-workgroup),cse,iree-amdaie-canonicalize-doubly-strided-op,iree-amdaie-access-to-acquire-release,cse,canonicalize,iree-amdaie-controlcode-loop-unroll,cse,canonicalize,iree-amdaie-create-logical-objectfifo-link,iree-amdaie-canonicalize-doubly-strided-op,iree-amdaie-lower-to-aie,canonicalize)" --split-input-file %s | FileCheck %s +// RUN: iree-opt 
--pass-pipeline="builtin.module(fold-memref-alias-ops,iree-amdaie-pack-to-dma,air-copy-to-dma,iree-amdaie-air-dma-to-amdaie-dma,iree-amdaie-insert-cores,cse,iree-amdaie-localize-logicalobjectfifo,iree-amdaie-distribute-cores-and-objectfifos,cse,canonicalize,iree-amdaie-dma-to-circular-dma,func.func(iree-amdaie-create-aie-workgroup),cse,iree-amdaie-canonicalize-doubly-strided-op,iree-amdaie-access-to-acquire-release,cse,canonicalize,iree-amdaie-dma-loop-subsumption,cse,canonicalize,iree-amdaie-controlcode-loop-unroll,cse,canonicalize,iree-amdaie-create-logical-objectfifo-link,iree-amdaie-canonicalize-doubly-strided-op,iree-amdaie-lower-to-aie,canonicalize)" --split-input-file %s | FileCheck %s // CHECK: aie.device(npu1_4col) // CHECK-DAG: %[[TILE_0_2:.+]] = aie.tile(0, 2)