diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.td b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.td
index 4dbf77ebb..e614580de 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.td
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.td
@@ -255,6 +255,47 @@ def AMDAIE_NpuDmaCpyNdOp: AMDAIE_Op<"npu.dma_cpy_nd",
     CircularDmaCpyNdOp getDmaCpyNdOp() {
       return dyn_cast<CircularDmaCpyNdOp>(getDma().getDefiningOp());
     }
+
+    // Return the source memref type. This is retrieved using information from
+    // the input DMA operation.
+    MemRefType getSourceMemrefType() {
+      return cast<LogicalObjectFifoType>(getDmaCpyNdOp().getSourceType())
+          .getElementType();
+    }
+
+    // Return the source memory space as an attribute.
+    Attribute getSourceMemorySpace() {
+      return getSourceMemrefType().getMemorySpace();
+    }
+
+    // Helper method to return the source memory space as an integer. If there
+    // is no memory space attribute, this indicates the global memory space and
+    // we return 0. Otherwise, cast the memory space attribute to an integer.
+    uint64_t getSourceMemorySpaceAsUInt() {
+      Attribute memSpace = getSourceMemorySpace();
+      return memSpace ? cast<IntegerAttr>(memSpace).getInt() : 0;
+    }
+
+    // Return the target memref type. This is retrieved using information from
+    // the input DMA operation.
+    MemRefType getTargetMemrefType() {
+      return cast<LogicalObjectFifoType>(getDmaCpyNdOp().getTargetType())
+          .getElementType();
+    }
+
+    // Return the target memory space as an attribute.
+    Attribute getTargetMemorySpace() {
+      return getTargetMemrefType().getMemorySpace();
+    }
+
+    // Helper method to return the target memory space as an integer. If there
+    // is no memory space attribute, this indicates the global memory space and
+    // we return 0. Otherwise, cast the memory space attribute to an integer.
+    uint64_t getTargetMemorySpaceAsUInt() {
+      Attribute memSpace = getTargetMemorySpace();
+      return memSpace ? cast<IntegerAttr>(memSpace).getInt() : 0;
+    }
+
     // A utility to create a new doubly strided operation from this one with a
     // new set of source and target offsets, sizes and strides.
     DoublyStridedOpInterface createDoublyStridedOp(
@@ -496,7 +537,7 @@ def AMDAIE_LogicalObjectFifoFromMemrefOp
     // Else cast the memory space attribute to an integer.
     uint64_t getMemorySpaceAsUInt() {
       Attribute memSpace = getMemorySpace();
-      return memSpace ? dyn_cast<IntegerAttr>(memSpace).getInt() : 0;
+      return memSpace ? cast<IntegerAttr>(memSpace).getInt() : 0;
     }
 
     // Return the source memref type.
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIETargetModel.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIETargetModel.h
new file mode 100644
index 000000000..a50c8d0f8
--- /dev/null
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIETargetModel.h
@@ -0,0 +1,63 @@
+// Copyright 2024 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// In the absence of a complete hardware model interface, this file contains
+// some constants to describe hardware-related parameters used in
+// transformations. This is meant to be temporary.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef IREE_COMPILER_AMDAIE_TARGET_MODEL_H_
+#define IREE_COMPILER_AMDAIE_TARGET_MODEL_H_
+
+namespace mlir::iree_compiler::AMDAIE {
+
+//===----------------------------------------------------------------------===//
+//
+// DMA iteration dimensions
+//
+// DMAs support multi-dimensional addressing through buffer descriptors in two
+// ways:
+// 1. Intra-iteration access pattern. Specified via 'strides' ('steps' in
+// buffer descriptor lingo), 'sizes' ('wraps' in buffer descriptor lingo) and
+// 'padding'. When a DMA executes a buffer descriptor, it will access the data
+// (read/write) as specified by the intra-iteration access pattern.
+// 2. Inter-iteration access pattern. Specified via an iteration 'stride',
+// 'size' and 'current_iteration' ('stride' is the same as 'stepsize' and
+// 'size' is the same as 'wrap' in buffer descriptor lingo). Here,
+// 'current_iteration' keeps track of the current execution iteration of the
+// buffer descriptor and is incremented after buffer descriptor execution. The
+// 'stride' is the offset to be used for each execution of the buffer
+// descriptor, relative to the previous one. When 'current_iteration' is equal
+// to 'size', 'current_iteration' is reset to zero.
+//
+// Although all DMAs use the same buffer descriptor format to describe the
+// execution configuration, the intra-iteration and inter-iteration dimensions
+// are typically used for different purposes on different DMAs. See for example
+// the usage of these constants inside the DMA loop subsumption pass.
+//
+//===----------------------------------------------------------------------===//
+
+/// Shim DMAs support 3 intra-iteration dimensions + 1 inter-iteration
+/// dimension.
+static const int64_t kAMDAIEShimDmaNbIntraDims = 3;
+static const int64_t kAMDAIEShimDmaNbInterDims = 1;
+
+/// MemTile DMAs support 4 intra-iteration dimensions + 1 inter-iteration
+/// dimension.
+static const int64_t kAMDAIEMemTileDmaNbIntraDims = 4;
+static const int64_t kAMDAIEMemTileDmaNbInterDims = 1;
+
+/// Core DMAs support 3 intra-iteration dimensions + 1 inter-iteration
+/// dimension.
+static const int64_t kAMDAIECoreDmaNbIntraDims = 3;
+static const int64_t kAMDAIECoreDmaNbInterDims = 1;
+
+} // namespace mlir::iree_compiler::AMDAIE
+
+#endif // IREE_COMPILER_AMDAIE_TARGET_MODEL_H_
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/CMakeLists.txt b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/CMakeLists.txt
index c669deaca..6cc4b4c05 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/CMakeLists.txt
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/CMakeLists.txt
@@ -15,6 +15,7 @@ iree_cc_library(
     "AMDAIEAttrs.h"
     "AMDAIEDialect.h"
     "AMDAIEOps.h"
+    "AMDAIETargetModel.h"
     "AMDAIETypes.h"
   TEXTUAL_HDRS
     "AMDAIEAttrs.cpp.inc"
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDmaLoopSubsumption.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDmaLoopSubsumption.cpp
new file mode 100644
index 000000000..6c774b593
--- /dev/null
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDmaLoopSubsumption.cpp
@@ -0,0 +1,433 @@
+// Copyright 2024 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the transformation that subsumes a loop iteration into a
+// DMA access pattern if possible. This adds an additional dimension to the
+// DMA's access pattern and hoists the DMA operation out of the loop. This
+// transformation is possible if:
+//
+// - The loop's bounds and step size are all constants.
+// - The DMA is only operated on once within the loop's scope. Otherwise,
+// subsumption of the loop iteration into the DMA can change the temporal
+// behaviour of the program.
+// - The DMA has additional available access pattern dimensions. This
+// information is retrieved from a target hardware model.
+//
+//===----------------------------------------------------------------------===//
+
+#include "iree-amd-aie/IR/AMDAIEDialect.h"
+#include "iree-amd-aie/IR/AMDAIEOps.h"
+#include "iree-amd-aie/IR/AMDAIETargetModel.h"
+#include "iree-amd-aie/Transforms/Passes.h"
+#include "mlir/Dialect/Affine/IR/AffineOps.h"
+#include "mlir/Dialect/SCF/IR/SCF.h"
+#include "mlir/IR/AffineExprVisitor.h"
+#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
+
+#define DEBUG_TYPE "iree-amdaie-dma-loop-subsumption"
+
+namespace mlir::iree_compiler::AMDAIE {
+
+namespace {
+
+/// Return an ancestor of 'op' in 'block', or nullptr if no such ancestor.
+Operation *getAncestorInBlock(Operation *op, Block *block) {
+  if (!op || !block) return nullptr;
+  auto parent = op;
+  while (parent && (parent->getBlock() != block))
+    parent = parent->getParentOp();
+  return parent;
+}
+
+/// Utility affine expression visitor to retrieve the stride from the
+/// expression.
+struct RetrieveStrideSize : public AffineExprVisitor<RetrieveStrideSize> {
+  std::optional<int64_t> stride;
+  void visitMulExpr(AffineBinaryOpExpr expr) {
+    if (auto rhsSize = dyn_cast<AffineConstantExpr>(expr.getRHS());
+        isa<AffineDimExpr>(expr.getLHS())) {
+      stride = rhsSize.getValue();
+    } else if (auto lhsSize = dyn_cast<AffineConstantExpr>(expr.getLHS());
+               isa<AffineDimExpr>(expr.getRHS())) {
+      stride = lhsSize.getValue();
+    }
+  }
+};
+
+/// Utility to clean up the DMA users after loop subsumption + hoisting. This
+/// will hoist `amdaie.npu.dma_cpy_nd`'s users like `npu.dma_wait` as well.
+LogicalResult moveUsersToHoistedDMAScope(Operation *parentOp) {
+  IRRewriter rewriter(parentOp->getContext());
+  // Move the `amdaie.npu.dma_wait` operation after the parent op in the same
+  // block as the input `amdaie.npu.dma_cpy_nd` operation. This parent op will
+  // typically be a loop out of which the DMA operation has been hoisted.
+  // Moving the wait operation after this loop is important to avoid a deadlock
+  // with whatever operations are still remaining inside the loop's scope.
+  WalkResult res = parentOp->walk([&](AMDAIE::NpuDmaWaitOp npuDmaWaitOp) {
+    Operation *dmaOp = npuDmaWaitOp.getDma().getDefiningOp();
+    Operation *ancestorInSameBlock =
+        getAncestorInBlock(npuDmaWaitOp, dmaOp->getBlock());
+    if (!ancestorInSameBlock) {
+      npuDmaWaitOp->emitOpError(
+          "doesn't have an ancestor in the same scope as the source DMA op");
+      return WalkResult::interrupt();
+    }
+    rewriter.moveOpAfter(npuDmaWaitOp, ancestorInSameBlock);
+    return WalkResult::advance();
+  });
+  if (res.wasInterrupted()) return failure();
+  return success();
+}
+
+class SubsumeLoopIntoDMA
+    : public OpInterfaceRewritePattern<AMDAIE::DoublyStridedOpInterface> {
+  using OpInterfaceRewritePattern::OpInterfaceRewritePattern;
+
+  /// Utility to add a loop iteration to an offsets/sizes/strides access
+  /// pattern.
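+  ///
+  /// As an illustrative sketch (hypothetical values, consistent with the
+  /// logic below): subsuming `scf.for %i = %c0 to %c6 step %c1`, where `%i`
+  /// is directly used as an offset, into an access pattern with offsets
+  /// `[0, %i]`, sizes `[8, 16]` and strides `[16, 1]` inserts one new
+  /// outermost dimension, yielding offsets `[0, 0, 0]`, sizes `[6, 8, 16]`
+  /// and strides `[1, 16, 1]`.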
+  LogicalResult addIterationToAccessPattern(
+      RewriterBase &rewriter, int64_t lowerBound, int64_t upperBound,
+      int64_t step, const DenseSet<Value> &inductionValues,
+      SmallVector<OpFoldResult> &newOffsets,
+      SmallVector<OpFoldResult> &newSizes,
+      SmallVector<OpFoldResult> &newStrides) const {
+    SmallVector<OpFoldResult> insertOffsets;
+    SmallVector<OpFoldResult> insertSizes;
+    SmallVector<OpFoldResult> insertStrides;
+    for (auto &&[i, offset] : llvm::enumerate(newOffsets)) {
+      Value offsetValue = getValueOrCreateConstantIndexOp(
+          rewriter, rewriter.getUnknownLoc(), offset);
+      if (inductionValues.contains(offsetValue)) {
+        // Initialize the offsetStride to 1. This handles the case where an
+        // induction variable is directly used as an offset inside a strided
+        // operation.
+        int64_t offsetStride = 1;
+        // If the offset value is determined by an affine expression, retrieve
+        // the affine expression's stride multiplier and calculate the actual
+        // offset stride.
+        if (offsetValue.getDefiningOp() &&
+            isa<affine::AffineApplyOp>(offsetValue.getDefiningOp())) {
+          auto applyOp =
+              cast<affine::AffineApplyOp>(offsetValue.getDefiningOp());
+          // Retrieve the stride from the affine map using an affine expression
+          // visitor. This is the place where invalid maps are filtered out.
+          // Invalid cases will have `retriever.stride == std::nullopt` after
+          // visiting.
+          AffineMap affineMap = applyOp.getAffineMap();
+          RetrieveStrideSize retriever;
+          retriever.visit(affineMap.getResult(0));
+          if (!retriever.stride) return failure();
+          offsetStride *= retriever.stride.value();
+        }
+
+        // Multiplying by the step size handles the non-normalized case.
+        int64_t stride =
+            getConstantIntValue(newStrides[i]).value() * offsetStride * step;
+
+        newOffsets[i] = getAsIndexOpFoldResult(rewriter.getContext(),
+                                               lowerBound * offsetStride);
+        insertOffsets.push_back(
+            getAsIndexOpFoldResult(rewriter.getContext(), 0));
+
+        // The new size is equal to the number of iterations:
+        // ceilDiv(upperBound - lowerBound, step).
+        int64_t diff = upperBound - lowerBound;
+        assert(diff > 0 &&
+               "expected positive difference between upper bound and lower "
+               "bound");
+        assert(step > 0 && "expected positive step");
+        int64_t newSize = 1 + ((diff - 1) / step);
+        insertSizes.push_back(
+            getAsIndexOpFoldResult(rewriter.getContext(), newSize));
+
+        insertStrides.push_back(
+            getAsIndexOpFoldResult(rewriter.getContext(), stride));
+      }
+    }
+    newOffsets.insert(newOffsets.begin(), insertOffsets.begin(),
+                      insertOffsets.end());
+    newSizes.insert(newSizes.begin(), insertSizes.begin(), insertSizes.end());
+    newStrides.insert(newStrides.begin(), insertStrides.begin(),
+                      insertStrides.end());
+    return success();
+  }
+
+  /// Rewrite function for a doubly strided operation with any loop-like parent
+  /// operation.
+  LogicalResult rewriteWithLoopLikeOpParent(
+      AMDAIE::DoublyStridedOpInterface op, PatternRewriter &rewriter,
+      size_t sourceMaxNbDims, size_t targetMaxNbDims,
+      const SmallVector<int64_t> &lowerBounds,
+      const SmallVector<int64_t> &upperBounds,
+      const SmallVector<int64_t> &steps,
+      const SmallVector<DenseSet<Value>> &inductionValues,
+      const DenseSet<Value> &allInductionValues) const {
+    auto loopOp = dyn_cast<LoopLikeOpInterface>(op->getParentOp());
+    if (!loopOp) return failure();
+
+    // Initialize new access pattern offsets/sizes/strides with current values.
+    SmallVector<OpFoldResult> newSourceOffsets = op.getSourceMixedOffsets();
+    SmallVector<OpFoldResult> newSourceSizes = op.getSourceMixedSizes();
+    SmallVector<OpFoldResult> newSourceStrides = op.getSourceMixedStrides();
+    SmallVector<OpFoldResult> newTargetOffsets = op.getTargetMixedOffsets();
+    SmallVector<OpFoldResult> newTargetSizes = op.getTargetMixedSizes();
+    SmallVector<OpFoldResult> newTargetStrides = op.getTargetMixedStrides();
+
+    // Use source/target maxNbDims to check whether there are sufficient source
+    // and target dimensions. Otherwise, abort.
+    auto verifyNbDimsNeeded = [&](const SmallVector<Value> &dynamicOffsets,
+                                  size_t nbOffsets,
+                                  size_t maxNbDims) -> LogicalResult {
+      size_t counter = 0;
+      for (Value offset : dynamicOffsets)
+        if (allInductionValues.contains(offset)) counter++;
+      if (nbOffsets + counter > maxNbDims) return failure();
+      return success();
+    };
+    SmallVector<Value> dynamicSourceOffsets = op.getSourceOffsets();
+    SmallVector<Value> dynamicTargetOffsets = op.getTargetOffsets();
+    if (failed(verifyNbDimsNeeded(dynamicSourceOffsets,
+                                  newSourceOffsets.size(), sourceMaxNbDims)))
+      return failure();
+    if (failed(verifyNbDimsNeeded(dynamicTargetOffsets,
+                                  newTargetOffsets.size(), targetMaxNbDims)))
+      return failure();
+
+    // Add the loop iterations to the DMA access patterns.
+    for (auto &&[lb, ub, step, iterationIvValues] : llvm::reverse(
+             llvm::zip(lowerBounds, upperBounds, steps, inductionValues))) {
+      // Add loop iteration to the access pattern on the source side.
+      if (failed(addIterationToAccessPattern(
+              rewriter, lb, ub, step, iterationIvValues, newSourceOffsets,
+              newSourceSizes, newSourceStrides))) {
+        return failure();
+      }
+      // Add loop iteration to the access pattern on the target side.
+      if (failed(addIterationToAccessPattern(
+              rewriter, lb, ub, step, iterationIvValues, newTargetOffsets,
+              newTargetSizes, newTargetStrides))) {
+        return failure();
+      }
+    }
+
+    assert(newSourceOffsets.size() == newSourceSizes.size() &&
+           "expected same number of source offsets and sizes");
+    assert(newSourceOffsets.size() == newSourceStrides.size() &&
+           "expected same number of source offsets and strides");
+    assert(newTargetOffsets.size() == newTargetSizes.size() &&
+           "expected same number of target offsets and sizes");
+    assert(newTargetOffsets.size() == newTargetStrides.size() &&
+           "expected same number of target offsets and strides");
+
+    // Create new doubly strided operation with the updated access pattern and
+    // move it before the loop.
+    rewriter.setInsertionPoint(loopOp);
+    auto newDoublyStridedOp = op.createDoublyStridedOp(
+        rewriter, newTargetOffsets, newTargetSizes, newTargetStrides,
+        newSourceOffsets, newSourceSizes, newSourceStrides);
+    rewriter.replaceOp(op, newDoublyStridedOp.getOperation());
+    return success();
+  }
+
+  /// Main rewrite function for a doubly strided operation with a `scf.for`
+  /// parent operation. Only handle a loop induction variable with an
+  /// optional `affine.apply` user for now.
+  LogicalResult rewriteWithForOpParent(AMDAIE::DoublyStridedOpInterface op,
+                                       PatternRewriter &rewriter,
+                                       size_t sourceMaxNbDims,
+                                       size_t targetMaxNbDims) const {
+    auto forOp = dyn_cast<scf::ForOp>(op->getParentOp());
+    if (!forOp) return failure();
+
+    // Dynamic bounds or step are not supported.
+    std::optional<int64_t> lowerBound =
+        getConstantIntValue(forOp.getLowerBound());
+    std::optional<int64_t> upperBound =
+        getConstantIntValue(forOp.getUpperBound());
+    std::optional<int64_t> step = getConstantIntValue(forOp.getStep());
+    if (!lowerBound || !upperBound || !step) return failure();
+
+    // Only handle a loop induction variable with an optional `affine.apply`
+    // user for now.
+    Value iv = forOp.getInductionVar();
+    DenseSet<Value> curIvValues = {iv};
+    for (OpOperand &use : iv.getUses()) {
+      if (!use.getOwner()) continue;
+      if (auto userApplyOp = dyn_cast<affine::AffineApplyOp>(use.getOwner())) {
+        curIvValues.insert(userApplyOp.getResult());
+      }
+    }
+    if (!llvm::any_of(op->getOperands(), [&](Value operand) {
+          return curIvValues.contains(operand);
+        })) {
+      return failure();
+    }
+
+    SmallVector<int64_t> lowerBounds = {lowerBound.value()};
+    SmallVector<int64_t> upperBounds = {upperBound.value()};
+    SmallVector<int64_t> steps = {step.value()};
+    SmallVector<DenseSet<Value>> inductionValues = {curIvValues};
+    return rewriteWithLoopLikeOpParent(
+        op, rewriter, sourceMaxNbDims, targetMaxNbDims, lowerBounds,
+        upperBounds, steps, inductionValues, curIvValues);
+  }
+
+  /// Main rewrite function for a doubly strided operation with a `scf.forall`
+  /// parent operation. Only handle loop induction variables with an
+  /// optional `affine.apply` user for now.
+  LogicalResult rewriteWithForallOpParent(AMDAIE::DoublyStridedOpInterface op,
+                                          PatternRewriter &rewriter,
+                                          size_t sourceMaxNbDims,
+                                          size_t targetMaxNbDims) const {
+    auto forallOp = dyn_cast<scf::ForallOp>(op->getParentOp());
+    if (!forallOp) return failure();
+
+    // Dynamic bounds or step are not supported.
+    std::optional<SmallVector<int64_t>> lowerBounds =
+        getConstantIntValues(forallOp.getMixedLowerBound());
+    std::optional<SmallVector<int64_t>> upperBounds =
+        getConstantIntValues(forallOp.getMixedUpperBound());
+    std::optional<SmallVector<int64_t>> steps =
+        getConstantIntValues(forallOp.getMixedStep());
+    if (!lowerBounds || !upperBounds || !steps) return failure();
+
+    // A set of all induction variables and optional `affine.apply` user values
+    // for easy verification of whether any of the induction variables or
+    // `affine.apply` values are being used.
+    DenseSet<Value> allInductionValues;
+    // A vector of all induction-variable-dependent values for each induction
+    // var. Includes the induction variable itself and any `affine.apply`
+    // users.
+    SmallVector<DenseSet<Value>> inductionValues;
+    for (Value iv : forallOp.getInductionVars()) {
+      DenseSet<Value> curIvValues = {iv};
+      allInductionValues.insert(iv);
+      for (Operation *userOp : iv.getUsers()) {
+        if (auto userApplyOp = dyn_cast<affine::AffineApplyOp>(userOp)) {
+          curIvValues.insert(userApplyOp.getResult());
+          allInductionValues.insert(userApplyOp.getResult());
+        }
+      }
+      inductionValues.push_back(curIvValues);
+    }
+    // Return early if the strided operation doesn't use any of the
+    // induction variable dependent values.
+    if (!llvm::any_of(op->getOperands(), [&](Value operand) {
+          return allInductionValues.contains(operand);
+        })) {
+      return failure();
+    }
+    return rewriteWithLoopLikeOpParent(op, rewriter, sourceMaxNbDims,
+                                       targetMaxNbDims, lowerBounds.value(),
+                                       upperBounds.value(), steps.value(),
+                                       inductionValues, allInductionValues);
+  }
+
+  LogicalResult matchAndRewrite(AMDAIE::DoublyStridedOpInterface op,
+                                PatternRewriter &rewriter) const override {
+    // Depending on the DMA being targeted, there can be a different number of
+    // max dimensions supported by the hardware.
+    // Consider the different cases for Shim, MemTile and Core DMAs:
+    // - Shim DMAs: As the shim DMA typically isn't synchronized with other
+    // DMAs (through semaphore locks), the inter-iteration access pattern is
+    // typically used as an additional intra-iteration access pattern,
+    // resulting in 4 DMA dimensions which can be used to address global
+    // memory data.
+    // - MemTile DMAs: As the MemTile DMAs are typically synchronized with
+    // other DMAs for stream-through and double-buffering purposes, the
+    // inter-iteration dimension can't typically be used in the same way as
+    // the intra-iteration dimensions. Therefore, for now, only the
+    // intra-iteration dimensions can be used for DMA access patterns.
+    // - Core DMAs: As the core DMAs are typically synchronized with the core
+    // processor for data access purposes (read/write), the inter-iteration
+    // dimension can't typically be used in the same way as the
+    // intra-iteration dimensions. Therefore, for now, only the
+    // intra-iteration dimensions can be used for DMA access patterns.
+    size_t sourceMaxNbDims{0};
+    size_t targetMaxNbDims{0};
+    if (auto npuDmaOp = dyn_cast<AMDAIE::NpuDmaCpyNdOp>(op.getOperation())) {
+      uint64_t sourceMemspaceInt = npuDmaOp.getSourceMemorySpaceAsUInt();
+      uint64_t targetMemspaceInt = npuDmaOp.getTargetMemorySpaceAsUInt();
+      if (sourceMemspaceInt == 0) {
+        sourceMaxNbDims = kAMDAIEShimDmaNbIntraDims + kAMDAIEShimDmaNbInterDims;
+      } else if (sourceMemspaceInt == 1) {
+        sourceMaxNbDims = kAMDAIEMemTileDmaNbIntraDims;
+      } else if (sourceMemspaceInt == 2) {
+        sourceMaxNbDims = kAMDAIECoreDmaNbIntraDims;
+      }
+      if (targetMemspaceInt == 0) {
+        targetMaxNbDims = kAMDAIEShimDmaNbIntraDims + kAMDAIEShimDmaNbInterDims;
+      } else if (targetMemspaceInt == 1) {
+        targetMaxNbDims = kAMDAIEMemTileDmaNbIntraDims;
+      } else if (targetMemspaceInt == 2) {
+        targetMaxNbDims = kAMDAIECoreDmaNbIntraDims;
+      }
+
+      // Check that the DMA this `amdaie.npu.dma_cpy_nd` operation is operating
+      // on is not being touched elsewhere within the same scope. Otherwise,
+      // the rewrite is not valid in general, as it would change the temporal
+      // usage of the source DMA.
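+      // As an illustrative example: if a second `amdaie.npu.dma_cpy_nd`
+      // operation in the same loop body were to use the same DMA, hoisting
+      // only this one out of the loop would reorder its transfers relative to
+      // the other operation's transfers across iterations.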
+      Operation *parentOp = op->getParentOp();
+      if (!parentOp) return failure();
+      Value dma = npuDmaOp.getDma();
+      for (Operation *userOp : dma.getUsers()) {
+        if (userOp != op.getOperation() && parentOp->isProperAncestor(userOp)) {
+          return failure();
+        }
+      }
+    } else {
+      return failure();
+    }
+
+    if (isa<scf::ForOp>(op->getParentOp())) {
+      return rewriteWithForOpParent(op, rewriter, sourceMaxNbDims,
+                                    targetMaxNbDims);
+    } else if (isa<scf::ForallOp>(op->getParentOp())) {
+      return rewriteWithForallOpParent(op, rewriter, sourceMaxNbDims,
+                                       targetMaxNbDims);
+    } else {
+      return failure();
+    }
+  }
+};
+
+class AMDAIEDmaLoopSubsumptionPass
+    : public impl::AMDAIEDmaLoopSubsumptionBase<AMDAIEDmaLoopSubsumptionPass> {
+ public:
+  void getDependentDialects(DialectRegistry &registry) const override {
+    registry.insert<AMDAIEDialect>();
+  }
+
+  AMDAIEDmaLoopSubsumptionPass() = default;
+  AMDAIEDmaLoopSubsumptionPass(const AMDAIEDmaLoopSubsumptionPass &pass) {}
+  void runOnOperation() override;
+};
+
+void AMDAIEDmaLoopSubsumptionPass::runOnOperation() {
+  Operation *parentOp = getOperation();
+  MLIRContext *context = &getContext();
+  RewritePatternSet patterns(context);
+  patterns.insert<SubsumeLoopIntoDMA>(context);
+  if (failed(applyPatternsAndFoldGreedily(parentOp, std::move(patterns)))) {
+    parentOp->emitOpError("failed to subsume some loops into DMA operations");
+    return signalPassFailure();
+  }
+
+  if (failed(moveUsersToHoistedDMAScope(parentOp))) {
+    parentOp->emitOpError(
+        "failed to move DMA users to correct scope after loop subsumption");
+    return signalPassFailure();
+  }
+}
+
+}  // namespace
+
+std::unique_ptr<Pass> createAMDAIEDmaLoopSubsumptionPass() {
+  return std::make_unique<AMDAIEDmaLoopSubsumptionPass>();
+}
+
+}  // namespace mlir::iree_compiler::AMDAIE
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/CMakeLists.txt b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/CMakeLists.txt
index 189ea33b3..48015859d 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/CMakeLists.txt
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/CMakeLists.txt
@@ -55,6 +55,7 @@ iree_cc_library(
     "AMDAIECreateLogicalObjectFifoLink.cpp"
     "AMDAIECreateReferenceToAllocation.cpp"
     "AMDAIEDistributeCoresAndObjectFifos.cpp"
+    "AMDAIEDmaLoopSubsumption.cpp"
     "AMDAIEDmaToCircularDma.cpp"
     "AMDAIEDmaUtils.cpp"
     "AMDAIEFuseConsumerIntoLoop.cpp"
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h
index 4dda21cca..1b1d3276f 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h
@@ -34,6 +34,7 @@ namespace mlir::iree_compiler::AMDAIE {
 #define GEN_PASS_DEF_AMDAIECREATEREFERENCETOALLOCATION
 #define GEN_PASS_DEF_AMDAIEDECOMPOSELINALGEXTPACKUNPACKTOAIR
 #define GEN_PASS_DEF_AMDAIEDISTRIBUTECORESANDOBJECTFIFOS
+#define GEN_PASS_DEF_AMDAIEDMALOOPSUBSUMPTION
 #define GEN_PASS_DEF_AMDAIEDMATOCIRCULARDMA
 #define GEN_PASS_DEF_AMDAIEFUSECONSUMERINTOLOOP
 #define GEN_PASS_DEF_AMDAIEFUSEFILLINTOFORALL
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h
index a5536df39..615b412b5 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h
@@ -101,6 +101,10 @@ std::unique_ptr<Pass> createAMDAIEDecomposeLinalgExtPackUnPackToAIRPass();
 /// operations and distribute the logical objectFifos.
 std::unique_ptr<Pass> createAMDAIEDistributeCoresAndObjectFifosPass();
 
+/// Create a pass to subsume loop iterations into DMA operations' access
+/// patterns.
+std::unique_ptr<Pass> createAMDAIEDmaLoopSubsumptionPass();
+
 /// Create a pass to convert dma operations to circular dma operations.
 std::unique_ptr<Pass> createAMDAIEDmaToCircularDmaPass();
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td
index fad708a6e..1c2fd2faa 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td
@@ -117,6 +117,12 @@ def AMDAIEDistributeCoresAndObjectFifos :
   let constructor = "mlir::iree_compiler::AMDAIE::createAMDAIEDistributeCoresAndObjectFifosPass()";
 }
 
+def AMDAIEDmaLoopSubsumption :
+    Pass<"iree-amdaie-dma-loop-subsumption"> {
+  let summary = "Subsume loop iterations into DMA operations' access patterns.";
+  let constructor = "mlir::iree_compiler::AMDAIE::createAMDAIEDmaLoopSubsumptionPass()";
+}
+
 def AMDAIEDmaToCircularDma :
     Pass<"iree-amdaie-dma-to-circular-dma"> {
   let summary = "Convert dma operations to circular dma operations.";
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt
index 29e00d47b..ca0ada6b3 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt
@@ -22,6 +22,7 @@ iree_lit_test_suite(
     "create_reference_to_allocation.mlir"
     "disable_vectorization.mlir"
     "distribute_cores_and_objectfifos.mlir"
+    "dma_loop_subsumption.mlir"
    "dma_to_circular_dma.mlir"
     "fuse_consumer_into_loop_scf_for.mlir"
     "fuse_consumer_into_loop_scf_forall.mlir"
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/dma_loop_subsumption.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/dma_loop_subsumption.mlir
new file mode 100644
index 000000000..3172a272a
--- /dev/null
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/dma_loop_subsumption.mlir
@@ -0,0 +1,807 @@
+// RUN: iree-opt --pass-pipeline="builtin.module(func.func(iree-amdaie-dma-loop-subsumption,canonicalize))" --split-input-file --verify-diagnostics %s | FileCheck %s
+
+//===----------------------------------------------------------------------===//
+// Sanity checks for cases where no modification should happen.
+//===----------------------------------------------------------------------===//
+
+// Sanity check: ensure no modification in case of no loop dependency.
+// CHECK-LABEL: @npu_dma_cpy_nd_without_loop_dependency
+// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index
+// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index
+// CHECK-DAG: %[[C6:.+]] = arith.constant 6 : index
+// CHECK: %[[CIRC_DMA:.+]] = amdaie.circular_dma_cpy_nd
+// CHECK: amdaie.controlcode
+// CHECK: scf.forall (%{{.+}}, %{{.+}}) in (2, 2)
+// CHECK: scf.for %{{.+}} = %[[C0]] to %[[C6]] step %[[C1]]
+// CHECK: %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([0, 0, 0] [1, 8, 16] [128, 16, 1], [] [] [])
+// CHECK: amdaie.npu.dma_wait(%[[NPU_DMA]], S2MM)
+#map = affine_map<(d0) -> (d0 * 16)>
+func.func @npu_dma_cpy_nd_without_loop_dependency(%arg0: !amdaie.logicalobjectfifo>, %arg1: !amdaie.logicalobjectfifo>) {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c6 = arith.constant 6 : index
+  amdaie.workgroup {
+    %0 = amdaie.circular_dma_cpy_nd(%arg0[] [] [], %arg1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>)
+    amdaie.controlcode {
+      scf.forall (%arg2, %arg3) in (2, 2) {
+        scf.for %arg4 = %c0 to %c6 step %c1 {
+          %1 = affine.apply #map(%arg4)
+          %2 = amdaie.npu.dma_cpy_nd %0([0, 0, 0] [1, 8, 16] [128, 16, 1], [] [] [])
+          amdaie.npu.dma_wait(%2, S2MM)
+        }
+      }
+      amdaie.end
+    }
+  }
+  return
+}
+
+// -----
+
+// Ensure no modification in case of a dynamic offset not originating from an induction variable.
+// CHECK-LABEL: @dynamic_non_induction_var_offset
+// CHECK-SAME: %{{.+}}: !amdaie.logicalobjectfifo>, %{{.+}}: !amdaie.logicalobjectfifo>, %[[ARG:.+]]: index
+// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index
+// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index
+// CHECK-DAG: %[[C6:.+]] = arith.constant 6 : index
+// CHECK: %[[CIRC_DMA:.+]] = amdaie.circular_dma_cpy_nd
+// CHECK: amdaie.controlcode
+// CHECK: scf.for %{{.+}} = %[[C0]] to %[[C6]] step %[[C1]]
+// CHECK: %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([%[[ARG]]] [16] [1], [] [] [])
+// CHECK: amdaie.npu.dma_wait(%[[NPU_DMA]], S2MM)
+func.func @dynamic_non_induction_var_offset(%arg0: !amdaie.logicalobjectfifo>, %arg1: !amdaie.logicalobjectfifo>, %arg2: index) {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c6 = arith.constant 6 : index
+  amdaie.workgroup {
+    %0 = amdaie.circular_dma_cpy_nd(%arg0[] [] [], %arg1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>)
+    amdaie.controlcode {
+      scf.for %arg3 = %c0 to %c6 step %c1 {
+        %2 = amdaie.npu.dma_cpy_nd %0([%arg2] [16] [1], [] [] [])
+        amdaie.npu.dma_wait(%2, S2MM)
+      }
+      amdaie.end
+    }
+  }
+  return
+}
+
+// -----
+
+// Ensure no modification in case of invalid affine expressions.
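+// Note (explanatory, matching the pass's `RetrieveStrideSize` visitor): affine
+// expressions with a constant addend, e.g. `d0 * 16 + 3`, cannot be expressed
+// as a pure stride on the induction variable, so the visitor leaves its
+// `stride` unset and the rewrite is rejected.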
+// CHECK: #[[$MAP:.+]] = affine_map<(d0) -> (d0 * 16)> +// CHECK: #[[$MAP1:.+]] = affine_map<(d0) -> (d0 * 16 + 3)> +// CHECK: #[[$MAP2:.+]] = affine_map<(d0) -> (d0 + 3)> +// CHECK: #[[$MAP3:.+]] = affine_map<(d0) -> (d0 * 16 + 48)> +// CHECK-LABEL: @invalid_affine_expr +// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index +// CHECK-DAG: %[[C2:.+]] = arith.constant 2 : index +// CHECK-DAG: %[[C6:.+]] = arith.constant 6 : index +// CHECK: %[[CIRC_DMA:.+]] = amdaie.circular_dma_cpy_nd +// CHECK: amdaie.controlcode +// CHECK: scf.for %[[ARG2:.+]] = %[[C1]] to %[[C6]] step %[[C2]] +// CHECK: %[[APPLY1:.+]] = affine.apply #[[$MAP]](%[[ARG2]]) +// CHECK: %[[APPLY2:.+]] = affine.apply #[[$MAP1]](%[[ARG2]]) +// CHECK: amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([%[[APPLY2]]] [16] [1], [%[[APPLY1]]] [16] [1]) +// CHECK: %[[APPLY3:.+]] = affine.apply #[[$MAP2]](%[[ARG2]]) +// CHECK: amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([%[[APPLY3]]] [16] [1], [] [] []) +// CHECK: %[[APPLY4:.+]] = affine.apply #[[$MAP3]](%[[ARG2]]) +// CHECK: amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([%[[APPLY4]]] [16] [1], [] [] []) +#map = affine_map<(d0) -> (d0 * 16)> +#map1 = affine_map<(d0) -> (d0 * 16 + 3)> +#map2 = affine_map<(d0) -> (d0 + 3)> +#map3 = affine_map<(d0) -> ((d0 + 3) * 16)> +func.func @invalid_affine_expr(%arg0: !amdaie.logicalobjectfifo>, %arg1: !amdaie.logicalobjectfifo>) { + %c1 = arith.constant 1 : index + %c2 = arith.constant 2 : index + %c6 = arith.constant 6 : index + amdaie.workgroup { + %0 = amdaie.circular_dma_cpy_nd(%arg0[] [] [], %arg1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + amdaie.controlcode { + scf.for %arg2 = %c1 to %c6 step %c2 { + %1 = affine.apply #map(%arg2) + %2 = affine.apply #map1(%arg2) + %3 = amdaie.npu.dma_cpy_nd %0([%2] [16] [1], [%1] [16] [1]) + %4 = affine.apply #map2(%arg2) + %5 = amdaie.npu.dma_cpy_nd %0([%4] [16] [1], [] [] []) + %6 = affine.apply #map3(%arg2) + %7 = amdaie.npu.dma_cpy_nd %0([%6] [16] [1], [] [] []) + } + amdaie.end + } + } + return +} + +// ----- + +// Ensure no modification in case of too many dimensions, i.e. 4 existing +// dimensions in the case of an `amdaie.npu.dma_cpy_nd` with target on L3. 
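+// The target access pattern already uses the shim DMA limit of
+// kAMDAIEShimDmaNbIntraDims + kAMDAIEShimDmaNbInterDims = 4 dimensions
+// (global memory, i.e. memory space 0), so no loop dimension can be added.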
+// CHECK: #[[$MAP:.+]] = affine_map<(d0) -> (d0 * 16)> +// CHECK-LABEL: @npu_dma_cpy_nd_too_many_dims_target +// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index +// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index +// CHECK-DAG: %[[C6:.+]] = arith.constant 6 : index +// CHECK: %[[CIRC_DMA:.+]] = amdaie.circular_dma_cpy_nd +// CHECK: amdaie.controlcode +// CHECK: scf.for %[[ARG2:.+]] = %[[C0]] to %[[C6]] step %[[C1]] +// CHECK: %[[APPLY:.+]] = affine.apply #[[$MAP]](%[[ARG2]]) +// CHECK: %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([0, 0, 0, %[[APPLY]]] [1, 1, 8, 16] [128, 128, 16, 1], [] [] []) +// CHECK: amdaie.npu.dma_wait(%[[NPU_DMA]], S2MM) +#map = affine_map<(d0) -> (d0 * 16)> +func.func @npu_dma_cpy_nd_too_many_dims_target(%arg0: !amdaie.logicalobjectfifo>, %arg1: !amdaie.logicalobjectfifo>) { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c6 = arith.constant 6 : index + amdaie.workgroup { + %0 = amdaie.circular_dma_cpy_nd(%arg0[] [] [], %arg1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + amdaie.controlcode { + scf.for %arg2 = %c0 to %c6 step %c1 { + %1 = affine.apply #map(%arg2) + %2 = amdaie.npu.dma_cpy_nd %0([0, 0, 0, %1] [1, 1, 8, 16] [128, 128, 16, 1], [] [] []) + amdaie.npu.dma_wait(%2, S2MM) + } + amdaie.end + } + } + return +} + +// ----- + +// Ensure no modification in case of too many dimensions, i.e. 4 existing +// dimensions in the case of an `amdaie.npu.dma_cpy_nd` with source on L3. +// CHECK: #[[$MAP:.+]] = affine_map<(d0) -> (d0 * 16)> +// CHECK-LABEL: @npu_dma_cpy_nd_too_many_dims_source +// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index +// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index +// CHECK-DAG: %[[C6:.+]] = arith.constant 6 : index +// CHECK: %[[CIRC_DMA:.+]] = amdaie.circular_dma_cpy_nd +// CHECK: amdaie.controlcode +// CHECK: scf.for %[[ARG2:.+]] = %[[C0]] to %[[C6]] step %[[C1]] +// CHECK: %[[APPLY:.+]] = affine.apply #[[$MAP]](%[[ARG2]]) +// CHECK: %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([] [] [], [0, 0, 0, %[[APPLY]]] [1, 1, 8, 16] [128, 128, 16, 1]) +// CHECK: amdaie.npu.dma_wait(%[[NPU_DMA]], S2MM) +#map = affine_map<(d0) -> (d0 * 16)> +func.func @npu_dma_cpy_nd_too_many_dims_source(%arg0: !amdaie.logicalobjectfifo>, %arg1: !amdaie.logicalobjectfifo>) { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c6 = arith.constant 6 : index + amdaie.workgroup { + %0 = amdaie.circular_dma_cpy_nd(%arg0[] [] [], %arg1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + amdaie.controlcode { + scf.for %arg2 = %c0 to %c6 step %c1 { + %1 = affine.apply #map(%arg2) + %2 = amdaie.npu.dma_cpy_nd %0([] [] [], [0, 0, 0, %1] [1, 1, 8, 16] [128, 128, 16, 1]) + amdaie.npu.dma_wait(%2, S2MM) + } + amdaie.end + } + } + return +} + +// ----- + +// Ensure no modification in case of too many dimensions, i.e. 4 existing +// dimensions in the case of an `amdaie.npu.dma_cpy_nd` with target on L2. 
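+// On L2 (memory space 1), only the kAMDAIEMemTileDmaNbIntraDims = 4
+// intra-iteration dimensions are usable, since the inter-iteration dimension
+// is reserved for synchronization, so four existing dimensions leave no room.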
+// CHECK: #[[$MAP:.+]] = affine_map<(d0) -> (d0 * 16)> +// CHECK-LABEL: @npu_dma_cpy_nd_too_many_dims_target_on_l2 +// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index +// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index +// CHECK-DAG: %[[C6:.+]] = arith.constant 6 : index +// CHECK: %[[CIRC_DMA:.+]] = amdaie.circular_dma_cpy_nd +// CHECK: amdaie.controlcode +// CHECK: scf.for %[[ARG2:.+]] = %[[C0]] to %[[C6]] step %[[C1]] +// CHECK: %[[APPLY:.+]] = affine.apply #[[$MAP]](%[[ARG2]]) +// CHECK: %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([0, 0, 0, %[[APPLY]]] [1, 1, 8, 16] [128, 128, 16, 1], [] [] []) +// CHECK: amdaie.npu.dma_wait(%[[NPU_DMA]], S2MM) +#map = affine_map<(d0) -> (d0 * 16)> +func.func @npu_dma_cpy_nd_too_many_dims_target_on_l2(%arg0: !amdaie.logicalobjectfifo>, %arg1: !amdaie.logicalobjectfifo>) { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c6 = arith.constant 6 : index + amdaie.workgroup { + %0 = amdaie.circular_dma_cpy_nd(%arg0[] [] [], %arg1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + amdaie.controlcode { + scf.for %arg2 = %c0 to %c6 step %c1 { + %1 = affine.apply #map(%arg2) + %2 = amdaie.npu.dma_cpy_nd %0([0, 0, 0, %1] [1, 1, 8, 16] [128, 128, 16, 1], [] [] []) + amdaie.npu.dma_wait(%2, S2MM) + } + amdaie.end + } + } + return +} + +// ----- + +// Ensure no modification in case of too many dimensions, i.e. 4 existing +// dimensions in the case of an `amdaie.npu.dma_cpy_nd` with source on L2. +// CHECK: #[[$MAP:.+]] = affine_map<(d0) -> (d0 * 16)> +// CHECK-LABEL: @npu_dma_cpy_nd_too_many_dims_source_on_l2 +// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index +// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index +// CHECK-DAG: %[[C6:.+]] = arith.constant 6 : index +// CHECK: %[[CIRC_DMA:.+]] = amdaie.circular_dma_cpy_nd +// CHECK: amdaie.controlcode +// CHECK: scf.for %[[ARG2:.+]] = %[[C0]] to %[[C6]] step %[[C1]] +// CHECK: %[[APPLY:.+]] = affine.apply #[[$MAP]](%[[ARG2]]) +// CHECK: %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([] [] [], [0, 0, 0, %[[APPLY]]] [1, 1, 8, 16] [128, 128, 16, 1]) +// CHECK: amdaie.npu.dma_wait(%[[NPU_DMA]], S2MM) +#map = affine_map<(d0) -> (d0 * 16)> +func.func @npu_dma_cpy_nd_too_many_dims_source_on_l2(%arg0: !amdaie.logicalobjectfifo>, %arg1: !amdaie.logicalobjectfifo>) { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c6 = arith.constant 6 : index + amdaie.workgroup { + %0 = amdaie.circular_dma_cpy_nd(%arg0[] [] [], %arg1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + amdaie.controlcode { + scf.for %arg2 = %c0 to %c6 step %c1 { + %1 = affine.apply #map(%arg2) + %2 = amdaie.npu.dma_cpy_nd %0([] [] [], [0, 0, 0, %1] [1, 1, 8, 16] [128, 128, 16, 1]) + amdaie.npu.dma_wait(%2, S2MM) + } + amdaie.end + } + } + return +} + +// ----- + +// Ensure no modification in case of too many dimensions, i.e. 3 existing +// dimensions in the case of an `amdaie.npu.dma_cpy_nd` with target on L1. 
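+// On L1 (memory space 2), only the kAMDAIECoreDmaNbIntraDims = 3
+// intra-iteration dimensions are usable, so three existing dimensions leave
+// no room.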
+// CHECK: #[[$MAP:.+]] = affine_map<(d0) -> (d0 * 16)> +// CHECK-LABEL: @npu_dma_cpy_nd_too_many_dims_target_on_l1 +// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index +// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index +// CHECK-DAG: %[[C6:.+]] = arith.constant 6 : index +// CHECK: %[[CIRC_DMA:.+]] = amdaie.circular_dma_cpy_nd +// CHECK: amdaie.controlcode +// CHECK: scf.for %[[ARG2:.+]] = %[[C0]] to %[[C6]] step %[[C1]] +// CHECK: %[[APPLY:.+]] = affine.apply #[[$MAP]](%[[ARG2]]) +// CHECK: %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([0, 0, %[[APPLY]]] [1, 8, 16] [128, 16, 1], [] [] []) +// CHECK: amdaie.npu.dma_wait(%[[NPU_DMA]], S2MM) +#map = affine_map<(d0) -> (d0 * 16)> +func.func @npu_dma_cpy_nd_too_many_dims_target_on_l1(%arg0: !amdaie.logicalobjectfifo>, %arg1: !amdaie.logicalobjectfifo>) { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c6 = arith.constant 6 : index + amdaie.workgroup { + %0 = amdaie.circular_dma_cpy_nd(%arg0[] [] [], %arg1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + amdaie.controlcode { + scf.for %arg2 = %c0 to %c6 step %c1 { + %1 = affine.apply #map(%arg2) + %2 = amdaie.npu.dma_cpy_nd %0([0, 0, %1] [1, 8, 16] [128, 16, 1], [] [] []) + amdaie.npu.dma_wait(%2, S2MM) + } + amdaie.end + } + } + return +} + +// ----- + +// Ensure no modification in case of too many dimensions, i.e. 3 existing +// dimensions in the case of an `amdaie.npu.dma_cpy_nd` with source on L1. +// CHECK: #[[$MAP:.+]] = affine_map<(d0) -> (d0 * 16)> +// CHECK-LABEL: @npu_dma_cpy_nd_too_many_dims_source_on_l1 +// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index +// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index +// CHECK-DAG: %[[C6:.+]] = arith.constant 6 : index +// CHECK: %[[CIRC_DMA:.+]] = amdaie.circular_dma_cpy_nd +// CHECK: amdaie.controlcode +// CHECK: scf.for %[[ARG2:.+]] = %[[C0]] to %[[C6]] step %[[C1]] +// CHECK: %[[APPLY:.+]] = affine.apply #[[$MAP]](%[[ARG2]]) +// CHECK: %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([] [] [], [0, 0, %[[APPLY]]] [1, 8, 16] [128, 16, 1]) +// CHECK: amdaie.npu.dma_wait(%[[NPU_DMA]], S2MM) +#map = affine_map<(d0) -> (d0 * 16)> +func.func @npu_dma_cpy_nd_too_many_dims_source_on_l1(%arg0: !amdaie.logicalobjectfifo>, %arg1: !amdaie.logicalobjectfifo>) { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c6 = arith.constant 6 : index + amdaie.workgroup { + %0 = amdaie.circular_dma_cpy_nd(%arg0[] [] [], %arg1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + amdaie.controlcode { + scf.for %arg2 = %c0 to %c6 step %c1 { + %1 = affine.apply #map(%arg2) + %2 = amdaie.npu.dma_cpy_nd %0([] [] [], [0, 0, %1] [1, 8, 16] [128, 16, 1]) + amdaie.npu.dma_wait(%2, S2MM) + } + amdaie.end + } + } + return +} + +// ----- + +// Ensure no modification in case of multiple npu.dma_cpy_nd users with the same source in the same scope. 
+// CHECK: #[[$MAP:.+]] = affine_map<(d0) -> (d0 * 16)>
+// CHECK-LABEL: @for_with_multiple_npu_dma_cpy_nd_same_source
+// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index
+// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index
+// CHECK-DAG: %[[C6:.+]] = arith.constant 6 : index
+// CHECK: %[[CIRC_DMA:.+]] = amdaie.circular_dma_cpy_nd
+// CHECK: amdaie.controlcode
+// CHECK: scf.for %[[ARG2:.+]] = %[[C0]] to %[[C6]] step %[[C1]]
+// CHECK: %[[APPLY:.+]] = affine.apply #[[$MAP]](%[[ARG2]])
+// CHECK: %[[NPU_DMA_0:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([0, 0, %[[APPLY]]] [1, 8, 16] [128, 16, 1], [] [] [])
+// CHECK: amdaie.npu.dma_wait(%[[NPU_DMA_0]], S2MM)
+// CHECK: %[[NPU_DMA_1:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([0, 0, %[[APPLY]]] [1, 8, 16] [128, 16, 1], [] [] [])
+// CHECK: amdaie.npu.dma_wait(%[[NPU_DMA_1]], S2MM)
+#map = affine_map<(d0) -> (d0 * 16)>
+func.func @for_with_multiple_npu_dma_cpy_nd_same_source(%arg0: !amdaie.logicalobjectfifo>, %arg1: !amdaie.logicalobjectfifo>) {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c6 = arith.constant 6 : index
+  amdaie.workgroup {
+    %0 = amdaie.circular_dma_cpy_nd(%arg0[] [] [], %arg1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>)
+    amdaie.controlcode {
+      scf.for %arg2 = %c0 to %c6 step %c1 {
+        %1 = affine.apply #map(%arg2)
+        %2 = amdaie.npu.dma_cpy_nd %0([0, 0, %1] [1, 8, 16] [128, 16, 1], [] [] [])
+        amdaie.npu.dma_wait(%2, S2MM)
+        %3 = amdaie.npu.dma_cpy_nd %0([0, 0, %1] [1, 8, 16] [128, 16, 1], [] [] [])
+        amdaie.npu.dma_wait(%3, S2MM)
+      }
+      amdaie.end
+    }
+  }
+  return
+}
+
+// -----
+
+// Ensure no modification in case of multiple npu.dma_cpy_nd users with the same source in the same scope.
+// CHECK: #[[$MAP:.+]] = affine_map<(d0) -> (d0 * 16)>
+// CHECK-LABEL: @forall_with_multiple_npu_dma_cpy_nd_same_source
+// CHECK: %[[CIRC_DMA:.+]] = amdaie.circular_dma_cpy_nd
+// CHECK: amdaie.controlcode
+// CHECK: scf.forall (%[[ARG2:.+]], %[[ARG3:.+]]) in (2, 6)
+// CHECK: %[[APPLY:.+]] = affine.apply #[[$MAP]](%[[ARG3]])
+// CHECK: %[[NPU_DMA_0:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([0, 0, %[[APPLY]]] [1, 8, 16] [128, 16, 1], [] [] [])
+// CHECK: amdaie.npu.dma_wait(%[[NPU_DMA_0]], S2MM)
+// CHECK: %[[NPU_DMA_1:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([0, 0, %[[APPLY]]] [1, 8, 16] [128, 16, 1], [] [] [])
+// CHECK: amdaie.npu.dma_wait(%[[NPU_DMA_1]], S2MM)
+#map = affine_map<(d0) -> (d0 * 16)>
+func.func @forall_with_multiple_npu_dma_cpy_nd_same_source(%arg0: !amdaie.logicalobjectfifo>, %arg1: !amdaie.logicalobjectfifo>) {
+  amdaie.workgroup {
+    %0 = amdaie.circular_dma_cpy_nd(%arg0[] [] [], %arg1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>)
+    amdaie.controlcode {
+      scf.forall (%arg2, %arg3) in (2, 6) {
+        %1 = affine.apply #map(%arg3)
+        %2 = amdaie.npu.dma_cpy_nd %0([0, 0, %1] [1, 8, 16] [128, 16, 1], [] [] [])
+        amdaie.npu.dma_wait(%2, S2MM)
+        %3 = amdaie.npu.dma_cpy_nd %0([0, 0, %1] [1, 8, 16] [128, 16, 1], [] [] [])
+        amdaie.npu.dma_wait(%3, S2MM)
+      }
+      amdaie.end
+    }
+  }
+  return
+}
+
+//===----------------------------------------------------------------------===//
+// Checks for dependencies via `affine.apply` on both source and target sides.
+//===----------------------------------------------------------------------===//
+
+// -----
+
+// Check that loop subsumption happens in case of an identity affine expression.
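+// The identity map carries an implicit stride of 1, so the subsumed loop below
+// becomes a new outermost dimension with size 6 and stride 1.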
+// CHECK-LABEL: @identity_affine_expr +// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index +// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index +// CHECK-DAG: %[[C6:.+]] = arith.constant 6 : index +// CHECK-DAG: %[[C16:.+]] = arith.constant 16 : index +// CHECK: %[[CIRC_DMA:.+]] = amdaie.circular_dma_cpy_nd +// CHECK: amdaie.controlcode +// CHECK-NOT: scf.for +// CHECK: amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([%[[C0]], %[[C0]]] [%[[C6]], %[[C16]]] [%[[C1]], %[[C1]]], [] [] []) +#map = affine_map<(d0) -> (d0)> +func.func @identity_affine_expr(%arg0: !amdaie.logicalobjectfifo>, %arg1: !amdaie.logicalobjectfifo>) { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c6 = arith.constant 6 : index + amdaie.workgroup { + %0 = amdaie.circular_dma_cpy_nd(%arg0[] [] [], %arg1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + amdaie.controlcode { + scf.for %arg2 = %c0 to %c6 step %c1 { + %1 = affine.apply #map(%arg2) + %3 = amdaie.npu.dma_cpy_nd %0([%1] [16] [1], [] [] []) + } + amdaie.end + } + } + return +} + +// ----- + +// CHECK-LABEL: @for_dependency_on_target +// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index +// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index +// CHECK-DAG: %[[C6:.+]] = arith.constant 6 : index +// CHECK-DAG: %[[C8:.+]] = arith.constant 8 : index +// CHECK-DAG: %[[C16:.+]] = arith.constant 16 : index +// CHECK-DAG: %[[C128:.+]] = arith.constant 128 : index +// CHECK: %[[CIRC_DMA:.+]] = amdaie.circular_dma_cpy_nd +// CHECK: amdaie.controlcode +// CHECK-NOT: scf.for +// CHECK: %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([%[[C0]], %[[C0]], %[[C0]], %[[C0]]] [%[[C6]], %[[C1]], %[[C8]], %[[C16]]] [%[[C16]], %[[C128]], %[[C16]], %[[C1]]], [] [] []) +// CHECK: amdaie.npu.dma_wait(%[[NPU_DMA]], S2MM) +#map = affine_map<(d0) -> (d0 * 16)> +func.func @for_dependency_on_target(%arg0: !amdaie.logicalobjectfifo>, %arg1: !amdaie.logicalobjectfifo>) { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c6 = arith.constant 6 : index + amdaie.workgroup { + %0 = amdaie.circular_dma_cpy_nd(%arg0[] [] [], %arg1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + amdaie.controlcode { + scf.for %arg2 = %c0 to %c6 step %c1 { + %1 = affine.apply #map(%arg2) + %2 = amdaie.npu.dma_cpy_nd %0([0, 0, %1] [1, 8, 16] [128, 16, 1], [] [] []) + amdaie.npu.dma_wait(%2, S2MM) + } + amdaie.end + } + } + return +} + +// ----- + +// CHECK-LABEL: @forall_dependency_on_target +// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index +// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index +// CHECK-DAG: %[[C6:.+]] = arith.constant 6 : index +// CHECK-DAG: %[[C8:.+]] = arith.constant 8 : index +// CHECK-DAG: %[[C16:.+]] = arith.constant 16 : index +// CHECK-DAG: %[[C128:.+]] = arith.constant 128 : index +// CHECK: %[[CIRC_DMA:.+]] = amdaie.circular_dma_cpy_nd +// CHECK: amdaie.controlcode +// CHECK-NOT: scf.forall +// CHECK: %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([%[[C0]], %[[C0]], %[[C0]], %[[C0]]] [%[[C6]], %[[C1]], %[[C8]], %[[C16]]] [%[[C16]], %[[C128]], %[[C16]], %[[C1]]], [] [] []) +// CHECK: amdaie.npu.dma_wait(%[[NPU_DMA]], S2MM) +#map = affine_map<(d0) -> (16 * d0)> +func.func @forall_dependency_on_target(%arg0: !amdaie.logicalobjectfifo>, %arg1: !amdaie.logicalobjectfifo>) { + amdaie.workgroup { + %0 = amdaie.circular_dma_cpy_nd(%arg0[] [] [], %arg1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + amdaie.controlcode { + scf.forall (%arg2, %arg3) in (2, 6) { + %1 = affine.apply #map(%arg3) + %2 = 
amdaie.npu.dma_cpy_nd %0([0, 0, %1] [1, 8, 16] [128, 16, 1], [] [] []) + amdaie.npu.dma_wait(%2, S2MM) + } + amdaie.end + } + } + return +} + +// ----- + +// CHECK-LABEL: @for_dependency_on_source +// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index +// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index +// CHECK-DAG: %[[C6:.+]] = arith.constant 6 : index +// CHECK-DAG: %[[C8:.+]] = arith.constant 8 : index +// CHECK-DAG: %[[C16:.+]] = arith.constant 16 : index +// CHECK-DAG: %[[C128:.+]] = arith.constant 128 : index +// CHECK: %[[CIRC_DMA:.+]] = amdaie.circular_dma_cpy_nd +// CHECK: amdaie.controlcode +// CHECK-NOT: scf.for +// CHECK: %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([] [] [], [%[[C0]], %[[C0]], %[[C0]], %[[C0]]] [%[[C6]], %[[C1]], %[[C8]], %[[C16]]] [%[[C16]], %[[C128]], %[[C16]], %[[C1]]]) +// CHECK: amdaie.npu.dma_wait(%[[NPU_DMA]], S2MM) +#map = affine_map<(d0) -> (d0 * 16)> +func.func @for_dependency_on_source(%arg0: !amdaie.logicalobjectfifo>, %arg1: !amdaie.logicalobjectfifo>) { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c6 = arith.constant 6 : index + amdaie.workgroup { + %0 = amdaie.circular_dma_cpy_nd(%arg0[] [] [], %arg1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + amdaie.controlcode { + scf.for %arg2 = %c0 to %c6 step %c1 { + %1 = affine.apply #map(%arg2) + %2 = amdaie.npu.dma_cpy_nd %0([] [] [], [0, 0, %1] [1, 8, 16] [128, 16, 1]) + amdaie.npu.dma_wait(%2, S2MM) + } + amdaie.end + } + } + return +} + +// ----- + +// CHECK-LABEL: @forall_dependency_on_source +// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index +// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index +// CHECK-DAG: %[[C6:.+]] = arith.constant 6 : index +// CHECK-DAG: %[[C8:.+]] = arith.constant 8 : index +// CHECK-DAG: %[[C16:.+]] = arith.constant 16 : index +// CHECK-DAG: %[[C128:.+]] = arith.constant 128 : index +// CHECK: %[[CIRC_DMA:.+]] = amdaie.circular_dma_cpy_nd +// CHECK: amdaie.controlcode +// CHECK-NOT: scf.forall +// CHECK: %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([] [] [], [%[[C0]], %[[C0]], %[[C0]], %[[C0]]] [%[[C6]], %[[C1]], %[[C8]], %[[C16]]] [%[[C16]], %[[C128]], %[[C16]], %[[C1]]]) +// CHECK: amdaie.npu.dma_wait(%[[NPU_DMA]], S2MM) +#map = affine_map<(d0) -> (d0 * 16)> +func.func @forall_dependency_on_source(%arg0: !amdaie.logicalobjectfifo>, %arg1: !amdaie.logicalobjectfifo>) { + amdaie.workgroup { + %0 = amdaie.circular_dma_cpy_nd(%arg0[] [] [], %arg1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + amdaie.controlcode { + scf.forall (%arg2, %arg3) in (2, 6) { + %1 = affine.apply #map(%arg3) + %2 = amdaie.npu.dma_cpy_nd %0([] [] [], [0, 0, %1] [1, 8, 16] [128, 16, 1]) + amdaie.npu.dma_wait(%2, S2MM) + } + amdaie.end + } + } + return +} + +// ----- + +// Check with multiple `affine.apply` usages in a `amdaie.npu.dma_cpy_nd` operation. 
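+// The same `affine.apply` result feeds two offsets, so one new dimension is
+// inserted per dependent offset: sizes [8, 16] become [6, 6, 8, 16] and
+// strides [16, 1] become [256, 16, 16, 1] (6 iterations; 16 * 16 = 256).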
+// CHECK-LABEL: @multiple_for_dependencies +// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index +// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index +// CHECK-DAG: %[[C6:.+]] = arith.constant 6 : index +// CHECK-DAG: %[[C8:.+]] = arith.constant 8 : index +// CHECK-DAG: %[[C16:.+]] = arith.constant 16 : index +// CHECK-DAG: %[[C256:.+]] = arith.constant 256 : index +// CHECK: %[[CIRC_DMA:.+]] = amdaie.circular_dma_cpy_nd +// CHECK: amdaie.controlcode +// CHECK-NOT: scf.for +// CHECK: %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([%[[C0]], %[[C0]], %[[C0]], %[[C0]]] [%[[C6]], %[[C6]], %[[C8]], %[[C16]]] [%[[C256]], %[[C16]], %[[C16]], %[[C1]]], [] [] []) +// CHECK: amdaie.npu.dma_wait(%[[NPU_DMA]], S2MM) +#map = affine_map<(d0) -> (d0 * 16)> +func.func @multiple_for_dependencies(%arg0: !amdaie.logicalobjectfifo>, %arg1: !amdaie.logicalobjectfifo>) { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c6 = arith.constant 6 : index + amdaie.workgroup { + %0 = amdaie.circular_dma_cpy_nd(%arg0[] [] [], %arg1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + amdaie.controlcode { + scf.for %arg2 = %c0 to %c6 step %c1 { + %1 = affine.apply #map(%arg2) + %2 = amdaie.npu.dma_cpy_nd %0([%1, %1] [8, 16] [16, 1], [] [] []) + amdaie.npu.dma_wait(%2, S2MM) + } + amdaie.end + } + } + return +} + +// ----- + +// CHECK-LABEL: @multiple_forall_dependencies +// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index +// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index +// CHECK-DAG: %[[C2:.+]] = arith.constant 2 : index +// CHECK-DAG: %[[C6:.+]] = arith.constant 6 : index +// CHECK-DAG: %[[C8:.+]] = arith.constant 8 : index +// CHECK-DAG: %[[C16:.+]] = arith.constant 16 : index +// CHECK-DAG: %[[C512:.+]] = arith.constant 512 : index +// CHECK: %[[CIRC_DMA:.+]] = amdaie.circular_dma_cpy_nd +// CHECK: amdaie.controlcode +// CHECK-NOT: scf.forall +// CHECK: %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([%[[C0]], %[[C0]], %[[C0]], %[[C0]]] [%[[C2]], %[[C6]], %[[C8]], %[[C16]]] [%[[C16]], %[[C512]], %[[C16]], %[[C1]]], [] [] []) +// CHECK: amdaie.npu.dma_wait(%[[NPU_DMA]], S2MM) +#map = affine_map<(d0) -> (d0 * 16)> +#map1 = affine_map<(d0) -> (d0 * 32)> +func.func @multiple_forall_dependencies(%arg0: !amdaie.logicalobjectfifo>, %arg1: !amdaie.logicalobjectfifo>) { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c6 = arith.constant 6 : index + amdaie.workgroup { + %0 = amdaie.circular_dma_cpy_nd(%arg0[] [] [], %arg1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + amdaie.controlcode { + scf.forall (%arg2, %arg3) in (2, 6) { + %1 = affine.apply #map(%arg2) + %2 = affine.apply #map1(%arg3) + %3 = amdaie.npu.dma_cpy_nd %0([%2, %1] [8, 16] [16, 1], [] [] []) + amdaie.npu.dma_wait(%3, S2MM) + } + amdaie.end + } + } + return +} + +// ----- + +// CHECK-LABEL: @non_normalized_for_with_affine +// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index +// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index +// CHECK-DAG: %[[C3:.+]] = arith.constant 3 : index +// CHECK-DAG: %[[C16:.+]] = arith.constant 16 : index +// CHECK-DAG: %[[C32:.+]] = arith.constant 32 : index +// CHECK: %[[CIRC_DMA:.+]] = amdaie.circular_dma_cpy_nd +// CHECK: amdaie.controlcode +// CHECK-NOT: scf.for +// CHECK: %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([%[[C0]], %[[C16]]] [%[[C3]], %[[C16]]] [%[[C32]], %[[C1]]], [] [] []) +// CHECK: amdaie.npu.dma_wait(%[[NPU_DMA]], S2MM) +#map = affine_map<(d0) -> (d0 * 16)> +func.func @non_normalized_for_with_affine(%arg0: 
!amdaie.logicalobjectfifo>, %arg1: !amdaie.logicalobjectfifo>) { + %c1 = arith.constant 1 : index + %c2 = arith.constant 2 : index + %c6 = arith.constant 6 : index + amdaie.workgroup { + %0 = amdaie.circular_dma_cpy_nd(%arg0[] [] [], %arg1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + amdaie.controlcode { + scf.for %arg2 = %c1 to %c6 step %c2 { + %1 = affine.apply #map(%arg2) + %2 = amdaie.npu.dma_cpy_nd %0([%1] [16] [1], [] [] []) + amdaie.npu.dma_wait(%2, S2MM) + } + amdaie.end + } + } + return +} + +// ----- + +// CHECK-LABEL: @non_normalized_forall_with_affine +// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index +// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index +// CHECK-DAG: %[[C4:.+]] = arith.constant 4 : index +// CHECK-DAG: %[[C5:.+]] = arith.constant 5 : index +// CHECK-DAG: %[[C8:.+]] = arith.constant 8 : index +// CHECK-DAG: %[[C16:.+]] = arith.constant 16 : index +// CHECK-DAG: %[[C32:.+]] = arith.constant 32 : index +// CHECK-DAG: %[[C48:.+]] = arith.constant 48 : index +// CHECK-DAG: %[[C1024:.+]] = arith.constant 1024 : index +// CHECK: %[[CIRC_DMA:.+]] = amdaie.circular_dma_cpy_nd +// CHECK: amdaie.controlcode +// CHECK-NOT: scf.forall +// CHECK: %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([%[[C0]], %[[C0]], %[[C32]], %[[C32]]] [%[[C5]], %[[C4]], %[[C8]], %[[C16]]] [%[[C48]], %[[C1024]], %[[C16]], %[[C1]]], [] [] []) +// CHECK: amdaie.npu.dma_wait(%[[NPU_DMA]], S2MM) +#map = affine_map<(d0) -> (d0 * 16)> +#map1 = affine_map<(d0) -> (d0 * 32)> +func.func @non_normalized_forall_with_affine(%arg0: !amdaie.logicalobjectfifo>, %arg1: !amdaie.logicalobjectfifo>) { + %c1 = arith.constant 1 : index + %c2 = arith.constant 2 : index + %c6 = arith.constant 6 : index + amdaie.workgroup { + %0 = amdaie.circular_dma_cpy_nd(%arg0[] [] [], %arg1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + amdaie.controlcode { + scf.forall (%arg2, %arg3) = (2, 1) to (17, 8) step (3, 2) { + %1 = affine.apply #map(%arg2) + %2 = affine.apply #map1(%arg3) + %3 = amdaie.npu.dma_cpy_nd %0([%2, %1] [8, 16] [16, 1], [] [] []) + amdaie.npu.dma_wait(%3, S2MM) + } + amdaie.end + } + } + return +} + +//===----------------------------------------------------------------------===// +// Checks for dependencies via induction variables (no affine.apply) on both +// source and target sides. 
+//===----------------------------------------------------------------------===// + +// ----- + +// CHECK-LABEL: @for_with_induction_var_normalized +// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index +// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index +// CHECK-DAG: %[[C6:.+]] = arith.constant 6 : index +// CHECK-DAG: %[[C16:.+]] = arith.constant 16 : index +// CHECK: %[[CIRC_DMA:.+]] = amdaie.circular_dma_cpy_nd +// CHECK: amdaie.controlcode +// CHECK-NOT: scf.for +// CHECK: %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([%[[C0]], %[[C0]]] [%[[C6]], %[[C16]]] [%[[C1]], %[[C1]]], [] [] []) +// CHECK: amdaie.npu.dma_wait(%[[NPU_DMA]], S2MM) +func.func @for_with_induction_var_normalized(%arg0: !amdaie.logicalobjectfifo>, %arg1: !amdaie.logicalobjectfifo>) { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c6 = arith.constant 6 : index + amdaie.workgroup { + %0 = amdaie.circular_dma_cpy_nd(%arg0[] [] [], %arg1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + amdaie.controlcode { + scf.for %arg2 = %c0 to %c6 step %c1 { + %2 = amdaie.npu.dma_cpy_nd %0([%arg2] [16] [1], [] [] []) + amdaie.npu.dma_wait(%2, S2MM) + } + amdaie.end + } + } + return +} + +// ----- + +// CHECK-LABEL: @for_with_induction_var_non_normalized +// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index +// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index +// CHECK-DAG: %[[C2:.+]] = arith.constant 2 : index +// CHECK-DAG: %[[C3:.+]] = arith.constant 3 : index +// CHECK-DAG: %[[C16:.+]] = arith.constant 16 : index +// CHECK: %[[CIRC_DMA:.+]] = amdaie.circular_dma_cpy_nd +// CHECK: amdaie.controlcode +// CHECK-NOT: scf.for +// CHECK: %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([%[[C0]], %[[C1]]] [%[[C3]], %[[C16]]] [%[[C2]], %[[C1]]], [] [] []) +// CHECK: amdaie.npu.dma_wait(%[[NPU_DMA]], S2MM) +func.func @for_with_induction_var_non_normalized(%arg0: !amdaie.logicalobjectfifo>, %arg1: !amdaie.logicalobjectfifo>) { + %c1 = arith.constant 1 : index + %c2 = arith.constant 2 : index + %c6 = arith.constant 6 : index + amdaie.workgroup { + %0 = amdaie.circular_dma_cpy_nd(%arg0[] [] [], %arg1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + amdaie.controlcode { + scf.for %arg2 = %c1 to %c6 step %c2 { + %2 = amdaie.npu.dma_cpy_nd %0([%arg2] [16] [1], [] [] []) + amdaie.npu.dma_wait(%2, S2MM) + } + amdaie.end + } + } + return +} + +// ----- + +// CHECK-LABEL: @forall_with_induction_var_normalized +// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index +// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index +// CHECK-DAG: %[[C8:.+]] = arith.constant 8 : index +// CHECK-DAG: %[[C16:.+]] = arith.constant 16 : index +// CHECK-DAG: %[[C17:.+]] = arith.constant 17 : index +// CHECK: %[[CIRC_DMA:.+]] = amdaie.circular_dma_cpy_nd +// CHECK: amdaie.controlcode +// CHECK-NOT: scf.forall +// CHECK: %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([%[[C0]], %[[C0]], %[[C0]], %[[C0]]] [%[[C17]], %[[C8]], %[[C8]], %[[C16]]] [%[[C1]], %[[C16]], %[[C16]], %[[C1]]], [] [] []) +// CHECK: amdaie.npu.dma_wait(%[[NPU_DMA]], S2MM) +func.func @forall_with_induction_var_normalized(%arg0: !amdaie.logicalobjectfifo>, %arg1: !amdaie.logicalobjectfifo>) { + amdaie.workgroup { + %0 = amdaie.circular_dma_cpy_nd(%arg0[] [] [], %arg1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + amdaie.controlcode { + scf.forall (%arg2, %arg3) in (17, 8) { + %3 = amdaie.npu.dma_cpy_nd %0([%arg3, %arg2] [8, 16] [16, 1], [] [] []) + amdaie.npu.dma_wait(%3, S2MM) + } + amdaie.end + } + } + 
return +} + +// ----- + +// CHECK-LABEL: @forall_with_induction_var_non_normalized +// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index +// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index +// CHECK-DAG: %[[C2:.+]] = arith.constant 2 : index +// CHECK-DAG: %[[C3:.+]] = arith.constant 3 : index +// CHECK-DAG: %[[C4:.+]] = arith.constant 4 : index +// CHECK-DAG: %[[C5:.+]] = arith.constant 5 : index +// CHECK-DAG: %[[C8:.+]] = arith.constant 8 : index +// CHECK-DAG: %[[C16:.+]] = arith.constant 16 : index +// CHECK-DAG: %[[C32:.+]] = arith.constant 32 : index +// CHECK: %[[CIRC_DMA:.+]] = amdaie.circular_dma_cpy_nd +// CHECK: amdaie.controlcode +// CHECK-NOT: scf.forall +// CHECK: %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([%[[C0]], %[[C0]], %[[C1]], %[[C2]]] [%[[C5]], %[[C4]], %[[C8]], %[[C16]]] [%[[C3]], %[[C32]], %[[C16]], %[[C1]]], [] [] []) +// CHECK: amdaie.npu.dma_wait(%[[NPU_DMA]], S2MM) +func.func @forall_with_induction_var_non_normalized(%arg0: !amdaie.logicalobjectfifo>, %arg1: !amdaie.logicalobjectfifo>) { + amdaie.workgroup { + %0 = amdaie.circular_dma_cpy_nd(%arg0[] [] [], %arg1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + amdaie.controlcode { + scf.forall (%arg2, %arg3) = (2, 1) to (17, 8) step (3, 2) { + %3 = amdaie.npu.dma_cpy_nd %0([%arg3, %arg2] [8, 16] [16, 1], [] [] []) + amdaie.npu.dma_wait(%3, S2MM) + } + amdaie.end + } + } + return +} diff --git a/tests/samples/matmul_peeled_objectfifo.mlir b/tests/samples/matmul_peeled_objectfifo.mlir index 47a567dff..b99349793 100644 --- a/tests/samples/matmul_peeled_objectfifo.mlir +++ b/tests/samples/matmul_peeled_objectfifo.mlir @@ -1,4 +1,4 @@ -// RUN: iree-opt --pass-pipeline="builtin.module(fold-memref-alias-ops,iree-amdaie-pack-to-dma,air-copy-to-dma,iree-amdaie-air-dma-to-amdaie-dma,iree-amdaie-insert-cores,cse,iree-amdaie-localize-logicalobjectfifo,iree-amdaie-distribute-cores-and-objectfifos,cse,canonicalize,iree-amdaie-dma-to-circular-dma,func.func(iree-amdaie-create-aie-workgroup),cse,iree-amdaie-canonicalize-doubly-strided-op,iree-amdaie-access-to-acquire-release,cse,canonicalize,iree-amdaie-controlcode-loop-unroll,cse,canonicalize,iree-amdaie-create-logical-objectfifo-link,iree-amdaie-canonicalize-doubly-strided-op,iree-amdaie-lower-to-aie,canonicalize)" --split-input-file %s | FileCheck %s +// RUN: iree-opt --pass-pipeline="builtin.module(fold-memref-alias-ops,iree-amdaie-pack-to-dma,air-copy-to-dma,iree-amdaie-air-dma-to-amdaie-dma,iree-amdaie-insert-cores,cse,iree-amdaie-localize-logicalobjectfifo,iree-amdaie-distribute-cores-and-objectfifos,cse,canonicalize,iree-amdaie-dma-to-circular-dma,func.func(iree-amdaie-create-aie-workgroup),cse,iree-amdaie-canonicalize-doubly-strided-op,iree-amdaie-access-to-acquire-release,cse,canonicalize,iree-amdaie-dma-loop-subsumption,cse,canonicalize,iree-amdaie-controlcode-loop-unroll,cse,canonicalize,iree-amdaie-create-logical-objectfifo-link,iree-amdaie-canonicalize-doubly-strided-op,iree-amdaie-lower-to-aie,canonicalize)" --split-input-file %s | FileCheck %s // CHECK: aie.device(npu1_4col) // CHECK-DAG: %[[TILE_0_2:.+]] = aie.tile(0, 2)
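
The pipeline change above only splices the new iree-amdaie-dma-loop-subsumption pass (plus the cse,canonicalize cleanups already used between the other passes) into the existing flow. Assuming the pass is registered under that flag, as the RUN line suggests, a minimal standalone driver for the subsumption tests above would presumably look like:

// RUN: iree-opt --pass-pipeline="builtin.module(iree-amdaie-dma-loop-subsumption,cse,canonicalize)" --split-input-file %s | FileCheck %s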