diff --git a/build_tools/ci/run_matmul_test.sh b/build_tools/ci/run_matmul_test.sh
index 5c30d8579..af51628a8 100755
--- a/build_tools/ci/run_matmul_test.sh
+++ b/build_tools/ci/run_matmul_test.sh
@@ -735,6 +735,7 @@ run_matmul_test \
     --acc_type "f32" \
     --m "8192" --k "2432" --n "7296"
 
+###################################################################
 # ObjectFifo Matmul tests
 ###################################################################
 
@@ -744,7 +745,7 @@ run_matmul_test \
     --tile_pipeline "pack-peel" \
     --lhs_rhs_type "i32" \
     --acc_type "i32" \
-    --m "128" --k "256" --n "128"
+    --m "32" --k "32" --n "32"
 
 run_matmul_test \
     --name_prefix "small" \
@@ -752,7 +753,16 @@ run_matmul_test \
     --tile_pipeline "pack-peel" \
     --lhs_rhs_type "i32" \
     --acc_type "i32" \
-    --m "32" --k "32" --n "32"
+    --m "128" --k "32" --n "64" \
+    --num_repeat_runs "10"
+
+run_matmul_test \
+    --name_prefix "small" \
+    --lower_to_aie_pipeline "objectFifo" \
+    --tile_pipeline "pack-peel" \
+    --lhs_rhs_type "i32" \
+    --acc_type "i32" \
+    --m "128" --k "256" --n "128"
 
 run_matmul_test \
     --name_prefix "medium" \
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDmaLoopSubsumption.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDmaLoopSubsumption.cpp
index 402661116..b6de094f7 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDmaLoopSubsumption.cpp
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDmaLoopSubsumption.cpp
@@ -34,6 +34,18 @@
 
 namespace mlir::iree_compiler::AMDAIE {
 
+/// Utility to calculate the number of iterations of a loop with provided bounds
+/// and step: `ceilDiv(upperBound - lowerBound, step)`. Returns 0 for an empty
+/// range (`upperBound <= lowerBound`) instead of asserting, so callers can
+/// reject zero-trip loops in release builds as well.
+int64_t calculateNbIterations(int64_t lowerBound, int64_t upperBound,
+                              int64_t step) {
+  assert(step > 0 && "expected positive step");
+  // The ceilDiv formula below is only valid for a positive difference.
+  if (upperBound <= lowerBound) return 0;
+  return 1 + ((upperBound - lowerBound - 1) / step);
+}
+
 // Constant specifying the number of inter-iteration dimension for DMA
 // operations.
 //
@@ -117,13 +129,23 @@ class SubsumeLoopIntoDMA
   using OpInterfaceRewritePattern::OpInterfaceRewritePattern;
 
   /// Utility to add a loop iteration to an offsets/sizes/strides access
-  /// pattern.
+  /// pattern. This function handles the following cases:
+  /// 1. If an offset which has a loop induction variable dependency can be
+  /// found, calculate the stride and size based on the dependency, potentially
+  /// taking into account an affine expression multiplier.
+  /// 2. If there is no loop induction variable dependency, the iteration means
+  /// that this strided operation is repeated `ceilDiv(upperBound - lowerBound,
+  /// step)` number of times, so a new dimension is added to the access pattern
+  /// with `stride == 0` and `size == ceilDiv(upperBound - lowerBound, step)`.
   LogicalResult addIterationToAccessPattern(
       RewriterBase &rewriter, int64_t lowerBound, int64_t upperBound,
       int64_t step, const DenseSet<Value> &inductionValues,
       SmallVector<OpFoldResult> &newOffsets,
       SmallVector<OpFoldResult> &newSizes,
       SmallVector<OpFoldResult> &newStrides) const {
+    const int64_t nbIterations =
+        calculateNbIterations(lowerBound, upperBound, step);
+
     SmallVector<OpFoldResult> insertOffsets;
     SmallVector<OpFoldResult> insertSizes;
     SmallVector<OpFoldResult> insertStrides;
@@ -159,24 +181,31 @@ class SubsumeLoopIntoDMA
 
         newOffsets[i] = getAsIndexOpFoldResult(rewriter.getContext(),
                                                lowerBound * offsetStride);
+
+        // Don't add a unit iteration to better use available dimensions.
+        // However, the current offset should be updated, therefore this check
+        // is placed after `newOffsets[i]` has been updated.
+        if (nbIterations == 1) continue;
+
         insertOffsets.push_back(
             getAsIndexOpFoldResult(rewriter.getContext(), 0));
-
-        // The step size is equal to the the number of iterations
-        // (ceilDiv(upperBound - lowerBound, step))
-        int64_t diff = upperBound - lowerBound;
-        assert(diff > 0 &&
-               "expected positive difference between upper bound and lower "
-               "bound");
-        assert(step > 0 && "expected positive step");
-        int64_t newSize = 1 + ((diff - 1) / step);
         insertSizes.push_back(
-            getAsIndexOpFoldResult(rewriter.getContext(), newSize));
-
+            getAsIndexOpFoldResult(rewriter.getContext(), nbIterations));
         insertStrides.push_back(
             getAsIndexOpFoldResult(rewriter.getContext(), stride));
       }
     }
+    assert(insertOffsets.size() == insertSizes.size() &&
+           "expected same number of offsets and sizes to be inserted");
+    assert(insertOffsets.size() == insertStrides.size() &&
+           "expected same number of offsets and strides to be inserted");
+    // Handle the 'no loop dependency' case.
+    if (insertOffsets.empty() && nbIterations != 1) {
+      insertOffsets.push_back(getAsIndexOpFoldResult(rewriter.getContext(), 0));
+      insertSizes.push_back(
+          getAsIndexOpFoldResult(rewriter.getContext(), nbIterations));
+      insertStrides.push_back(getAsIndexOpFoldResult(rewriter.getContext(), 0));
+    }
     newOffsets.insert(newOffsets.begin(), insertOffsets.begin(),
                       insertOffsets.end());
     newSizes.insert(newSizes.begin(), insertSizes.begin(), insertSizes.end());
@@ -206,40 +235,38 @@ class SubsumeLoopIntoDMA
     SmallVector<OpFoldResult> newTargetSizes = op.getTargetMixedSizes();
     SmallVector<OpFoldResult> newTargetStrides = op.getTargetMixedStrides();
 
-    // Use source/target maxNbDims to check whether there are sufficient source
-    // and target dimensions. Otherwise, abort.
-    auto verifyNbDimsNeeded = [&](const SmallVector<Value> &dynamicOffsets,
-                                  size_t nbOffsets,
-                                  size_t maxNbDims) -> LogicalResult {
-      size_t counter = 0;
-      for (Value offset : dynamicOffsets)
-        if (allInductionValues.contains(offset)) counter++;
-      if (nbOffsets + counter > maxNbDims) return failure();
-      return success();
-    };
-    SmallVector<Value> dynamicSourceOffsets = op.getSourceOffsets();
-    SmallVector<Value> dynamicTargetOffsets = op.getTargetOffsets();
-    if (failed(verifyNbDimsNeeded(dynamicSourceOffsets, newSourceOffsets.size(),
-                                  sourceMaxNbDims)))
+    // Verify number of dimensions needed to subsume this loop into the strided
+    // access pattern and fail early if there aren't enough dimensions.
+    size_t nbNonUnitIterations{0};
+    for (auto &&[lb, ub, step] : llvm::zip(lowerBounds, upperBounds, steps)) {
+      const int64_t nbIterations = calculateNbIterations(lb, ub, step);
+      // We should not do any rewrite if we encounter a loop with no iterations.
+      if (nbIterations == 0) return failure();
+      if (nbIterations > 1) nbNonUnitIterations++;
+    }
+    if (newSourceOffsets.size() + nbNonUnitIterations > sourceMaxNbDims)
       return failure();
-    if (failed(verifyNbDimsNeeded(dynamicTargetOffsets, newTargetOffsets.size(),
-                                  targetMaxNbDims)))
+    if (newTargetOffsets.size() + nbNonUnitIterations > targetMaxNbDims)
       return failure();
 
     // Add the loop iterations to the DMA access patterns.
     for (auto &&[lb, ub, step, iterationIvValues] : llvm::reverse(
              llvm::zip(lowerBounds, upperBounds, steps, inductionValues))) {
       // Add loop iteration to the access pattern on the source side.
-      if (failed(addIterationToAccessPattern(
-              rewriter, lb, ub, step, iterationIvValues, newSourceOffsets,
-              newSourceSizes, newSourceStrides))) {
-        return failure();
+      if (!newSourceOffsets.empty()) {
+        if (failed(addIterationToAccessPattern(
+                rewriter, lb, ub, step, iterationIvValues, newSourceOffsets,
+                newSourceSizes, newSourceStrides))) {
+          return failure();
+        }
       }
       // Add loop iteration to the access pattern on the target side.
-      if (failed(addIterationToAccessPattern(
-              rewriter, lb, ub, step, iterationIvValues, newTargetOffsets,
-              newTargetSizes, newTargetStrides))) {
-        return failure();
+      if (!newTargetOffsets.empty()) {
+        if (failed(addIterationToAccessPattern(
+                rewriter, lb, ub, step, iterationIvValues, newTargetOffsets,
+                newTargetSizes, newTargetStrides))) {
+          return failure();
+        }
       }
     }
 
@@ -290,11 +317,6 @@ class SubsumeLoopIntoDMA
         curIvValues.insert(userApplyOp.getResult());
       }
     }
-    if (!llvm::any_of(op->getOperands(), [&](Value operand) {
-          return curIvValues.contains(operand);
-        })) {
-      return failure();
-    }
 
     SmallVector<int64_t> lowerBounds = {lowerBound.value()};
     SmallVector<int64_t> upperBounds = {upperBound.value()};
@@ -342,13 +364,6 @@ class SubsumeLoopIntoDMA
       }
       inductionValues.push_back(curIvValues);
     }
-    // Return early if the strided operation doesn't use any of the
-    // induction variable dependent values.
-    if (!llvm::any_of(op->getOperands(), [&](Value operand) {
-          return allInductionValues.contains(operand);
-        })) {
-      return failure();
-    }
     return rewriteWithLoopLikeOpParent(op, rewriter, sourceMaxNbDims,
                                        targetMaxNbDims, lowerBounds.value(),
                                        upperBounds.value(), steps.value(),
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/dma_loop_subsumption.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/dma_loop_subsumption.mlir
index 4ee6f640d..694ecf3f4 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/dma_loop_subsumption.mlir
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/dma_loop_subsumption.mlir
@@ -4,76 +4,6 @@
 // Sanity checks for cases where no modification should happen.
 //===----------------------------------------------------------------------===//
 
-// Sanity check: ensure no modification in case of no loop depedency
-// CHECK-LABEL: @npu_dma_cpy_nd_without_loop_dependency
-// CHECK-DAG:   %[[C0:.+]] = arith.constant 0 : index
-// CHECK-DAG:   %[[C1:.+]] = arith.constant 1 : index
-// CHECK-DAG:   %[[C6:.+]] = arith.constant 6 : index
-// CHECK:       %[[CIRC_DMA:.+]] = amdaie.circular_dma_cpy_nd
-// CHECK:       amdaie.controlcode
-// CHECK:         scf.forall (%{{.+}}, %{{.+}}) in (2, 2)
-// CHECK:           scf.for %{{.+}} = %[[C0]] to %[[C6]] step %[[C1]]
-// CHECK:             %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([0, 0, 0] [1, 8, 16] [128, 16, 1], [] [] [])
-// CHECK:             amdaie.npu.dma_wait(%[[NPU_DMA]], S2MM)
-#map = affine_map<(d0) -> (d0 * 16)>
-#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}>
-module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} {
-  func.func @npu_dma_cpy_nd_without_loop_dependency(%arg0: !amdaie.logicalobjectfifo<memref<1x1x8x16xi32, 1>>, %arg1: !amdaie.logicalobjectfifo<memref<8x16xi32, 1>>) {
-    %c0 = arith.constant 0 : index
-    %c1 = arith.constant 1 : index
-    %c6 = arith.constant 6 : index
-    amdaie.workgroup {
-      %0 = amdaie.circular_dma_cpy_nd(%arg0[] [] [], %arg1[] [] []) : (!amdaie.logicalobjectfifo<memref<1x1x8x16xi32, 1>>, !amdaie.logicalobjectfifo<memref<8x16xi32, 1>>)
-      amdaie.controlcode {
-        scf.forall (%arg2, %arg3) in (2, 2) {
-          scf.for %arg4 = %c0 to %c6 step %c1 {
-            %1 = affine.apply #map(%arg4)
-            %2 = amdaie.npu.dma_cpy_nd %0([0, 0, 0] [1, 8, 16] [128, 16, 1], [] [] [])
-            amdaie.npu.dma_wait(%2, S2MM)
-          }
-        }
-        amdaie.end
-      }
-    }
-    return
-  }
-}
-
-// -----
-
-// Ensure no modification in case of a dynamic offset not originating from an induction variable.
-// CHECK-LABEL: @dynamic_non_induction_var_offset
-// CHECK-SAME:  %{{.+}}: !amdaie.logicalobjectfifo<memref<1x1x8x16xi32>>, %{{.+}}: !amdaie.logicalobjectfifo<memref<8x16xi32, 1>>, %[[ARG:.+]]: index
-// CHECK-DAG:   %[[C0:.+]] = arith.constant 0 : index
-// CHECK-DAG:   %[[C1:.+]] = arith.constant 1 : index
-// CHECK-DAG:   %[[C6:.+]] = arith.constant 6 : index
-// CHECK:       %[[CIRC_DMA:.+]] = amdaie.circular_dma_cpy_nd
-// CHECK:       amdaie.controlcode
-// CHECK:         scf.for %{{.+}} = %[[C0]] to %[[C6]] step %[[C1]]
-// CHECK:           %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([%[[ARG]]] [16] [1], [] [] [])
-// CHECK:           amdaie.npu.dma_wait(%[[NPU_DMA]], S2MM)
-#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}>
-module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} {
-  func.func @dynamic_non_induction_var_offset(%arg0: !amdaie.logicalobjectfifo<memref<1x1x8x16xi32>>, %arg1: !amdaie.logicalobjectfifo<memref<8x16xi32, 1>>, %arg2: index) {
-    %c0 = arith.constant 0 : index
-    %c1 = arith.constant 1 : index
-    %c6 = arith.constant 6 : index
-    amdaie.workgroup {
-      %0 = amdaie.circular_dma_cpy_nd(%arg0[] [] [], %arg1[] [] []) : (!amdaie.logicalobjectfifo<memref<1x1x8x16xi32>>, !amdaie.logicalobjectfifo<memref<8x16xi32, 1>>)
-      amdaie.controlcode {
-        scf.for %arg3 = %c0 to %c6 step %c1 {
-          %2 = amdaie.npu.dma_cpy_nd %0([%arg2] [16] [1], [] [] [])
-          amdaie.npu.dma_wait(%2, S2MM)
-        }
-        amdaie.end
-      }
-    }
-    return
-  }
-}
-
-// -----
-
 // Ensure no modification in case of a invalid affine expressions.
 // CHECK:       #[[$MAP:.+]] = affine_map<(d0) -> (d0 * 16)>
 // CHECK:       #[[$MAP1:.+]] = affine_map<(d0) -> (d0 * 16 + 3)>
@@ -346,6 +276,37 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb}
 
 // -----
 
+// Ensure no modification in case of an scf.forall with too many dimensions,
+// i.e. 3 existing dimensions and two loop iterators.
+// CHECK:       #[[$MAP:.+]] = affine_map<(d0) -> (d0 * 16)>
+// CHECK-LABEL: @forall_too_many_dims_target
+// CHECK:       %[[CIRC_DMA:.+]] = amdaie.circular_dma_cpy_nd
+// CHECK:       amdaie.controlcode
+// CHECK:         scf.forall (%[[ARG2:.+]], %[[ARG3:.+]]) in (2, 6)
+// CHECK:           %[[APPLY:.+]] = affine.apply #[[$MAP]](%[[ARG3]])
+// CHECK:           %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([0, 0, %[[APPLY]]] [1, 8, 16] [128, 16, 1], [] [] [])
+// CHECK:           amdaie.npu.dma_wait(%[[NPU_DMA]], S2MM)
+#map = affine_map<(d0) -> (d0 * 16)>
+#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}>
+module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} {
+  func.func @forall_too_many_dims_target(%arg0: !amdaie.logicalobjectfifo<memref<1x1x8x16xi32>>, %arg1: !amdaie.logicalobjectfifo<memref<8x16xi32, 1>>) {
+    amdaie.workgroup {
+      %0 = amdaie.circular_dma_cpy_nd(%arg0[] [] [], %arg1[] [] []) : (!amdaie.logicalobjectfifo<memref<1x1x8x16xi32>>, !amdaie.logicalobjectfifo<memref<8x16xi32, 1>>)
+      amdaie.controlcode {
+        scf.forall (%arg2, %arg3) in (2, 6) {
+          %1 = affine.apply #map(%arg3)
+          %2 = amdaie.npu.dma_cpy_nd %0([0, 0, %1] [1, 8, 16] [128, 16, 1], [] [] [])
+          amdaie.npu.dma_wait(%2, S2MM)
+        }
+        amdaie.end
+      }
+    }
+    return
+  }
+}
+
+// -----
+
 // Ensure no modification in case of multiple npu.dma_cpy_nd users with the same source in the same scope.
 // CHECK-LABEL: @for_with_multiple_npu_dma_cpy_nd_same_source
 // CHECK-DAG:   %[[C0:.+]] = arith.constant 0 : index
@@ -417,12 +378,161 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb}
   }
 }
 
+// -----
+
 //===----------------------------------------------------------------------===//
-// Checks for dependencies via `affine.apply` on both source and target sides.
+// Checks for loops with no dependencies, which should be subsumed.
 //===----------------------------------------------------------------------===//
 
+// Subsume scf.for iterations into strided op without dependency.
+// CHECK-LABEL: @for_without_loop_dependency
+// CHECK-DAG:   %[[C0:.+]] = arith.constant 0 : index
+// CHECK-DAG:   %[[C1:.+]] = arith.constant 1 : index
+// CHECK-DAG:   %[[C6:.+]] = arith.constant 6 : index
+// CHECK-DAG:   %[[C8:.+]] = arith.constant 8 : index
+// CHECK-DAG:   %[[C16:.+]] = arith.constant 16 : index
+// CHECK-DAG:   %[[C128:.+]] = arith.constant 128 : index
+// CHECK:       %[[CIRC_DMA:.+]] = amdaie.circular_dma_cpy_nd
+// CHECK:       amdaie.controlcode
+// CHECK-NOT:     scf.for
+// CHECK:         %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([%[[C0]], %[[C0]], %[[C0]], %[[C0]]] [%[[C6]], %[[C1]], %[[C8]], %[[C16]]] [%[[C0]], %[[C128]], %[[C16]], %[[C1]]], [] [] [])
+// CHECK:         amdaie.npu.dma_wait(%[[NPU_DMA]], S2MM)
+#map = affine_map<(d0) -> (d0 * 16)>
+#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}>
+module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} {
+  func.func @for_without_loop_dependency(%arg0: !amdaie.logicalobjectfifo<memref<1x1x8x16xi32, 1>>, %arg1: !amdaie.logicalobjectfifo<memref<8x16xi32, 1>>) {
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %c6 = arith.constant 6 : index
+    amdaie.workgroup {
+      %0 = amdaie.circular_dma_cpy_nd(%arg0[] [] [], %arg1[] [] []) : (!amdaie.logicalobjectfifo<memref<1x1x8x16xi32, 1>>, !amdaie.logicalobjectfifo<memref<8x16xi32, 1>>)
+      amdaie.controlcode {
+        scf.for %arg4 = %c0 to %c6 step %c1 {
+          %1 = affine.apply #map(%arg4)
+          %2 = amdaie.npu.dma_cpy_nd %0([0, 0, 0] [1, 8, 16] [128, 16, 1], [] [] [])
+          amdaie.npu.dma_wait(%2, S2MM)
+        }
+        amdaie.end
+      }
+    }
+    return
+  }
+}
+
 // -----
 
+// Subsume scf.forall iterations into strided op without dependency.
+// CHECK-LABEL: @forall_without_loop_dependency
+// CHECK-DAG:   %[[C0:.+]] = arith.constant 0 : index
+// CHECK-DAG:   %[[C1:.+]] = arith.constant 1 : index
+// CHECK-DAG:   %[[C2:.+]] = arith.constant 2 : index
+// CHECK-DAG:   %[[C8:.+]] = arith.constant 8 : index
+// CHECK-DAG:   %[[C16:.+]] = arith.constant 16 : index
+// CHECK:       %[[CIRC_DMA:.+]] = amdaie.circular_dma_cpy_nd
+// CHECK:       amdaie.controlcode
+// CHECK-NOT:     scf.forall (%{{.+}}, %{{.+}}) in (2, 2)
+// CHECK:         %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([%[[C0]], %[[C0]], %[[C0]], %[[C0]]] [%[[C2]], %[[C2]], %[[C8]], %[[C16]]] [%[[C0]], %[[C0]], %[[C16]], %[[C1]]], [] [] [])
+// CHECK:         amdaie.npu.dma_wait(%[[NPU_DMA]], S2MM)
+#map = affine_map<(d0) -> (d0 * 16)>
+#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}>
+module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} {
+  func.func @forall_without_loop_dependency(%arg0: !amdaie.logicalobjectfifo<memref<1x1x8x16xi32, 1>>, %arg1: !amdaie.logicalobjectfifo<memref<8x16xi32, 1>>) {
+    amdaie.workgroup {
+      %0 = amdaie.circular_dma_cpy_nd(%arg0[] [] [], %arg1[] [] []) : (!amdaie.logicalobjectfifo<memref<1x1x8x16xi32, 1>>, !amdaie.logicalobjectfifo<memref<8x16xi32, 1>>)
+      amdaie.controlcode {
+        scf.forall (%arg2, %arg3) in (2, 2) {
+          %1 = affine.apply #map(%arg2)
+          %2 = amdaie.npu.dma_cpy_nd %0([0, 0] [8, 16] [16, 1], [] [] [])
+          amdaie.npu.dma_wait(%2, S2MM)
+        }
+        amdaie.end
+      }
+    }
+    return
+  }
+}
+
+// -----
+
+// Subsume nested loop iterations into strided op without dependency.
+// CHECK-LABEL: @nested_without_loop_dependency
+// CHECK-DAG:   %[[C0:.+]] = arith.constant 0 : index
+// CHECK-DAG:   %[[C1:.+]] = arith.constant 1 : index
+// CHECK-DAG:   %[[C2:.+]] = arith.constant 2 : index
+// CHECK-DAG:   %[[C3:.+]] = arith.constant 3 : index
+// CHECK-DAG:   %[[C6:.+]] = arith.constant 6 : index
+// CHECK-DAG:   %[[C16:.+]] = arith.constant 16 : index
+// CHECK:       %[[CIRC_DMA:.+]] = amdaie.circular_dma_cpy_nd
+// CHECK:       amdaie.controlcode
+// CHECK-NOT:     scf.forall
+// CHECK-NOT:     scf.for
+// CHECK:         %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([%[[C0]], %[[C0]], %[[C0]], %[[C0]]] [%[[C2]], %[[C3]], %[[C6]], %[[C16]]] [%[[C0]], %[[C0]], %[[C0]], %[[C1]]], [] [] [])
+// CHECK:         amdaie.npu.dma_wait(%[[NPU_DMA]], S2MM)
+#map = affine_map<(d0) -> (d0 * 16)>
+#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}>
+module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} {
+  func.func @nested_without_loop_dependency(%arg0: !amdaie.logicalobjectfifo<memref<1x1x8x16xi32, 1>>, %arg1: !amdaie.logicalobjectfifo<memref<8x16xi32, 1>>) {
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %c6 = arith.constant 6 : index
+    amdaie.workgroup {
+      %0 = amdaie.circular_dma_cpy_nd(%arg0[] [] [], %arg1[] [] []) : (!amdaie.logicalobjectfifo<memref<1x1x8x16xi32, 1>>, !amdaie.logicalobjectfifo<memref<8x16xi32, 1>>)
+      amdaie.controlcode {
+        scf.forall (%arg2, %arg3) in (2, 3) {
+          scf.for %arg4 = %c0 to %c6 step %c1 {
+            %1 = affine.apply #map(%arg4)
+            %2 = amdaie.npu.dma_cpy_nd %0([0] [16] [1], [] [] [])
+            amdaie.npu.dma_wait(%2, S2MM)
+          }
+        }
+        amdaie.end
+      }
+    }
+    return
+  }
+}
+
+// -----
+
+// Subsume loop into strided op in case of a dynamic offset not originating from
+// an induction variable.
+// CHECK-LABEL: @dynamic_non_induction_var_offset
+// CHECK-SAME:  %{{.+}}: !amdaie.logicalobjectfifo<memref<1x1x8x16xi32>>, %{{.+}}: !amdaie.logicalobjectfifo<memref<8x16xi32, 1>>, %[[ARG:.+]]: index
+// CHECK-DAG:   %[[C0:.+]] = arith.constant 0 : index
+// CHECK-DAG:   %[[C1:.+]] = arith.constant 1 : index
+// CHECK-DAG:   %[[C6:.+]] = arith.constant 6 : index
+// CHECK-DAG:   %[[C16:.+]] = arith.constant 16 : index
+// CHECK:       %[[CIRC_DMA:.+]] = amdaie.circular_dma_cpy_nd
+// CHECK:       amdaie.controlcode
+// CHECK-NOT:     scf.for
+// CHECK:         %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([%[[C0]], %[[ARG]]] [%[[C6]], %[[C16]]] [%[[C0]], %[[C1]]], [] [] [])
+// CHECK:         amdaie.npu.dma_wait(%[[NPU_DMA]], S2MM)
+#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}>
+module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} {
+  func.func @dynamic_non_induction_var_offset(%arg0: !amdaie.logicalobjectfifo<memref<1x1x8x16xi32>>, %arg1: !amdaie.logicalobjectfifo<memref<8x16xi32, 1>>, %arg2: index) {
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %c6 = arith.constant 6 : index
+    amdaie.workgroup {
+      %0 = amdaie.circular_dma_cpy_nd(%arg0[] [] [], %arg1[] [] []) : (!amdaie.logicalobjectfifo<memref<1x1x8x16xi32>>, !amdaie.logicalobjectfifo<memref<8x16xi32, 1>>)
+      amdaie.controlcode {
+        scf.for %arg3 = %c0 to %c6 step %c1 {
+          %2 = amdaie.npu.dma_cpy_nd %0([%arg2] [16] [1], [] [] [])
+          amdaie.npu.dma_wait(%2, S2MM)
+        }
+        amdaie.end
+      }
+    }
+    return
+  }
+}
+
+// -----
+
+//===----------------------------------------------------------------------===//
+// Checks for dependencies via `affine.apply` on both source and target sides.
+//===----------------------------------------------------------------------===//
+
 // Check that loop subsumption happens in case of an identity affine expression.
 // CHECK-LABEL: @identity_affine_expr
 // CHECK-DAG:   %[[C0:.+]] = arith.constant 0 : index
@@ -495,14 +605,14 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb}
 // CHECK-LABEL: @forall_dependency_on_target
 // CHECK-DAG:   %[[C0:.+]] = arith.constant 0 : index
 // CHECK-DAG:   %[[C1:.+]] = arith.constant 1 : index
+// CHECK-DAG:   %[[C2:.+]] = arith.constant 2 : index
 // CHECK-DAG:   %[[C6:.+]] = arith.constant 6 : index
 // CHECK-DAG:   %[[C8:.+]] = arith.constant 8 : index
 // CHECK-DAG:   %[[C16:.+]] = arith.constant 16 : index
-// CHECK-DAG:   %[[C128:.+]] = arith.constant 128 : index
 // CHECK:       %[[CIRC_DMA:.+]] = amdaie.circular_dma_cpy_nd
 // CHECK:       amdaie.controlcode
 // CHECK-NOT:   scf.forall
-// CHECK:       %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([%[[C0]], %[[C0]], %[[C0]], %[[C0]]] [%[[C6]], %[[C1]], %[[C8]], %[[C16]]] [%[[C16]], %[[C128]], %[[C16]], %[[C1]]], [] [] [])
+// CHECK:       %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([%[[C0]], %[[C0]], %[[C0]], %[[C0]]] [%[[C2]], %[[C6]], %[[C8]], %[[C16]]] [%[[C0]], %[[C16]], %[[C16]], %[[C1]]], [] [] [])
 // CHECK:       amdaie.npu.dma_wait(%[[NPU_DMA]], S2MM)
 #map = affine_map<(d0) -> (16 * d0)>
 #executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}>
@@ -513,7 +623,7 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb}
       amdaie.controlcode {
         scf.forall (%arg2, %arg3) in (2, 6) {
           %1 = affine.apply #map(%arg3)
-          %2 = amdaie.npu.dma_cpy_nd %0([0, 0, %1] [1, 8, 16] [128, 16, 1], [] [] [])
+          %2 = amdaie.npu.dma_cpy_nd %0([0, %1] [8, 16] [16, 1], [] [] [])
           amdaie.npu.dma_wait(%2, S2MM)
         }
         amdaie.end
@@ -564,14 +674,14 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb}
 // CHECK-LABEL: @forall_dependency_on_source
 // CHECK-DAG:   %[[C0:.+]] = arith.constant 0 : index
 // CHECK-DAG:   %[[C1:.+]] = arith.constant 1 : index
+// CHECK-DAG:   %[[C2:.+]] = arith.constant 2 : index
 // CHECK-DAG:   %[[C6:.+]] = arith.constant 6 : index
 // CHECK-DAG:   %[[C8:.+]] = arith.constant 8 : index
 // CHECK-DAG:   %[[C16:.+]] = arith.constant 16 : index
-// CHECK-DAG:   %[[C128:.+]] = arith.constant 128 : index
 // CHECK:       %[[CIRC_DMA:.+]] = amdaie.circular_dma_cpy_nd
 // CHECK:       amdaie.controlcode
 // CHECK-NOT:   scf.forall
-// CHECK:       %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([] [] [], [%[[C0]], %[[C0]], %[[C0]], %[[C0]]] [%[[C6]], %[[C1]], %[[C8]], %[[C16]]] [%[[C16]], %[[C128]], %[[C16]], %[[C1]]])
+// CHECK:       %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([] [] [], [%[[C0]], %[[C0]], %[[C0]], %[[C0]]] [%[[C2]], %[[C6]], %[[C8]], %[[C16]]] [%[[C0]], %[[C16]], %[[C16]], %[[C1]]])
 // CHECK:       amdaie.npu.dma_wait(%[[NPU_DMA]], S2MM)
 #map = affine_map<(d0) -> (d0 * 16)>
 #executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}>
@@ -582,7 +692,7 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb}
       amdaie.controlcode {
         scf.forall (%arg2, %arg3) in (2, 6) {
           %1 = affine.apply #map(%arg3)
-          %2 = amdaie.npu.dma_cpy_nd %0([] [] [], [0, 0, %1] [1, 8, 16] [128, 16, 1])
+          %2 = amdaie.npu.dma_cpy_nd %0([] [] [], [0, %1] [8, 16] [16, 1])
           amdaie.npu.dma_wait(%2, S2MM)
         }
         amdaie.end
@@ -753,16 +863,16 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb}
 // CHECK-LABEL: @nested_dependencies
 // CHECK-DAG:   %[[C0:.+]] = arith.constant 0 : index
 // CHECK-DAG:   %[[C1:.+]] = arith.constant 1 : index
+// CHECK-DAG:   %[[C2:.+]] = arith.constant 2 : index
 // CHECK-DAG:   %[[C3:.+]] = arith.constant 3 : index
 // CHECK-DAG:   %[[C6:.+]] = arith.constant 6 : index
 // CHECK-DAG:   %[[C8:.+]] = arith.constant 8 : index
-// CHECK-DAG:   %[[C16:.+]] = arith.constant 16 : index
 // CHECK-DAG:   %[[C32:.+]] = arith.constant 32 : index
 // CHECK:       %[[CIRC_DMA:.+]] = amdaie.circular_dma_cpy_nd
 // CHECK:       amdaie.controlcode
 // CHECK-NOT:   scf.forall
 // CHECK-NOT:   scf.for
-// CHECK:       %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([%[[C0]], %[[C0]], %[[C1]], %[[C0]]] [%[[C6]], %[[C3]], %[[C16]], %[[C8]]] [%[[C32]], %[[C32]], %[[C16]], %[[C1]]], [] [] [])
+// CHECK:       %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([%[[C0]], %[[C0]], %[[C0]], %[[C0]]] [%[[C2]], %[[C6]], %[[C3]], %[[C8]]] [%[[C0]], %[[C32]], %[[C0]], %[[C1]]], [] [] [])
 // CHECK:       amdaie.npu.dma_wait(%[[NPU_DMA]], S2MM)
 #map = affine_map<(d0) -> (d0 * 16)>
 #map1 = affine_map<(d0) -> (d0 * 32)>
@@ -779,7 +889,7 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb}
           %1 = affine.apply #map(%arg2)
           %2 = affine.apply #map1(%arg3)
           scf.for %arg4 = %c1 to %c6 step %c2 {
-            %3 = amdaie.npu.dma_cpy_nd %0([%arg4, %2] [16, 8] [16, 1], [] [] [])
+            %3 = amdaie.npu.dma_cpy_nd %0([%2] [8] [1], [] [] [])
             amdaie.npu.dma_wait(%3, S2MM)
           }
         }
diff --git a/tests/samples/matmul_peeled_objectfifo.mlir b/tests/samples/matmul_peeled_objectfifo.mlir
index 7651e484b..c3c9e6d93 100644
--- a/tests/samples/matmul_peeled_objectfifo.mlir
+++ b/tests/samples/matmul_peeled_objectfifo.mlir
@@ -10,10 +10,8 @@
 // CHECK-DAG:   aie.core(%[[TILE_1_2]])
 // CHECK:         aie.objectfifo.acquire @[[OBJ1]](Produce, 1)
 // CHECK:       func.func @matmul_i32(%[[ARG0:.+]]: memref<32x1024xi32>, %[[ARG1:.+]]: memref<1024x64xi32>, %[[ARG2:.+]]: memref<32x64xi32>)
-// CHECK-DAG:     aiex.npu.dma_memcpy_nd
-// CHECK-SAME:    %[[ARG0]]
-// CHECK-DAG:     aiex.npu.dma_memcpy_nd
-// CHECK-SAME:    %[[ARG1]]
+// CHECK-DAG:     aiex.npu.dma_memcpy_nd(0, 0, %[[ARG0]][0, 0, 0, 0][1, 1, 32, 64][0, 0, 1024, 1]
+// CHECK-DAG:     aiex.npu.dma_memcpy_nd(0, 0, %[[ARG1]][0, 0, 0, 0][1, 1, 64, 32][0, 0, 64, 1]
 // CHECK-DAG:     aiex.npu.dma_memcpy_nd(0, 0, %[[ARG2]][0, 0, 0, 0][1, 1, 32, 32][0, 0, 64, 1]
 // CHECK-DAG:     aiex.npu.dma_memcpy_nd(0, 0, %[[ARG2]][0, 0, 0, 32][1, 1, 32, 32][0, 0, 64, 1]
 #map = affine_map<(d0) -> (d0 * 32)>