diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.cpp index be241be1c..ac0cfb3cc 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.cpp @@ -1101,7 +1101,7 @@ void NpuHalfDmaCpyNdOp::build(OpBuilder &b, OperationState &result, Value input, ArrayRef offsets, ArrayRef sizes, ArrayRef strides, Value bdId, - Value channel, bool useNextBd, Value nextBd, + Value channel, BoolAttr useNextBd, Value nextBd, Value startBd) { SmallVector staticOffsets, staticSizes, staticStrides; SmallVector dynamicOffsets, dynamicSizes, dynamicStrides; @@ -1119,7 +1119,7 @@ void NpuHalfDmaCpyNdOp::build(OpBuilder &b, OperationState &result, Value input, ArrayRef offsets, ArrayRef sizes, ArrayRef strides, mlir::Value bdId, - Value channel, bool useNextBd, Value nextBd, + Value channel, BoolAttr useNextBd, Value nextBd, Value startBd) { SmallVector offsetValues = llvm::to_vector<4>(llvm::map_range( offsets, @@ -1140,7 +1140,7 @@ void NpuHalfDmaCpyNdOp::build(OpBuilder &b, OperationState &result, TypeRange resultTypes, Value connection, Value input, ValueRange offsets, ValueRange sizes, ValueRange strides, mlir::Value bdId, - Value channel, bool useNextBd, Value nextBd, + Value channel, BoolAttr useNextBd, Value nextBd, Value startBd) { SmallVector offsetValues = llvm::to_vector<4>( llvm::map_range(offsets, [](Value v) -> OpFoldResult { return v; })); diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.td b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.td index d75bd8341..5e26c9a68 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.td +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.td @@ -595,10 +595,15 @@ def AMDAIE_NpuHalfDmaCpyNdOp `next_bd`, and `start_bd` operands. 
The `use_next_bd` operand indicates whether another DMA operation is chained to follow this one. If `use_next_bd` is `true`, the `next_bd` operand specifies the BD ID of - the next DMA operation in the chain. Within a chain, the `start_bd` operand - identifies the BD ID of the first DMA operation in the sequence. - When `use_next_bd` is `false`, the `start_bd` is set to the same value as `bd_id`. - + the next DMA operation in the chain. + + The `start_bd` operand specifies the BD ID of the first DMA operation in a sequence. + - If `start_bd` is the same as `bd_id`, it marks the start of a chain. + - If `start_bd` differs from `bd_id` and `use_next_bd` is `true`, it represents + an intermediate operation in the chain. + - If `start_bd` differs from `bd_id` and `use_next_bd` is `false`, it represents + the end of the chain. + Example: ```mlir @@ -629,7 +634,7 @@ def AMDAIE_NpuHalfDmaCpyNdOp DenseI64ArrayAttr:$static_strides, Optional:$bd_id, Optional:$channel, - BoolAttr:$use_next_bd, + OptionalAttr:$use_next_bd, Optional:$next_bd, Optional:$start_bd ); @@ -646,7 +651,7 @@ def AMDAIE_NpuHalfDmaCpyNdOp custom($strides, $static_strides) (`bd_id` `=` $bd_id^)? (`channel` `=` $channel^)? - `use_next_bd` `=` $use_next_bd + (`use_next_bd` `=` $use_next_bd^)? (`next_bd` `=` $next_bd^)? (`start_bd` `=` $start_bd^)? `)` @@ -660,18 +665,18 @@ def AMDAIE_NpuHalfDmaCpyNdOp "::mlir::Value":$input, "ArrayRef":$offsets, "ArrayRef":$sizes, "ArrayRef":$strides, "::mlir::Value":$bd_id, "::mlir::Value":$channel, - "bool":$use_next_bd, "::mlir::Value":$next_bd, "::mlir::Value":$start_bd)>, + "::mlir::BoolAttr":$use_next_bd, "::mlir::Value":$next_bd, "::mlir::Value":$start_bd)>, // Build a NpuHalfDmaCpyNdOp with static entries. 
OpBuilder<(ins "::mlir::TypeRange":$result_types, "Value":$connection, "::mlir::Value":$target, "ArrayRef":$offsets, "ArrayRef":$sizes, "ArrayRef":$strides, "::mlir::Value":$bd_id, "::mlir::Value":$channel, - "bool":$use_next_bd, "::mlir::Value":$next_bd, "::mlir::Value":$start_bd)>, + "::mlir::BoolAttr":$use_next_bd, "::mlir::Value":$next_bd, "::mlir::Value":$start_bd)>, // Build a NpuHalfDmaCpyNdOp with dynamic entries. OpBuilder<(ins "::mlir::TypeRange":$result_types, "Value":$connection, "::mlir::Value":$input, "ValueRange":$offsets, "ValueRange":$sizes, "ValueRange":$strides, "::mlir::Value":$bd_id, "::mlir::Value":$channel, - "bool":$use_next_bd, "::mlir::Value":$next_bd, "::mlir::Value":$start_bd)>, + "::mlir::BoolAttr":$use_next_bd, "::mlir::Value":$next_bd, "::mlir::Value":$start_bd)>, ]; let extraClassDeclaration = [{ @@ -687,6 +692,7 @@ def AMDAIE_NpuHalfDmaCpyNdOp } std::optional getBdIdOp() { + if (!getBdId()) return std::nullopt; return dyn_cast_if_present(getBdId().getDefiningOp()); } diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/test/roundtrip.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/test/roundtrip.mlir index 059bccaab..1e9d82ec5 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/test/roundtrip.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/test/roundtrip.mlir @@ -397,6 +397,7 @@ func.func @npu_dma_cpy_nd_all_operands(%arg0: !amdaie.logicalobjectfifo>, %arg1: !amdaie.logicalobjectfifo>) { @@ -404,18 +405,21 @@ func.func @npu_half_dma_cpy_nd(%arg0: !amdaie.logicalobjectfifo %c1 = arith.constant 1 : index %tile_0_0 = amdaie.tile(%c0, %c0) %bd_id = amdaie.bd_id(%tile_0_0, %c0) + %bd_id_1 = amdaie.bd_id(%tile_0_0, %c1) %channel = amdaie.channel(%tile_0_0, 0, port_type = DMA, direction = S2MM) %0 = amdaie.connection(%arg0, %arg1) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) -// CHECK: amdaie.npu.half_dma_cpy_nd %[[CONNECTION_0]](%[[ARG0]] [] [] [] use_next_bd = false) : 
!amdaie.logicalobjectfifo> - amdaie.npu.half_dma_cpy_nd %0(%arg0[] [] [] use_next_bd = false) : !amdaie.logicalobjectfifo> -// CHECK: %{{.+}} = amdaie.npu.half_dma_cpy_nd async %[[CONNECTION_0]](%[[ARG0]] [] [] [] use_next_bd = false) : !amdaie.logicalobjectfifo> - amdaie.npu.half_dma_cpy_nd async %0(%arg0[] [] [] use_next_bd = false) : !amdaie.logicalobjectfifo> -// CHECK: amdaie.npu.half_dma_cpy_nd %[[CONNECTION_0]](%[[ARG0]] [0] [1024] [1] bd_id = %[[BD_ID]] use_next_bd = false) : !amdaie.logicalobjectfifo> - amdaie.npu.half_dma_cpy_nd %0(%arg0[0] [1024] [1] bd_id = %bd_id use_next_bd = false) : !amdaie.logicalobjectfifo> -// CHECK: amdaie.npu.half_dma_cpy_nd %[[CONNECTION_0]](%[[ARG0]] [%[[C0]], 0] [%[[C0]], 64] [%[[C0]], 1] channel = %[[CHANNEL]] use_next_bd = false) : !amdaie.logicalobjectfifo> - amdaie.npu.half_dma_cpy_nd %0(%arg0[%c0, 0] [%c0, 64] [%c0, 1] channel = %channel use_next_bd = false) : !amdaie.logicalobjectfifo> +// CHECK: amdaie.npu.half_dma_cpy_nd %[[CONNECTION_0]](%[[ARG0]] [] [] []) : !amdaie.logicalobjectfifo> + amdaie.npu.half_dma_cpy_nd %0(%arg0[] [] []) : !amdaie.logicalobjectfifo> +// CHECK: %{{.+}} = amdaie.npu.half_dma_cpy_nd async %[[CONNECTION_0]](%[[ARG0]] [] [] []) : !amdaie.logicalobjectfifo> + amdaie.npu.half_dma_cpy_nd async %0(%arg0[] [] []) : !amdaie.logicalobjectfifo> +// CHECK: amdaie.npu.half_dma_cpy_nd %[[CONNECTION_0]](%[[ARG0]] [0] [1024] [1] bd_id = %[[BD_ID]]) : !amdaie.logicalobjectfifo> + amdaie.npu.half_dma_cpy_nd %0(%arg0[0] [1024] [1] bd_id = %bd_id) : !amdaie.logicalobjectfifo> +// CHECK: amdaie.npu.half_dma_cpy_nd %[[CONNECTION_0]](%[[ARG0]] [%[[C0]], 0] [%[[C0]], 64] [%[[C0]], 1] channel = %[[CHANNEL]]) : !amdaie.logicalobjectfifo> + amdaie.npu.half_dma_cpy_nd %0(%arg0[%c0, 0] [%c0, 64] [%c0, 1] channel = %channel) : !amdaie.logicalobjectfifo> // CHECK: amdaie.npu.half_dma_cpy_nd %[[CONNECTION_0]](%[[ARG0]] [] [] [] bd_id = %[[BD_ID]] channel = %[[CHANNEL]] use_next_bd = false) : !amdaie.logicalobjectfifo> 
amdaie.npu.half_dma_cpy_nd %0(%arg0[] [] [] bd_id = %bd_id channel = %channel use_next_bd = false) : !amdaie.logicalobjectfifo> +// CHECK: amdaie.npu.half_dma_cpy_nd %[[CONNECTION_0]](%[[ARG0]] [] [] [] bd_id = %[[BD_ID]] channel = %[[CHANNEL]] use_next_bd = true next_bd = %[[BD_ID_1]] start_bd = %[[BD_ID]]) : !amdaie.logicalobjectfifo> + amdaie.npu.half_dma_cpy_nd %0(%arg0[] [] [] bd_id = %bd_id channel = %channel use_next_bd = true next_bd = %bd_id_1 start_bd = %bd_id) : !amdaie.logicalobjectfifo> return } diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEControlCodeLowering.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEControlCodeLowering.cpp index 6c25389a6..e55d413fd 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEControlCodeLowering.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEControlCodeLowering.cpp @@ -109,7 +109,7 @@ struct HalfDmaCpyNdToNpuConverter final staticStrides.insert(staticStrides.begin(), numIntraAddrDim - staticStrides.size(), 0); - bool useNextBd = op.getUseNextBd(); + bool useNextBd = op.getUseNextBd().value_or(false); int32_t nextBd{0}; if (useNextBd) { std::optional nextBdIdOp = op.getNextBdIdOp(); @@ -216,19 +216,18 @@ struct HalfDmaCpyNdToNpuConverter final if (failed(npuPushToQueueOp)) return failure(); rewriter.replaceOp(op, *npuPushToQueueOp); - bool useNextBd = op.getUseNextBd(); - if (useNextBd) - // Erase if not end of chain. + bool useNextBd = op.getUseNextBd().value_or(false); + if (useNextBd) { + // `useNextBd` is true, so either at the beginning or middle of a chain. + // No need to push to the queue, just erase the op. rewriter.eraseOp(*npuPushToQueueOp); - else { + } else { std::optional maybeStartBdIdOp = op.getStartBdIdOp(); if (maybeStartBdIdOp) { - // Update the BD ID with the start of the chain. 
- uint32_t startBdId = - getConstantIndexOrAssert(maybeStartBdIdOp.value().getValue()); - uint32_t bdId = - getConstantIndexOrAssert(maybeBdIdOp.value().getValue()); - if (startBdId != bdId) npuPushToQueueOp->setBdId(startBdId); + // Update with the BD ID at the start of the chain. + AMDAIE::BdIdOp startBdIdOp = maybeStartBdIdOp.value(); + uint32_t startBdId = getConstantIndexOrAssert(startBdIdOp.getValue()); + npuPushToQueueOp->setBdId(startBdId); } } return success(); diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDmaComposition.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDmaComposition.cpp index 46e93f88a..beec230f9 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDmaComposition.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDmaComposition.cpp @@ -67,12 +67,6 @@ void AMDAIEDmaCompositionPass::runOnOperation() { "after strided op composition"; return signalPassFailure(); } - - if (failed(moveNpuSourceDmaSyncAfterTargetDmaCpy(rewriter, parentOp))) { - parentOp->emitOpError() - << "failed to move source DMA sync after target DMA copy"; - return signalPassFailure(); - } } } // namespace diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEInsertDmaBdChain.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEInsertDmaBdChain.cpp index 1ec0608ee..11ddd5b60 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEInsertDmaBdChain.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEInsertDmaBdChain.cpp @@ -5,9 +5,9 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception #include "iree-amd-aie/IR/AMDAIEOps.h" -#include "iree-amd-aie/Transforms/AMDAIEDmaUtils.h" -#include "iree-amd-aie/Transforms/AMDAIEUtils.h" #include "iree-amd-aie/Transforms/Passes.h" +#include "iree-amd-aie/Transforms/Utils/AMDAIEDmaUtils.h" +#include "iree-amd-aie/Transforms/Utils/AMDAIEUtils.h" #include 
"iree-amd-aie/aie_runtime/Utils/ChannelBdIdGenerator.h" #include "iree-amd-aie/aie_runtime/iree_aie_runtime.h" #include "mlir/IR/Iterators.h" @@ -17,30 +17,88 @@ namespace mlir::iree_compiler::AMDAIE { namespace { -LogicalResult insertDmaBdChain(AMDAIE::AMDAIEDeviceModel deviceModel, - AMDAIE::WorkgroupOp workgroupOp) { - IRRewriter rewriter(workgroupOp->getContext()); - - // TODO(Zhewen): to get rid of tileArgIdxToAssignedBdIdOps and - // tileArgIdxToDmaCount, integrate BD ID assignment and (partial) control code - // loop unrolling into this pass. - - // BD ID that are currenly assigned to DMA operations - DenseMap, SmallVector> - tileArgIdxToAssignedBdIdOps; - // Counter for the number of DMA operations, helping determine the dependency - DenseMap, uint32_t> tileArgIdxToDmaCount; +using TileConnect = std::pair; + +/// Utility function to update `use_next_bd`, `next_bd` and `start_bd` operands. +void updateChainOperands(IRRewriter &rewriter, + SmallVector &dmaChain) { + if (dmaChain.size() < 2) return; + + // Chain the DMA ops. + Value startBdId = dmaChain[0].getBdId(); + for (unsigned i = 0; i < dmaChain.size() - 1; ++i) { + AMDAIE::NpuHalfDmaCpyNdOp currDmaOp = dmaChain[i]; + Value nextBd = dmaChain[i + 1].getBdId(); + BoolAttr useNextBd = rewriter.getBoolAttr(true); + // No token is produced at the beginning or middle of a chain. + TypeRange token = TypeRange{}; + rewriter.setInsertionPointAfter(currDmaOp); + rewriter.create( + currDmaOp.getLoc(), token, currDmaOp.getConnection(), + currDmaOp.getInput(), currDmaOp.getMixedOffsets(), + currDmaOp.getMixedSizes(), currDmaOp.getMixedStrides(), + currDmaOp.getBdId(), currDmaOp.getChannel(), useNextBd, nextBd, + startBdId); + for (auto &use : currDmaOp->getUses()) { + rewriter.eraseOp(use.getOwner()); + } + rewriter.eraseOp(currDmaOp); + } + // Last DMA op in the chain. 
+ AMDAIE::NpuHalfDmaCpyNdOp lastDmaOp = dmaChain.back(); + Value nextBd = nullptr; + BoolAttr useNextBd = rewriter.getBoolAttr(false); + rewriter.setInsertionPointAfter(lastDmaOp); + auto lastDmaOpChained = rewriter.create( + lastDmaOp.getLoc(), lastDmaOp.getResultTypes(), lastDmaOp.getConnection(), + lastDmaOp.getInput(), lastDmaOp.getMixedOffsets(), + lastDmaOp.getMixedSizes(), lastDmaOp.getMixedStrides(), + lastDmaOp.getBdId(), lastDmaOp.getChannel(), useNextBd, nextBd, + startBdId); + rewriter.replaceOp(lastDmaOp, lastDmaOpChained.getResults()); +} - // Last DMA operation encountered, no matter if it is chained or not - DenseMap, AMDAIE::NpuHalfDmaCpyNdOp> - tileArgIdxToLastDmaOp; - // Last DMA operation that has been chained - DenseMap, AMDAIE::NpuHalfDmaCpyNdOp> - tileArgIdxToLastChainedDmaOp; - // Black list of tile argument index pairs that should not be chained - SmallVector> tileArgIdxsBlackList; +/// Utility function to determine if chains can grow further +/// or require breaking. +/// +/// Example: +/// - Chain X currently holds BD IDs: [4, 5, 6, 7] +/// - Chain Y currently holds BD IDs: [0, 1, 2, 3] +/// - A new BD ID (0) needs to be added to the front (due to reverse +/// traversing) of chain X. +/// +/// Conflict resolution: +/// - Chain Y must be broken because BD ID 0 is already assigned to it +/// and must be released. +/// - Chain X is also broken to prevent the new added BD ID (0) from +/// invalidating chain Y. +/// +/// Result: +/// - Break both chains X and Y. +/// - Chain X: [0] (the newly added BD ID). +/// - Chain Y: [] (emptied after breaking). +void canChainGrowFurther( + const uint32_t bdId, const TileConnect &currTileConnect, + const DenseMap> &tileConnectToBdIds, + SmallVector &chainsToBreak) { + for (auto &[entry, bdIds] : tileConnectToBdIds) { + if (entry.first == currTileConnect.first && + llvm::is_contained(bdIds, bdId)) { + // Break the chain that contains the duplicate BD ID. 
+ chainsToBreak.push_back(entry); + if (entry != currTileConnect) { + // Break the current chain as well. + chainsToBreak.push_back(currTileConnect); + } + break; + } + } +} - AMDAIE::ControlCodeOp controlCodeOp = workgroupOp.getControlCode(); +/// Traverse the control code in reverse order to create DMA BD chains. +LogicalResult insertDmaBdChain(const AMDAIE::AMDAIEDeviceModel &deviceModel, + AMDAIE::ControlCodeOp controlCodeOp) { + IRRewriter rewriter(controlCodeOp->getContext()); // Move all BdIdOps to the beginning of the control code. // This is to avoid dominance issues when chaining BD IDs. @@ -55,254 +113,106 @@ LogicalResult insertDmaBdChain(AMDAIE::AMDAIEDeviceModel deviceModel, op->moveBefore(&controlCodeOp.front()); } - // Find `NpuHalfDmaCpyNdOp` operations and chain BD IDs. - res = controlCodeOp->walk([&](Operation *op) { - if (auto npuHalfDmaCpyNdOp = dyn_cast(op)) { - // not shim, no need to chain, since it will be earsed when lowering to - // NPU instructions - if (npuHalfDmaCpyNdOp.getMemorySpaceAsUInt() != 0) { - return WalkResult::advance(); - } - - bool chaining = true; - // packet mode is enabled, do not chain BDs - std::optional maybeConnectionOp = - npuHalfDmaCpyNdOp.getConnectionOp(); - if (!maybeConnectionOp) { - npuHalfDmaCpyNdOp.emitOpError() - << "expected to operate on an `amdaie.connection`"; - return WalkResult::interrupt(); - } - std::optional maybeFlowOp = - maybeConnectionOp->getFlowOp(); - if (!maybeFlowOp) { - maybeConnectionOp->emitOpError() - << "expected to operate on an `amdaie.flow`"; - return WalkResult::interrupt(); - } - bool enablePacket = maybeFlowOp->getIsPacketFlow(); - if (enablePacket) { - chaining = false; - } - - // repeat count > 1, do not chain BDs - int32_t repeatCount = 1; - uint8_t numIntraAddrDim = deviceModel.getDmaProp( - AMDAIE::AMDAIETileType::SHIMNOC, AMDAIE::AMDAIEDmaProp::NumAddrDim); - uint8_t numAddrDim = numIntraAddrDim + kAMDAIEDmaNbInterDims; - auto sizes = npuHalfDmaCpyNdOp.getMixedSizes(); - 
auto strides = npuHalfDmaCpyNdOp.getMixedStrides(); - if (!sizes.empty() && !strides.empty()) { - int64_t size = getConstantIndexOrAssert(sizes[0]); - int64_t stride = getConstantIndexOrAssert(strides[0]); - if (sizes.size() == numAddrDim || stride == 0) { - repeatCount = size; - } - } - if (repeatCount > 1) { - chaining = false; - } - - // get current BD ID and tile - std::optional maybeBdIdOp = npuHalfDmaCpyNdOp.getBdIdOp(); - if (!maybeBdIdOp) { - npuHalfDmaCpyNdOp.emitOpError() << "must have a BD ID op"; - return WalkResult::interrupt(); - } - AMDAIE::BdIdOp bdIdOp = maybeBdIdOp.value(); - AMDAIE::TileOp tileOp = - dyn_cast_if_present(bdIdOp.getTile().getDefiningOp()); - if (!tileOp) { - bdIdOp.emitOpError() << "must operate on an `amdaie.tile`"; - return WalkResult::interrupt(); - } - - // get arg index - auto logicalObjFifo = - dyn_cast_if_present( - npuHalfDmaCpyNdOp.getInput().getDefiningOp()); - if (!logicalObjFifo) { - npuHalfDmaCpyNdOp.emitOpError() - << "expected input to be an " - "`amdaie.logicalobjectfifo.from_memref`"; - return WalkResult::interrupt(); - } - auto subspanOp = - dyn_cast_if_present( - logicalObjFifo.getMemref().getDefiningOp()); - if (!subspanOp) { - logicalObjFifo.emitOpError() - << "must operate on an `hal.interface.binding.subspan`"; - return WalkResult::interrupt(); - } - uint32_t argIdx = subspanOp.getBinding().getZExtValue(); - - // If the current DMA operation was previously part of the outer loop in - // the control code, force all DMA operations in the inner loop to be - // synchronized, by adding them to the black list. - tileArgIdxToDmaCount[{tileOp, argIdx}]++; - for (auto &[pair, count] : tileArgIdxToDmaCount) { - if (pair.first == tileOp && - count > tileArgIdxToDmaCount[{tileOp, argIdx}] + 1) { - if (!llvm::is_contained(tileArgIdxsBlackList, pair)) { - tileArgIdxsBlackList.push_back(pair); + // BD ID that are have been assigned in each tile. + DenseMap> tileConnectToBdIds; + // Buffers the DMA ops that will be chained. 
+ DenseMap> + tileConnectToDmaChain; + + res = controlCodeOp->walk( + [&](Operation *op) { + if (auto npuHalfDmaCpyNdOp = dyn_cast(op)) { + // Not shim, will be erased at ControlCodeLowering, ignore. + if (npuHalfDmaCpyNdOp.getMemorySpaceAsUInt() != 0) { + return WalkResult::advance(); } - // If the BD ID is currently used by another DMA op, stop the chain - for that DMA op from further growing, by adding it to the black list - for (auto &[pair, bdIdOps] : tileArgIdxToAssignedBdIdOps) { - if (pair.first == tileOp && llvm::is_contained(bdIdOps, bdIdOp)) { - if (!llvm::is_contained(tileArgIdxsBlackList, pair)) { - tileArgIdxsBlackList.push_back(pair); + // Get the connection op. + std::optional maybeConnectionOp = + npuHalfDmaCpyNdOp.getConnectionOp(); + if (!maybeConnectionOp) { + npuHalfDmaCpyNdOp.emitOpError() + << "expected to operate on an `amdaie.connection`"; + return WalkResult::interrupt(); } - break; - } - } - - // If the black list is not empty, there will be a synchronization. - // Make sure all other DMA chains also break at this point to avoid - // dependency issues. - if (tileArgIdxsBlackList.size() > 0) { - for (auto &[pair, bdIdOps] : tileArgIdxToAssignedBdIdOps) { - if (pair.first == tileOp && bdIdOps.size() > 1) { - if (!llvm::is_contained(tileArgIdxsBlackList, pair)) { - tileArgIdxsBlackList.push_back(pair); + AMDAIE::ConnectionOp connectionOp = maybeConnectionOp.value(); + + // Packet flow, do not chain BDs. + std::optional maybeFlowOp = connectionOp.getFlowOp(); + if (!maybeFlowOp) { + connectionOp->emitOpError() + << "expected to operate on an `amdaie.flow`"; + return WalkResult::interrupt(); + } + AMDAIE::FlowOp flowOp = maybeFlowOp.value(); + bool isPacketFlow = flowOp.getIsPacketFlow(); + if (isPacketFlow) return WalkResult::advance(); + + // Repeat count > 1, do not chain BDs.
+ int32_t repeatCount = 1; + uint8_t numIntraAddrDim = deviceModel.getDmaProp( + AMDAIE::AMDAIETileType::SHIMNOC, + AMDAIE::AMDAIEDmaProp::NumAddrDim); + uint8_t numAddrDim = numIntraAddrDim + kAMDAIEDmaNbInterDims; + auto sizes = npuHalfDmaCpyNdOp.getMixedSizes(); + auto strides = npuHalfDmaCpyNdOp.getMixedStrides(); + if (!sizes.empty() && !strides.empty()) { + int64_t size = getConstantIndexOrAssert(sizes[0]); + int64_t stride = getConstantIndexOrAssert(strides[0]); + if (sizes.size() == numAddrDim || stride == 0) { + repeatCount = size; } } - } - } - - // When current DMA has not been blacklisted and a previous DMA with same - // argIdx exists, chain them together - chaining &= !llvm::is_contained(tileArgIdxsBlackList, - std::make_pair(tileOp, argIdx)) && - tileArgIdxToLastDmaOp.contains({tileOp, argIdx}); - if (chaining) { - // update the previous DMA op by changing its useNextBd and - // nextBd - AMDAIE::NpuHalfDmaCpyNdOp lastDmaOp = - tileArgIdxToLastDmaOp[{tileOp, argIdx}]; - rewriter.setInsertionPointAfter(lastDmaOp); - auto chainedDmaOp = rewriter.create( - lastDmaOp.getLoc(), lastDmaOp.getResultTypes(), - lastDmaOp.getConnection(), lastDmaOp.getInput(), - lastDmaOp.getMixedOffsets(), lastDmaOp.getMixedSizes(), - lastDmaOp.getMixedStrides(), lastDmaOp.getBdId(), - lastDmaOp.getChannel(), true, bdIdOp, lastDmaOp.getStartBd()); - rewriter.replaceOp(lastDmaOp, chainedDmaOp.getResults()); - tileArgIdxToLastChainedDmaOp[{tileOp, argIdx}] = chainedDmaOp; - // update the current DMA op by changing its startBd - rewriter.setInsertionPoint(npuHalfDmaCpyNdOp); - auto npuHalfDmaCpyNdOpNew = rewriter.create( - npuHalfDmaCpyNdOp.getLoc(), npuHalfDmaCpyNdOp.getResultTypes(), - npuHalfDmaCpyNdOp.getConnection(), npuHalfDmaCpyNdOp.getInput(), - npuHalfDmaCpyNdOp.getMixedOffsets(), - npuHalfDmaCpyNdOp.getMixedSizes(), - npuHalfDmaCpyNdOp.getMixedStrides(), npuHalfDmaCpyNdOp.getBdId(), - npuHalfDmaCpyNdOp.getChannel(), npuHalfDmaCpyNdOp.getUseNextBd(), - 
npuHalfDmaCpyNdOp.getNextBd(), chainedDmaOp.getStartBd()); - rewriter.replaceOp(npuHalfDmaCpyNdOp, - npuHalfDmaCpyNdOpNew.getResults()); - npuHalfDmaCpyNdOp = npuHalfDmaCpyNdOpNew; - } - - // Update BD ID assignment, if it is chaining, safely release the BD IDs - // since a synchronization will happen - if (chaining && tileArgIdxToAssignedBdIdOps.contains({tileOp, argIdx})) { - tileArgIdxToAssignedBdIdOps[{tileOp, argIdx}].push_back(bdIdOp); - } else { - tileArgIdxToAssignedBdIdOps[{tileOp, argIdx}] = {bdIdOp}; - } - - // The current DMA op is not chained with the previous DMA op (i.e. - // synchroizaiton will happen between these two ops), removing from the - // black list - if (!chaining) { - auto it = - std::find(tileArgIdxsBlackList.begin(), tileArgIdxsBlackList.end(), - std::make_pair(tileOp, argIdx)); - if (it != tileArgIdxsBlackList.end()) { - tileArgIdxsBlackList.erase(it); - } - } - // Update the last encountered DMA op - tileArgIdxToLastDmaOp[{tileOp, argIdx}] = npuHalfDmaCpyNdOp; + if (repeatCount > 1) return WalkResult::advance(); + + // Get the BD ID and tile op. + std::optional maybeBdIdOp = + npuHalfDmaCpyNdOp.getBdIdOp(); + if (!maybeBdIdOp) { + npuHalfDmaCpyNdOp.emitOpError() << "must have a BD ID op"; + return WalkResult::interrupt(); + } + AMDAIE::BdIdOp bdIdOp = maybeBdIdOp.value(); + uint32_t bdId = getConstantIndexOrAssert(bdIdOp.getValue()); + AMDAIE::TileOp tileOp = dyn_cast_if_present( + bdIdOp.getTile().getDefiningOp()); + if (!tileOp) { + bdIdOp.emitOpError() << "must operate on an `amdaie.tile`"; + return WalkResult::interrupt(); + } - } else if (auto npuDmaWaitOp = dyn_cast(op)) { - // Handle the special case where there are multiple DMA ops preceding any - // Wait op. In such a case, some DMA ops may be chained first, before they - // are put onto the black list. Therefore, go over the black list and - // unchain the DMA ops when required. 
+ // Any duplicate BD ID from the same tile indicates the chain cannot + // grow further and requires breaking to release the conflicting BD + // ID. + SmallVector chainsToBreak; + TileConnect currTileConnect = {tileOp, connectionOp}; + canChainGrowFurther(bdId, currTileConnect, tileConnectToBdIds, + chainsToBreak); + + // If the chains are not to be continued, update DMA operands using + // the `updateChainOperands` function. + if (!chainsToBreak.empty()) { + for (auto &entry : chainsToBreak) { + updateChainOperands(rewriter, tileConnectToDmaChain[entry]); + tileConnectToBdIds[entry].clear(); + tileConnectToDmaChain[entry].clear(); + } + } - for (auto &[tileOp, argIdx] : tileArgIdxsBlackList) { - if (tileArgIdxToLastChainedDmaOp.contains({tileOp, argIdx}) && - tileArgIdxToLastDmaOp.contains({tileOp, argIdx})) { - // break the chain lastChainedDmaOp -> lastDmaOp - AMDAIE::NpuHalfDmaCpyNdOp lastChainedDmaOp = - tileArgIdxToLastChainedDmaOp[{tileOp, argIdx}]; - AMDAIE::NpuHalfDmaCpyNdOp lastDmaOp = - tileArgIdxToLastDmaOp[{tileOp, argIdx}]; - // revert useNextBd and nextBd in lastChainedDmaOp - bool useNextBd{false}; - Value nextBd{nullptr}; - rewriter.setInsertionPointAfter(lastChainedDmaOp); - auto unchainedDmaOp = rewriter.create( - lastChainedDmaOp.getLoc(), lastChainedDmaOp.getResultTypes(), - lastChainedDmaOp.getConnection(), lastChainedDmaOp.getInput(), - lastChainedDmaOp.getMixedOffsets(), - lastChainedDmaOp.getMixedSizes(), - lastChainedDmaOp.getMixedStrides(), lastChainedDmaOp.getBdId(), - lastChainedDmaOp.getChannel(), useNextBd, nextBd, - lastChainedDmaOp.getStartBd()); - rewriter.replaceOp(lastChainedDmaOp, unchainedDmaOp.getResults()); - tileArgIdxToLastChainedDmaOp.erase({tileOp, argIdx}); - // revert startBd in lastDmaOp - auto startBd = lastDmaOp.getBdId(); - rewriter.setInsertionPoint(lastDmaOp); - unchainedDmaOp = rewriter.create( - lastDmaOp.getLoc(), lastDmaOp.getResultTypes(), - lastDmaOp.getConnection(), lastDmaOp.getInput(), - 
lastDmaOp.getMixedOffsets(), lastDmaOp.getMixedSizes(), - lastDmaOp.getMixedStrides(), lastDmaOp.getBdId(), - lastDmaOp.getChannel(), lastDmaOp.getUseNextBd(), - lastDmaOp.getNextBd(), startBd); - tileArgIdxToAssignedBdIdOps[{tileOp, argIdx}] = { - lastDmaOp.getBdIdOp().value()}; - rewriter.replaceOp(lastDmaOp, unchainedDmaOp.getResults()); - tileArgIdxToLastDmaOp[{tileOp, argIdx}] = unchainedDmaOp; - } else { - npuDmaWaitOp.emitError() << "unhandled situation in DMA BD chaining, " - "please try to disable this pass"; - return WalkResult::interrupt(); + // Insert at the front, as we are walking in reverse order. + tileConnectToBdIds[currTileConnect].insert( + tileConnectToBdIds[currTileConnect].begin(), bdId); + tileConnectToDmaChain[currTileConnect].insert( + tileConnectToDmaChain[currTileConnect].begin(), + npuHalfDmaCpyNdOp); } - } - - tileArgIdxsBlackList.clear(); - } - return WalkResult::advance(); - }); + return WalkResult::advance(); + }); - // Only keep DMA Wait Ops if at the end of a chain, erase others - res = controlCodeOp->walk([&](Operation *op) { - if (auto npuDmaWaitOp = dyn_cast(op)) { - bool toErase = true; - for (Value token : npuDmaWaitOp.getAsyncTokens()) { - auto npuHalfDmaCpyNdOp = dyn_cast_if_present( - token.getDefiningOp()); - bool chaining = npuHalfDmaCpyNdOp && npuHalfDmaCpyNdOp.getUseNextBd(); - if (!chaining) { - toErase = false; - break; - } - } - if (toErase) { - rewriter.eraseOp(npuDmaWaitOp); - } - } - return WalkResult::advance(); - }); + // Build the remaining chains. 
+ for (auto &[entry, _] : tileConnectToBdIds) { + updateChainOperands(rewriter, tileConnectToDmaChain[entry]); + } if (res.wasInterrupted()) return failure(); return success(); @@ -336,7 +246,8 @@ void AMDAIEInsertDmaBdChainPass::runOnOperation() { AMDAIE::getDeviceModel(maybeDevice.value()); WalkResult res = parentOp->walk([&](AMDAIE::WorkgroupOp workgroupOp) { - if (failed(insertDmaBdChain(deviceModel, workgroupOp))) { + AMDAIE::ControlCodeOp controlCodeOp = workgroupOp.getControlCode(); + if (failed(insertDmaBdChain(deviceModel, controlCodeOp))) { return WalkResult::interrupt(); } return WalkResult::advance(); diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIENpuDmaToHalfDmaCpyNd.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIENpuDmaToHalfDmaCpyNd.cpp index a58d836cf..4aa9c6928 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIENpuDmaToHalfDmaCpyNd.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIENpuDmaToHalfDmaCpyNd.cpp @@ -31,7 +31,7 @@ struct NpuDmaToHalfDmaCpyNdConverter final return dmaOp.emitOpError() << "should operate on an `amdaie.connection` op"; } - bool useNextBd{false}; + BoolAttr useNextBd = rewriter.getBoolAttr(false); Value nextBd{nullptr}; // Convert source half. 
Value source = diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h index d10e20611..afd92e4cd 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h @@ -37,7 +37,6 @@ namespace mlir::iree_compiler::AMDAIE { #define GEN_PASS_DEF_AMDAIECONNECTIONTOFLOW #define GEN_PASS_DEF_AMDAIECONTROLCODEFORALLTOFOR #define GEN_PASS_DEF_AMDAIECONTROLCODELOOPUNROLL -#define GEN_PASS_DEF_AMDAIECONTROLCODETOHALFDMACPYND #define GEN_PASS_DEF_AMDAIECONTROLCODELOWERING #define GEN_PASS_DEF_AMDAIECONTROLCODETOTRANSACTION #define GEN_PASS_DEF_AMDAIECONVERTCOREFORALLTOFOR diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td index 87d02e260..72c075465 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td @@ -171,7 +171,6 @@ def AMDAIEControlCodeLowering : let constructor = "mlir::iree_compiler::AMDAIE::createAMDAIEControlCodeLoweringPass()"; } - def AMDAIEControlCodeToTransaction : Pass<"iree-amdaie-controlcode-to-transaction", ""> { let summary = "Convert controlcode instructions into a NPU instruction transaction."; @@ -233,6 +232,7 @@ def AMDAIEDistributeL1Allocations : let constructor = "mlir::iree_compiler::AMDAIE::createAMDAIEDistributeL1AllocationsPass()"; } + def AMDAIEDmaComposition : Pass<"iree-amdaie-dma-composition"> { let summary = "Compose DMA operations by DMA combination and loop subsumption."; diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIEDmaUtils.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIEDmaUtils.cpp index 94476b085..9c2e43c6e 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIEDmaUtils.cpp 
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIEDmaUtils.cpp @@ -483,55 +483,4 @@ LogicalResult moveNpuDmaSyncUsersAfterAncestorInSameBlock( return success(); } -// Move NPU DMA wait operations with async_source tokens as late as possible -// (after the target DMA wait operation which has async_target token) This is to -// help later optimizations such as DMA BD chaining. Example: -// -// %0 = dma_cpy_nd async_source -// dma_wait(%0 : !amdaie.async_source_token) -// %1 = dma_cpy_nd async_source -// dma_wait(%1 : !amdaie.async_source_token) -// %2 = dma_cpy_nd async_target -// dma_wait(%2 : !amdaie.async_target_token) -// ------------------------------->>>>>>>>>> -// %0 = dma_cpy_nd async_source -// %1 = dma_cpy_nd async_source -// %2 = dma_cpy_nd async_target -// dma_wait(%2 : !amdaie.async_target_token) -// dma_wait(%0 : !amdaie.async_source_token) -// dma_wait(%1 : !amdaie.async_source_token) - -LogicalResult moveNpuSourceDmaSyncAfterTargetDmaCpy(RewriterBase &rewriter, - Operation *parentOp) { - // Stores NPU source DMA wait operations to be moved later. - SmallVector npuSourceDmaWaitOps; - - WalkResult res = parentOp->walk([&](Operation *op) { - if (auto npuDmaWaitOp = dyn_cast(op)) { - // Check if the DMA wait operation contains an async target token. - bool hasAsyncTargetToken = - llvm::any_of(npuDmaWaitOp.getAsyncTokens(), [](Value token) { - return isa(token.getType()); - }); - if (!hasAsyncTargetToken) { - npuSourceDmaWaitOps.push_back(npuDmaWaitOp); - } else { - // Move all collected NPU source DMA wait ops after the current target - // DMA wait op, but only if they belong to the same block. - for (auto &npuSourceDmaWaitOp : npuSourceDmaWaitOps) { - if (npuSourceDmaWaitOp->getBlock() == npuDmaWaitOp->getBlock()) { - rewriter.moveOpAfter(npuSourceDmaWaitOp, npuDmaWaitOp); - } - } - // Clear the list after moving. 
- npuSourceDmaWaitOps.clear(); - } - } - return WalkResult::advance(); - }); - - if (res.wasInterrupted()) return failure(); - return success(); -} - } // namespace mlir::iree_compiler::AMDAIE diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIEDmaUtils.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIEDmaUtils.h index 8dc53b152..e4dbfd36b 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIEDmaUtils.h +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIEDmaUtils.h @@ -344,10 +344,6 @@ struct DmaDimConfig { LogicalResult moveNpuDmaSyncUsersAfterAncestorInSameBlock( RewriterBase &rewriter, Operation *parentOp); -/// Utility to move the source dma synchronization after the target dma copy. -LogicalResult moveNpuSourceDmaSyncAfterTargetDmaCpy(RewriterBase &rewriter, - Operation *parentOp); - } // namespace mlir::iree_compiler::AMDAIE #endif diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/controlcode_lowering.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/controlcode_lowering.mlir index 4e1346ae9..63b163034 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/controlcode_lowering.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/controlcode_lowering.mlir @@ -58,18 +58,18 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} // CHECK: amdaie.npu.write_bd {bd_id = 0 : ui32, buffer_length = 0 : ui32, buffer_offset = 0 : ui32, col = 0 : ui32, enable_packet = true, iteration_current = 0 : ui32, iteration_size = 0 : ui32, iteration_stride = 0 : ui32, lock_acq_enable = false, lock_acq_id = 0 : ui32, lock_acq_val = 0 : i32, lock_rel_id = 0 : ui32, lock_rel_val = 0 : i32, next_bd = 0 : ui32, out_of_order_id = 0 : ui32, packet_id = 0 : ui32, packet_type = 0 : ui32, paddings_after = array, paddings_before = array, row = 0 : ui32, sizes = array, strides = array, 
use_next_bd = false, valid_bd = true} // CHECK: amdaie.npu.address_patch {arg_idx = 0 : ui32, bd_id = 0 : ui32, col = 0 : ui32, offset = 0 : ui32} // CHECK: amdaie.npu.push_to_queue {bd_id = 0 : ui32, channel = 0 : ui32, col = 0 : ui32, direction = 1 : i32, repeat_count = 1 : ui32, row = 0 : ui32} - amdaie.npu.half_dma_cpy_nd %4(%5[] [] [] bd_id = %bd_id channel = %channel use_next_bd = false) : !amdaie.logicalobjectfifo> + amdaie.npu.half_dma_cpy_nd %4(%5[] [] [] bd_id = %bd_id channel = %channel) : !amdaie.logicalobjectfifo> // CHECK: amdaie.npu.write_bd {bd_id = 0 : ui32, buffer_length = 2048 : ui32, buffer_offset = 0 : ui32, col = 0 : ui32, enable_packet = true, iteration_current = 0 : ui32, iteration_size = 0 : ui32, iteration_stride = 0 : ui32, lock_acq_enable = false, lock_acq_id = 0 : ui32, lock_acq_val = 0 : i32, lock_rel_id = 0 : ui32, lock_rel_val = 0 : i32, next_bd = 0 : ui32, out_of_order_id = 0 : ui32, packet_id = 0 : ui32, packet_type = 0 : ui32, paddings_after = array, paddings_before = array, row = 0 : ui32, sizes = array, strides = array, use_next_bd = false, valid_bd = true} // CHECK: amdaie.npu.address_patch {arg_idx = 0 : ui32, bd_id = 0 : ui32, col = 0 : ui32, offset = 0 : ui32} // CHECK: %[[TOKEN_0:.+]] = amdaie.npu.push_to_queue async {bd_id = 0 : ui32, channel = 0 : ui32, col = 0 : ui32, direction = 1 : i32, repeat_count = 1 : ui32, row = 0 : ui32} // CHECK: amdaie.npu.dma_wait(%[[TOKEN_0]] : !amdaie.async_token) - %6 = amdaie.npu.half_dma_cpy_nd async %4(%5[0] [2048] [1] bd_id = %bd_id channel = %channel use_next_bd = false) : !amdaie.logicalobjectfifo> + %6 = amdaie.npu.half_dma_cpy_nd async %4(%5[0] [2048] [1] bd_id = %bd_id channel = %channel) : !amdaie.logicalobjectfifo> amdaie.npu.dma_wait(%6 : !amdaie.async_token) // CHECK: amdaie.npu.write_bd {bd_id = 0 : ui32, buffer_length = 1024 : ui32, buffer_offset = 0 : ui32, col = 0 : ui32, enable_packet = true, iteration_current = 0 : ui32, iteration_size = 0 : ui32, iteration_stride = 0 : 
ui32, lock_acq_enable = false, lock_acq_id = 0 : ui32, lock_acq_val = 0 : i32, lock_rel_id = 0 : ui32, lock_rel_val = 0 : i32, next_bd = 0 : ui32, out_of_order_id = 0 : ui32, packet_id = 0 : ui32, packet_type = 0 : ui32, paddings_after = array, paddings_before = array, row = 0 : ui32, sizes = array, strides = array, use_next_bd = false, valid_bd = true} // CHECK: amdaie.npu.address_patch {arg_idx = 0 : ui32, bd_id = 0 : ui32, col = 0 : ui32, offset = 0 : ui32} // CHECK: %[[TOKEN_1:.+]] = amdaie.npu.push_to_queue async {bd_id = 0 : ui32, channel = 0 : ui32, col = 0 : ui32, direction = 1 : i32, repeat_count = 2 : ui32, row = 0 : ui32} // CHECK: amdaie.npu.dma_wait(%[[TOKEN_1]] : !amdaie.async_token) - %7 = amdaie.npu.half_dma_cpy_nd async %4(%5[0, 0, 0, 0] [2, 4, 16, 16] [0, 64, 8, 1] bd_id = %bd_id channel = %channel use_next_bd = false) : !amdaie.logicalobjectfifo> + %7 = amdaie.npu.half_dma_cpy_nd async %4(%5[0, 0, 0, 0] [2, 4, 16, 16] [0, 64, 8, 1] bd_id = %bd_id channel = %channel) : !amdaie.logicalobjectfifo> amdaie.npu.dma_wait(%7 : !amdaie.async_token) amdaie.end } @@ -109,18 +109,18 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} // CHECK: amdaie.npu.write_bd {bd_id = 0 : ui32, buffer_length = 0 : ui32, buffer_offset = 0 : ui32, col = 0 : ui32, enable_packet = true, iteration_current = 0 : ui32, iteration_size = 0 : ui32, iteration_stride = 0 : ui32, lock_acq_enable = false, lock_acq_id = 0 : ui32, lock_acq_val = 0 : i32, lock_rel_id = 0 : ui32, lock_rel_val = 0 : i32, next_bd = 0 : ui32, out_of_order_id = 0 : ui32, packet_id = 0 : ui32, packet_type = 0 : ui32, paddings_after = array, paddings_before = array, row = 0 : ui32, sizes = array, strides = array, use_next_bd = false, valid_bd = true} // CHECK: amdaie.npu.address_patch {arg_idx = 0 : ui32, bd_id = 0 : ui32, col = 0 : ui32, offset = 0 : ui32} // CHECK: amdaie.npu.push_to_queue {bd_id = 0 : ui32, channel = 0 : ui32, col = 0 : ui32, direction = 1 : i32, repeat_count = 
1 : ui32, row = 0 : ui32} - amdaie.npu.half_dma_cpy_nd %4(%5[] [] [] bd_id = %bd_id channel = %channel use_next_bd = false) : !amdaie.logicalobjectfifo> + amdaie.npu.half_dma_cpy_nd %4(%5[] [] [] bd_id = %bd_id channel = %channel) : !amdaie.logicalobjectfifo> // CHECK: amdaie.npu.write_bd {bd_id = 0 : ui32, buffer_length = 1024 : ui32, buffer_offset = 0 : ui32, col = 0 : ui32, enable_packet = true, iteration_current = 0 : ui32, iteration_size = 0 : ui32, iteration_stride = 0 : ui32, lock_acq_enable = false, lock_acq_id = 0 : ui32, lock_acq_val = 0 : i32, lock_rel_id = 0 : ui32, lock_rel_val = 0 : i32, next_bd = 0 : ui32, out_of_order_id = 0 : ui32, packet_id = 0 : ui32, packet_type = 0 : ui32, paddings_after = array, paddings_before = array, row = 0 : ui32, sizes = array, strides = array, use_next_bd = false, valid_bd = true} // CHECK: amdaie.npu.address_patch {arg_idx = 0 : ui32, bd_id = 0 : ui32, col = 0 : ui32, offset = 0 : ui32} // CHECK: %[[TOKEN_0:.+]] = amdaie.npu.push_to_queue async {bd_id = 0 : ui32, channel = 0 : ui32, col = 0 : ui32, direction = 1 : i32, repeat_count = 1 : ui32, row = 0 : ui32} // CHECK: amdaie.npu.dma_wait(%[[TOKEN_0]] : !amdaie.async_token) - %6 = amdaie.npu.half_dma_cpy_nd async %4(%5[0] [2048] [1] bd_id = %bd_id channel = %channel use_next_bd = false) : !amdaie.logicalobjectfifo> + %6 = amdaie.npu.half_dma_cpy_nd async %4(%5[0] [2048] [1] bd_id = %bd_id channel = %channel) : !amdaie.logicalobjectfifo> amdaie.npu.dma_wait(%6 : !amdaie.async_token) // CHECK: amdaie.npu.write_bd {bd_id = 0 : ui32, buffer_length = 512 : ui32, buffer_offset = 0 : ui32, col = 0 : ui32, enable_packet = true, iteration_current = 0 : ui32, iteration_size = 0 : ui32, iteration_stride = 0 : ui32, lock_acq_enable = false, lock_acq_id = 0 : ui32, lock_acq_val = 0 : i32, lock_rel_id = 0 : ui32, lock_rel_val = 0 : i32, next_bd = 0 : ui32, out_of_order_id = 0 : ui32, packet_id = 0 : ui32, packet_type = 0 : ui32, paddings_after = array, paddings_before = array, row = 
0 : ui32, sizes = array, strides = array, use_next_bd = false, valid_bd = true} // CHECK: amdaie.npu.address_patch {arg_idx = 0 : ui32, bd_id = 0 : ui32, col = 0 : ui32, offset = 64 : ui32} // CHECK: %[[TOKEN_1:.+]] = amdaie.npu.push_to_queue async {bd_id = 0 : ui32, channel = 0 : ui32, col = 0 : ui32, direction = 1 : i32, repeat_count = 2 : ui32, row = 0 : ui32} // CHECK: amdaie.npu.dma_wait(%[[TOKEN_1]] : !amdaie.async_token) - %7 = amdaie.npu.half_dma_cpy_nd async %4(%5[0, 0, 0, 32] [2, 4, 16, 16] [0, 64, 8, 1] bd_id = %bd_id channel = %channel use_next_bd = false) : !amdaie.logicalobjectfifo> + %7 = amdaie.npu.half_dma_cpy_nd async %4(%5[0, 0, 0, 32] [2, 4, 16, 16] [0, 64, 8, 1] bd_id = %bd_id channel = %channel) : !amdaie.logicalobjectfifo> amdaie.npu.dma_wait(%7 : !amdaie.async_token) amdaie.end } @@ -128,3 +128,53 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} return } } + +// ----- + +// CHECK-LABEL: @half_npu_dma_cpy_nd_chain +// CHECK: amdaie.controlcode +#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> +#pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect> +module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { + func.func @half_npu_dma_cpy_nd_chain() { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c2 = arith.constant 2 : index + amdaie.workgroup { + %tile = amdaie.tile(%c0, %c1) + %tile_0 = amdaie.tile(%c0, %c0) + %buffer = amdaie.buffer(%tile) : memref<2048xi32, 1 : i32> + %buffer_1 = amdaie.buffer(%tile) : memref<2048xi32, 1 : i32> + %lock = amdaie.lock(%tile(4), 4) + %lock_2 = amdaie.lock(%tile(5), 0) + %0 = amdaie.logicalobjectfifo.from_buffers({%buffer, %buffer_1}, {%lock}, {%lock_2}) : memref<2048xi32, 1 : i32>, memref<2048xi32, 1 : i32> -> !amdaie.logicalobjectfifo, 2> + %1 = hal.interface.binding.subspan 
layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<64x32xi32> + %2 = amdaie.logicalobjectfifo.placeholder{%tile_0} : !amdaie.logicalobjectfifo> + %channel = amdaie.channel(%tile_0, 0, port_type = DMA, direction = MM2S) + %channel_3 = amdaie.channel(%tile, 0, port_type = DMA, direction = S2MM) + %3 = amdaie.flow({%channel} -> {%channel_3}) {is_packet_flow = false} + %4 = amdaie.connection(%0 {%channel_3}, %2 {%channel}, flow = %3) {connection_type = #amdaie} : (!amdaie.logicalobjectfifo, 2>, !amdaie.logicalobjectfifo>) + amdaie.controlcode { + %5 = amdaie.logicalobjectfifo.from_memref %1, {%tile_0} : memref<64x32xi32> -> !amdaie.logicalobjectfifo> + memref.assume_alignment %1, 64 : memref<64x32xi32> + %bd_id = amdaie.bd_id(%tile_0, %c0) + %bd_id_1 = amdaie.bd_id(%tile_0, %c1) + %bd_id_2 = amdaie.bd_id(%tile_0, %c2) +// CHECK: amdaie.npu.write_bd {bd_id = 0 : ui32, buffer_length = 0 : ui32, buffer_offset = 0 : ui32, col = 0 : ui32, enable_packet = false, iteration_current = 0 : ui32, iteration_size = 0 : ui32, iteration_stride = 0 : ui32, lock_acq_enable = false, lock_acq_id = 0 : ui32, lock_acq_val = 0 : i32, lock_rel_id = 0 : ui32, lock_rel_val = 0 : i32, next_bd = 1 : ui32, out_of_order_id = 0 : ui32, packet_id = 0 : ui32, packet_type = 0 : ui32, paddings_after = array, paddings_before = array, row = 0 : ui32, sizes = array, strides = array, use_next_bd = true, valid_bd = true} +// CHECK: amdaie.npu.address_patch {arg_idx = 0 : ui32, bd_id = 0 : ui32, col = 0 : ui32, offset = 0 : ui32} + amdaie.npu.half_dma_cpy_nd %4(%5[] [] [] bd_id = %bd_id channel = %channel use_next_bd = true next_bd = %bd_id_1 start_bd = %bd_id) : !amdaie.logicalobjectfifo> +// CHECK: amdaie.npu.write_bd {bd_id = 1 : ui32, buffer_length = 2048 : ui32, buffer_offset = 0 : ui32, col = 0 : ui32, enable_packet = false, iteration_current = 0 : ui32, iteration_size = 0 : ui32, iteration_stride = 0 : ui32, lock_acq_enable = false, lock_acq_id = 0 : 
ui32, lock_acq_val = 0 : i32, lock_rel_id = 0 : ui32, lock_rel_val = 0 : i32, next_bd = 2 : ui32, out_of_order_id = 0 : ui32, packet_id = 0 : ui32, packet_type = 0 : ui32, paddings_after = array, paddings_before = array, row = 0 : ui32, sizes = array, strides = array, use_next_bd = true, valid_bd = true} +// CHECK: amdaie.npu.address_patch {arg_idx = 0 : ui32, bd_id = 1 : ui32, col = 0 : ui32, offset = 0 : ui32} + amdaie.npu.half_dma_cpy_nd async %4(%5[0] [2048] [1] bd_id = %bd_id_1 channel = %channel use_next_bd = true next_bd = %bd_id_2 start_bd = %bd_id) : !amdaie.logicalobjectfifo> +// CHECK: amdaie.npu.write_bd {bd_id = 2 : ui32, buffer_length = 1024 : ui32, buffer_offset = 0 : ui32, col = 0 : ui32, enable_packet = false, iteration_current = 0 : ui32, iteration_size = 0 : ui32, iteration_stride = 0 : ui32, lock_acq_enable = false, lock_acq_id = 0 : ui32, lock_acq_val = 0 : i32, lock_rel_id = 0 : ui32, lock_rel_val = 0 : i32, next_bd = 0 : ui32, out_of_order_id = 0 : ui32, packet_id = 0 : ui32, packet_type = 0 : ui32, paddings_after = array, paddings_before = array, row = 0 : ui32, sizes = array, strides = array, use_next_bd = false, valid_bd = true} +// CHECK: amdaie.npu.address_patch {arg_idx = 0 : ui32, bd_id = 2 : ui32, col = 0 : ui32, offset = 0 : ui32} +// CHECK: %[[TOKEN_0:.+]] = amdaie.npu.push_to_queue async {bd_id = 0 : ui32, channel = 0 : ui32, col = 0 : ui32, direction = 1 : i32, repeat_count = 2 : ui32, row = 0 : ui32} +// CHECK: amdaie.npu.dma_wait(%[[TOKEN_0]] : !amdaie.async_token) + %6 = amdaie.npu.half_dma_cpy_nd async %4(%5[0, 0, 0, 0] [2, 4, 16, 16] [0, 64, 8, 1] bd_id = %bd_id_2 channel = %channel use_next_bd = false start_bd = %bd_id) : !amdaie.logicalobjectfifo> + amdaie.npu.dma_wait(%6 : !amdaie.async_token) + amdaie.end + } + } + return + } +} diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/insert_dma_bd_chain.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/insert_dma_bd_chain.mlir index 
6bf9c6161..4394718ad 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/insert_dma_bd_chain.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/insert_dma_bd_chain.mlir @@ -1,14 +1,15 @@ // RUN: iree-opt --pass-pipeline="builtin.module(iree-amdaie-insert-dma-bd-chain)" --split-input-file --verify-diagnostics %s | FileCheck %s // CHECK-LABEL: @single_bd_chain -// CHECK: %[[BD_ID_0:.+]] = amdaie.bd_id -// CHECK: %[[BD_ID_1:.+]] = amdaie.bd_id -// CHECK: %[[CHANNEL:.+]] = amdaie.channel -// CHECK: %[[CONNECTION:.+]] = amdaie.connection -// CHECK: %[[OBJECT_FIFO:.+]] = amdaie.logicalobjectfifo.from_memref -// CHECK: %[[TOKEN_0:.+]] = amdaie.npu.half_dma_cpy_nd async %[[CONNECTION]](%[[OBJECT_FIFO]] [0, 0, 0] [16, 32, 32] [32, 512, 1] bd_id = %[[BD_ID_0]] channel = %[[CHANNEL]] use_next_bd = true next_bd = %[[BD_ID_1]] start_bd = %[[BD_ID_0]]) -// CHECK: %[[TOKEN_1:.+]] = amdaie.npu.half_dma_cpy_nd async %[[CONNECTION]](%[[OBJECT_FIFO]] [0, 0, 0] [16, 32, 32] [32, 512, 1] bd_id = %[[BD_ID_1]] channel = %[[CHANNEL]] use_next_bd = false start_bd = %[[BD_ID_0]]) -// CHECK: amdaie.npu.dma_wait(%[[TOKEN_1]] : !amdaie.async_token) +// CHECK: %[[CHANNEL:.+]] = amdaie.channel +// CHECK: %[[CONNECTION:.+]] = amdaie.connection +// CHECK: amdaie.controlcode +// CHECK: %[[BD_ID_0:.+]] = amdaie.bd_id +// CHECK: %[[BD_ID_1:.+]] = amdaie.bd_id +// CHECK: %[[OBJECT_FIFO:.+]] = amdaie.logicalobjectfifo.from_memref +// CHECK: amdaie.npu.half_dma_cpy_nd %[[CONNECTION]](%[[OBJECT_FIFO]] [0, 0, 0] [16, 32, 32] [32, 512, 1] bd_id = %[[BD_ID_0]] channel = %[[CHANNEL]] use_next_bd = true next_bd = %[[BD_ID_1]] start_bd = %[[BD_ID_0]]) +// CHECK: %[[TOKEN_0:.+]] = amdaie.npu.half_dma_cpy_nd async %[[CONNECTION]](%[[OBJECT_FIFO]] [0, 0, 0] [16, 32, 32] [32, 512, 1] bd_id = %[[BD_ID_1]] channel = %[[CHANNEL]] use_next_bd = false start_bd = %[[BD_ID_0]]) +// CHECK: amdaie.npu.dma_wait(%[[TOKEN_0]] : !amdaie.async_token) #executable_target_amdaie_xclbin_fb 
= #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> #pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect> @@ -19,8 +20,6 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} amdaie.workgroup { %tile = amdaie.tile(%c0, %c0) %tile_0 = amdaie.tile(%c0, %c1) - %bd_id = amdaie.bd_id(%tile, 0) - %bd_id_1 = amdaie.bd_id(%tile, 1) %buffer = amdaie.buffer(%tile_0) : memref<1024xbf16, 1 : i32> %buffer_2 = amdaie.buffer(%tile_0) : memref<1024xbf16, 1 : i32> %lock = amdaie.lock(%tile_0(0), 0) @@ -35,8 +34,10 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} amdaie.controlcode { memref.assume_alignment %0, 64 : memref<512x512xbf16> %5 = amdaie.logicalobjectfifo.from_memref %0, {%tile} : memref<512x512xbf16> -> !amdaie.logicalobjectfifo> + %bd_id = amdaie.bd_id(%tile, %c0) %6 = amdaie.npu.half_dma_cpy_nd async %4(%5 [0, 0, 0] [16, 32, 32] [32, 512, 1] bd_id = %bd_id channel = %channel use_next_bd = false start_bd = %bd_id) : !amdaie.logicalobjectfifo> amdaie.npu.dma_wait(%6 : !amdaie.async_token) + %bd_id_1 = amdaie.bd_id(%tile, %c1) %7 = amdaie.npu.half_dma_cpy_nd async %4(%5 [0, 0, 0] [16, 32, 32] [32, 512, 1] bd_id = %bd_id_1 channel = %channel use_next_bd = false start_bd = %bd_id_1) : !amdaie.logicalobjectfifo> amdaie.npu.dma_wait(%7 : !amdaie.async_token) amdaie.end @@ -49,37 +50,36 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} // ----- // CHECK-LABEL: @two_bd_chain -// CHECK: %[[BD_ID_0:.+]] = amdaie.bd_id -// CHECK: %[[BD_ID_1:.+]] = amdaie.bd_id -// CHECK: %[[BD_ID_2:.+]] = amdaie.bd_id -// CHECK: %[[BD_ID_3:.+]] = amdaie.bd_id -// CHECK: %[[CHANNEL_0:.+]] = amdaie.channel -// CHECK: %[[CHANNEL_1:.+]] = amdaie.channel -// CHECK: %[[CHANNEL_2:.+]] = amdaie.channel -// CHECK: %[[CHANNEL_3:.+]] = amdaie.channel -// CHECK: %[[CONNECTION_0:.+]] = amdaie.connection -// 
CHECK: %[[CONNECTION_1:.+]] = amdaie.connection -// CHECK: %[[OBJECT_FIFO_0:.+]] = amdaie.logicalobjectfifo.from_memref -// CHECK: %[[OBJECT_FIFO_1:.+]] = amdaie.logicalobjectfifo.from_memref -// CHECK: %[[TOKEN_0:.+]] = amdaie.npu.half_dma_cpy_nd async %[[CONNECTION_0]](%[[OBJECT_FIFO_0]] [0, 0, 0] [16, 32, 32] [32, 512, 1] bd_id = %[[BD_ID_0]] channel = %[[CHANNEL_0]] use_next_bd = true next_bd = %[[BD_ID_2]] start_bd = %[[BD_ID_0]]) -// CHECK: %[[TOKEN_1:.+]] = amdaie.npu.half_dma_cpy_nd async %[[CONNECTION_1]](%[[OBJECT_FIFO_1]] [0, 0, 0] [16, 32, 32] [32, 512, 1] bd_id = %[[BD_ID_1]] channel = %[[CHANNEL_2]] use_next_bd = true next_bd = %[[BD_ID_3]] start_bd = %[[BD_ID_1]]) -// CHECK: %[[TOKEN_2:.+]] = amdaie.npu.half_dma_cpy_nd async %[[CONNECTION_0]](%[[OBJECT_FIFO_0]] [0, 0, 0] [16, 32, 32] [32, 512, 1] bd_id = %[[BD_ID_2]] channel = %[[CHANNEL_0]] use_next_bd = false start_bd = %[[BD_ID_0]]) -// CHECK: %[[TOKEN_3:.+]] = amdaie.npu.half_dma_cpy_nd async %[[CONNECTION_1]](%[[OBJECT_FIFO_1]] [0, 0, 0] [16, 32, 32] [32, 512, 1] bd_id = %[[BD_ID_3]] channel = %[[CHANNEL_2]] use_next_bd = false start_bd = %[[BD_ID_1]]) -// CHECK: amdaie.npu.dma_wait(%[[TOKEN_2]] : !amdaie.async_token) -// CHECK: amdaie.npu.dma_wait(%[[TOKEN_3]] : !amdaie.async_token) +// CHECK: %[[CHANNEL_0:.+]] = amdaie.channel +// CHECK: %[[CHANNEL_1:.+]] = amdaie.channel +// CHECK: %[[CHANNEL_2:.+]] = amdaie.channel +// CHECK: %[[CHANNEL_3:.+]] = amdaie.channel +// CHECK: %[[CONNECTION_0:.+]] = amdaie.connection +// CHECK: %[[CONNECTION_1:.+]] = amdaie.connection +// CHECK: amdaie.controlcode +// CHECK: %[[BD_ID_0:.+]] = amdaie.bd_id +// CHECK: %[[BD_ID_1:.+]] = amdaie.bd_id +// CHECK: %[[BD_ID_2:.+]] = amdaie.bd_id +// CHECK: %[[BD_ID_3:.+]] = amdaie.bd_id +// CHECK: %[[OBJECT_FIFO_0:.+]] = amdaie.logicalobjectfifo.from_memref +// CHECK: %[[OBJECT_FIFO_1:.+]] = amdaie.logicalobjectfifo.from_memref +// CHECK: amdaie.npu.half_dma_cpy_nd %[[CONNECTION_0]](%[[OBJECT_FIFO_0]] [0, 0, 0] [16, 32, 
32] [32, 512, 1] bd_id = %[[BD_ID_0]] channel = %[[CHANNEL_0]] use_next_bd = true next_bd = %[[BD_ID_2]] start_bd = %[[BD_ID_0]]) +// CHECK: amdaie.npu.half_dma_cpy_nd %[[CONNECTION_1]](%[[OBJECT_FIFO_1]] [0, 0, 0] [16, 32, 32] [32, 512, 1] bd_id = %[[BD_ID_1]] channel = %[[CHANNEL_2]] use_next_bd = true next_bd = %[[BD_ID_3]] start_bd = %[[BD_ID_1]]) +// CHECK: %[[TOKEN_0:.+]] = amdaie.npu.half_dma_cpy_nd async %[[CONNECTION_0]](%[[OBJECT_FIFO_0]] [0, 0, 0] [16, 32, 32] [32, 512, 1] bd_id = %[[BD_ID_2]] channel = %[[CHANNEL_0]] use_next_bd = false start_bd = %[[BD_ID_0]]) +// CHECK: %[[TOKEN_1:.+]] = amdaie.npu.half_dma_cpy_nd async %[[CONNECTION_1]](%[[OBJECT_FIFO_1]] [0, 0, 0] [16, 32, 32] [32, 512, 1] bd_id = %[[BD_ID_3]] channel = %[[CHANNEL_2]] use_next_bd = false start_bd = %[[BD_ID_1]]) +// CHECK: amdaie.npu.dma_wait(%[[TOKEN_0]] : !amdaie.async_token) +// CHECK: amdaie.npu.dma_wait(%[[TOKEN_1]] : !amdaie.async_token) #executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> #pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect> module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { func.func @two_bd_chain() { %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index + %c2 = arith.constant 2 : index + %c3 = arith.constant 3 : index amdaie.workgroup { %tile = amdaie.tile(%c0, %c0) %tile_0 = amdaie.tile(%c0, %c1) - %bd_id = amdaie.bd_id(%tile, 0) - %bd_id_1 = amdaie.bd_id(%tile, 1) - %bd_id_2 = amdaie.bd_id(%tile, 2) - %bd_id_3 = amdaie.bd_id(%tile, 3) %buffer = amdaie.buffer(%tile_0) : memref<1024xbf16, 1 : i32> %buffer_4 = amdaie.buffer(%tile_0) : memref<1024xbf16, 1 : i32> %buffer_5 = amdaie.buffer(%tile_0) : memref<1024xbf16, 1 : i32> @@ -106,11 +106,15 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} memref.assume_alignment %0, 64 : memref<512x512xbf16> %10 
= amdaie.logicalobjectfifo.from_memref %0, {%tile} : memref<512x512xbf16> -> !amdaie.logicalobjectfifo> %11 = amdaie.logicalobjectfifo.from_memref %1, {%tile} : memref<512x512xbf16> -> !amdaie.logicalobjectfifo> + %bd_id = amdaie.bd_id(%tile, %c0) %12 = amdaie.npu.half_dma_cpy_nd async %5(%10 [0, 0, 0] [16, 32, 32] [32, 512, 1] bd_id = %bd_id channel = %channel use_next_bd = false start_bd = %bd_id) : !amdaie.logicalobjectfifo> + %bd_id_1 = amdaie.bd_id(%tile, %c1) %13 = amdaie.npu.half_dma_cpy_nd async %9(%11 [0, 0, 0] [16, 32, 32] [32, 512, 1] bd_id = %bd_id_1 channel = %channel_11 use_next_bd = false start_bd = %bd_id_1) : !amdaie.logicalobjectfifo> amdaie.npu.dma_wait(%12 : !amdaie.async_token) amdaie.npu.dma_wait(%13 : !amdaie.async_token) + %bd_id_2 = amdaie.bd_id(%tile, %c2) %14 = amdaie.npu.half_dma_cpy_nd async %5(%10 [0, 0, 0] [16, 32, 32] [32, 512, 1] bd_id = %bd_id_2 channel = %channel use_next_bd = false start_bd = %bd_id_2) : !amdaie.logicalobjectfifo> + %bd_id_3 = amdaie.bd_id(%tile, %c3) %15 = amdaie.npu.half_dma_cpy_nd async %9(%11 [0, 0, 0] [16, 32, 32] [32, 512, 1] bd_id = %bd_id_3 channel = %channel_11 use_next_bd = false start_bd = %bd_id_3) : !amdaie.logicalobjectfifo> amdaie.npu.dma_wait(%14 : !amdaie.async_token) amdaie.npu.dma_wait(%15 : !amdaie.async_token) @@ -120,4 +124,3 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} return } } -