diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.cpp
index 72ff124af..09f890879 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.cpp
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.cpp
@@ -1123,7 +1123,7 @@ void NpuHalfDmaCpyNdOp::build(OpBuilder &b, OperationState &result,
                               Value input, ArrayRef<OpFoldResult> offsets,
                               ArrayRef<OpFoldResult> sizes,
                               ArrayRef<OpFoldResult> strides, Value bdId,
-                              Value channel) {
+                              Value channel, Value nextBd, Value startBd) {
   SmallVector<int64_t> staticOffsets, staticSizes, staticStrides;
   SmallVector<Value> dynamicOffsets, dynamicSizes, dynamicStrides;
   dispatchIndexOpFoldResults(offsets, dynamicOffsets, staticOffsets);
@@ -1131,7 +1131,7 @@ void NpuHalfDmaCpyNdOp::build(OpBuilder &b, OperationState &result,
   dispatchIndexOpFoldResults(strides, dynamicStrides, staticStrides);
   build(b, result, resultTypes, connection, input, dynamicOffsets, dynamicSizes,
         dynamicStrides, staticOffsets, staticSizes, staticStrides, bdId,
-        channel);
+        channel, nextBd, startBd);
 }
 
 // Build a NpuHalfDmaCpyNdOp with static entries.
@@ -1140,7 +1140,7 @@ void NpuHalfDmaCpyNdOp::build(OpBuilder &b, OperationState &result,
                               Value input, ArrayRef<int64_t> offsets,
                               ArrayRef<int64_t> sizes,
                               ArrayRef<int64_t> strides, mlir::Value bdId,
-                              Value channel) {
+                              Value channel, Value nextBd, Value startBd) {
   SmallVector<OpFoldResult> offsetValues = llvm::to_vector<4>(llvm::map_range(
       offsets,
       [&](int64_t v) -> OpFoldResult { return b.getI64IntegerAttr(v); }));
@@ -1152,7 +1152,7 @@ void NpuHalfDmaCpyNdOp::build(OpBuilder &b, OperationState &result,
       strides,
       [&](int64_t v) -> OpFoldResult { return b.getI64IntegerAttr(v); }));
   build(b, result, resultTypes, connection, input, offsetValues, sizeValues,
-        strideValues, bdId, channel);
+        strideValues, bdId, channel, nextBd, startBd);
 }
 
 // Build a NpuHalfDmaCpyNdOp with dynamic entries.
@@ -1160,7 +1160,7 @@ void NpuHalfDmaCpyNdOp::build(OpBuilder &b, OperationState &result,
                               TypeRange resultTypes, Value connection,
                               Value input, ValueRange offsets, ValueRange sizes,
                               ValueRange strides, mlir::Value bdId,
-                              Value channel) {
+                              Value channel, Value nextBd, Value startBd) {
   SmallVector<OpFoldResult> offsetValues = llvm::to_vector<4>(
       llvm::map_range(offsets, [](Value v) -> OpFoldResult { return v; }));
   SmallVector<OpFoldResult> sizeValues = llvm::to_vector<4>(
@@ -1168,7 +1168,7 @@ void NpuHalfDmaCpyNdOp::build(OpBuilder &b, OperationState &result,
   SmallVector<OpFoldResult> strideValues = llvm::to_vector<4>(
       llvm::map_range(strides, [](Value v) -> OpFoldResult { return v; }));
   build(b, result, resultTypes, connection, input, offsetValues, sizeValues,
-        strideValues, bdId, channel);
+        strideValues, bdId, channel, nextBd, startBd);
 }
 
 std::optional<int64_t> NpuHalfDmaCpyNdOp::getStaticBaseOffset() {
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.td b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.td
index 36985c8b9..371945da7 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.td
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.td
@@ -595,20 +595,32 @@ def AMDAIE_NpuHalfDmaCpyNdOp
     ShapedType::kDynamic encodes that the corresponding entry has a dynamic
     value.
 
+    It also supports the representation of DMA BD chaining using the
+    `next_bd` and `start_bd` operands. The `next_bd` operand specifies
+    the BD ID of the next DMA operation in the chain, if there is one.
+    
+    The `start_bd` operand specifies the BD ID of the first DMA operation in a sequence.
+    - If `start_bd` is the same as `bd_id`, it marks the start of a chain.
+    - If `start_bd` differs from `bd_id` and `next_bd` is set, it represents 
+      an intermediate operation in the chain.
+    - If `start_bd` differs from `bd_id` and `next_bd` is not set, it represents 
+      the end of the chain.
+      
     Example:
 
     ```mlir
     %2 = amdaie.connection(%1, %0) 
       : (!amdaie.logicalobjectfifo<memref<32x64xi32, 1>>,
       !amdaie.logicalobjectfifo<memref<32x1024xi32>>)
-    %bd_id = amdaie.bd_id(%tile_0_0, 0)
+    %bd_id_0 = amdaie.bd_id(%tile_0_0, 0)
+    %bd_id_1 = amdaie.bd_id(%tile_0_0, 1)
     %channel = amdaie.channel(%tile_0_0, 0, port_type = DMA, direction = MM2S)
     ...
     amdaie.controlcode {
       %5 = amdaie.logicalobjectfifo.from_memref %0, {%tile_0_0} 
         : memref<32x1024xi32> -> !amdaie.logicalobjectfifo<memref<32768xi32>>
       %4 = amdaie.npu.half_dma_cpy_nd async %2(%0[0, 0] [32, 64] [1024, 1]
-        bd_id = %bd_id channel = %channel)
+        bd_id = %bd_id_0 channel = %channel next_bd = %bd_id_1 start_bd = %bd_id_0)
       ...
     }
     ```
@@ -624,7 +636,9 @@ def AMDAIE_NpuHalfDmaCpyNdOp
         DenseI64ArrayAttr:$static_sizes,
         DenseI64ArrayAttr:$static_strides,
         Optional<Index>:$bd_id,
-        Optional<Index>:$channel
+        Optional<Index>:$channel,
+        Optional<Index>:$next_bd,
+        Optional<Index>:$start_bd
   );
 
   let results = (outs Optional<AMDAIE_AsyncTokenType>:$async_token);
@@ -639,6 +653,8 @@ def AMDAIE_NpuHalfDmaCpyNdOp
     custom<DynamicIndexList>($strides, $static_strides)
     (`bd_id` `=` $bd_id^)?
     (`channel` `=` $channel^)?
+    (`next_bd` `=` $next_bd^)?
+    (`start_bd` `=` $start_bd^)?
     `)`
     attr-dict 
     `:` type($input)
@@ -649,16 +665,19 @@ def AMDAIE_NpuHalfDmaCpyNdOp
     OpBuilder<(ins "::mlir::TypeRange":$result_types, "Value":$connection,
       "::mlir::Value":$input, "ArrayRef<OpFoldResult>":$offsets,
       "ArrayRef<OpFoldResult>":$sizes, "ArrayRef<OpFoldResult>":$strides,
-      "::mlir::Value":$bd_id, "::mlir::Value":$channel)>,
+      "::mlir::Value":$bd_id, "::mlir::Value":$channel,
+      CArg<"::mlir::Value", "nullptr">:$next_bd, CArg<"::mlir::Value", "nullptr">:$start_bd)>,
     // Build a NpuHalfDmaCpyNdOp with static entries.
     OpBuilder<(ins "::mlir::TypeRange":$result_types, "Value":$connection,
       "::mlir::Value":$target, "ArrayRef<int64_t>":$offsets,
       "ArrayRef<int64_t>":$sizes, "ArrayRef<int64_t>":$strides,
-      "::mlir::Value":$bd_id, "::mlir::Value":$channel)>,
+      "::mlir::Value":$bd_id, "::mlir::Value":$channel,
+      CArg<"::mlir::Value", "nullptr">:$next_bd, CArg<"::mlir::Value", "nullptr">:$start_bd)>,
     // Build a NpuHalfDmaCpyNdOp with dynamic entries.
     OpBuilder<(ins "::mlir::TypeRange":$result_types, "Value":$connection,
       "::mlir::Value":$input, "ValueRange":$offsets, "ValueRange":$sizes,
-      "ValueRange":$strides, "::mlir::Value":$bd_id, "::mlir::Value":$channel)>
+      "ValueRange":$strides, "::mlir::Value":$bd_id, "::mlir::Value":$channel,
+      CArg<"::mlir::Value", "nullptr">:$next_bd, CArg<"::mlir::Value", "nullptr">:$start_bd)>,
   ];
 
   let extraClassDeclaration = [{
@@ -674,9 +693,20 @@ def AMDAIE_NpuHalfDmaCpyNdOp
     }
 
     std::optional<BdIdOp> getBdIdOp() {
+      if (!getBdId()) return std::nullopt;
       return dyn_cast_if_present<BdIdOp>(getBdId().getDefiningOp());
     }
 
+    std::optional<BdIdOp> getNextBdIdOp() {
+      if (!getNextBd()) return std::nullopt;
+      return dyn_cast_if_present<BdIdOp>(getNextBd().getDefiningOp());
+    }
+
+    std::optional<BdIdOp> getStartBdIdOp() {
+      if (!getStartBd()) return std::nullopt;
+      return dyn_cast_if_present<BdIdOp>(getStartBd().getDefiningOp());
+    }
+
     // Return the input `amdaie.connection` operation.
     std::optional<ConnectionOp> getConnectionOp() {
       return dyn_cast_if_present<ConnectionOp>(getConnection().getDefiningOp());
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/test/roundtrip.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/test/roundtrip.mlir
index c261f099a..c542e2627 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/test/roundtrip.mlir
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/test/roundtrip.mlir
@@ -397,6 +397,7 @@ func.func @npu_dma_cpy_nd_all_operands(%arg0: !amdaie.logicalobjectfifo<memref<1
 // CHECK-DAG:   %[[C1:.+]] = arith.constant 1 : index
 // CHECK-DAG:   %[[TILE_0_0:.+]] = amdaie.tile(%[[C0]], %[[C0]])
 // CHECK-DAG:   %[[BD_ID:.+]] = amdaie.bd_id(%[[TILE_0_0]], %[[C0]])
+// CHECK-DAG:   %[[BD_ID_1:.+]] = amdaie.bd_id(%[[TILE_0_0]], %[[C1]])
 // CHECK-DAG:   %[[CHANNEL:.*]] = amdaie.channel(%[[TILE_0_0]], 0, port_type = DMA, direction = S2MM)
 // CHECK-DAG:   %[[CONNECTION_0:.+]] = amdaie.connection
 func.func @npu_half_dma_cpy_nd(%arg0: !amdaie.logicalobjectfifo<memref<2048xi32>>, %arg1: !amdaie.logicalobjectfifo<memref<2048xi32, 1 : i32>>) {
@@ -404,6 +405,7 @@ func.func @npu_half_dma_cpy_nd(%arg0: !amdaie.logicalobjectfifo<memref<2048xi32>
   %c1 = arith.constant 1 : index
   %tile_0_0 = amdaie.tile(%c0, %c0)
   %bd_id = amdaie.bd_id(%tile_0_0, %c0)
+  %bd_id_1 = amdaie.bd_id(%tile_0_0, %c1)
   %channel = amdaie.channel(%tile_0_0, 0, port_type = DMA, direction = S2MM)
   %0 = amdaie.connection(%arg0, %arg1) : (!amdaie.logicalobjectfifo<memref<2048xi32>>, !amdaie.logicalobjectfifo<memref<2048xi32, 1 : i32>>)
 // CHECK: amdaie.npu.half_dma_cpy_nd %[[CONNECTION_0]](%[[ARG0]] [] [] []) : !amdaie.logicalobjectfifo<memref<2048xi32>>
@@ -416,6 +418,8 @@ func.func @npu_half_dma_cpy_nd(%arg0: !amdaie.logicalobjectfifo<memref<2048xi32>
   amdaie.npu.half_dma_cpy_nd %0(%arg0[%c0, 0] [%c0, 64] [%c0, 1] channel = %channel) : !amdaie.logicalobjectfifo<memref<2048xi32>>
 // CHECK: amdaie.npu.half_dma_cpy_nd %[[CONNECTION_0]](%[[ARG0]] [] [] [] bd_id = %[[BD_ID]] channel = %[[CHANNEL]]) : !amdaie.logicalobjectfifo<memref<2048xi32>>
   amdaie.npu.half_dma_cpy_nd %0(%arg0[] [] [] bd_id = %bd_id channel = %channel) : !amdaie.logicalobjectfifo<memref<2048xi32>>
+// CHECK: amdaie.npu.half_dma_cpy_nd %[[CONNECTION_0]](%[[ARG0]] [] [] [] bd_id = %[[BD_ID]] channel = %[[CHANNEL]] next_bd = %[[BD_ID_1]] start_bd = %[[BD_ID]]) : !amdaie.logicalobjectfifo<memref<2048xi32>>
+  amdaie.npu.half_dma_cpy_nd %0(%arg0[] [] [] bd_id = %bd_id channel = %channel next_bd = %bd_id_1 start_bd = %bd_id) : !amdaie.logicalobjectfifo<memref<2048xi32>>
   return
 }
 
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEControlCodeLowering.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEControlCodeLowering.cpp
index 60bb8144a..cf40de2b0 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEControlCodeLowering.cpp
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEControlCodeLowering.cpp
@@ -110,8 +110,13 @@ struct HalfDmaCpyNdToNpuConverter final
     staticStrides.insert(staticStrides.begin(),
                          numIntraAddrDim - staticStrides.size(), 0);
 
-    bool useNextBd{false};
+    bool useNextBd = false;
     int32_t nextBd{0};
+    if (std::optional<AMDAIE::BdIdOp> nextBdIdOp = op.getNextBdIdOp()) {
+      nextBd = getConstantIndexOrAssert(nextBdIdOp.value().getValue());
+      useNextBd = true;
+    }
+
     bool validBd{true};
     int32_t lockRelVal{0};
     int32_t lockRelId{0};
@@ -208,6 +213,21 @@ struct HalfDmaCpyNdToNpuConverter final
         strides);
     if (failed(npuPushToQueueOp)) return failure();
     rewriter.replaceOp(op, *npuPushToQueueOp);
+
+    std::optional<AMDAIE::BdIdOp> nextBdIdOp = op.getNextBdIdOp();
+    if (nextBdIdOp) {
+      // `next_bd` is set, so either at the beginning or middle of a chain.
+      // No need to push to the queue, just erase the op.
+      rewriter.eraseOp(*npuPushToQueueOp);
+    } else {
+      std::optional<AMDAIE::BdIdOp> maybeStartBdIdOp = op.getStartBdIdOp();
+      if (maybeStartBdIdOp) {
+        // Update with the BD ID at the start of the chain.
+        AMDAIE::BdIdOp startBdIdOp = maybeStartBdIdOp.value();
+        uint32_t startBdId = getConstantIndexOrAssert(startBdIdOp.getValue());
+        npuPushToQueueOp->setBdId(startBdId);
+      }
+    }
     return success();
   }
 
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEFoldDmaWaits.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEFoldDmaWaits.cpp
index 670edeab4..2f0c6030d 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEFoldDmaWaits.cpp
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEFoldDmaWaits.cpp
@@ -142,7 +142,7 @@ LogicalResult foldDmaWaits(const AMDAIE::AMDAIEDeviceModel &deviceModel,
           rewriter.create<AMDAIE::NpuHalfDmaCpyNdOp>(
               op.getLoc(), resultTypeRange, op.getConnection(), op.getInput(),
               op.getMixedOffsets(), op.getMixedSizes(), op.getMixedStrides(),
-              op.getBdId(), op.getChannel());
+              op.getBdId(), op.getChannel(), op.getNextBd(), op.getStartBd());
           rewriter.eraseOp(op);
         }
       }
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEInsertDmaBdChain.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEInsertDmaBdChain.cpp
new file mode 100644
index 000000000..b21ceb025
--- /dev/null
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEInsertDmaBdChain.cpp
@@ -0,0 +1,270 @@
+// Copyright 2024 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree-amd-aie/IR/AMDAIEOps.h"
+#include "iree-amd-aie/Transforms/Passes.h"
+#include "iree-amd-aie/Transforms/Utils/AMDAIEDmaUtils.h"
+#include "iree-amd-aie/Transforms/Utils/AMDAIEUtils.h"
+#include "iree-amd-aie/aie_runtime/Utils/ChannelBdIdGenerator.h"
+#include "iree-amd-aie/aie_runtime/iree_aie_runtime.h"
+#include "mlir/IR/Iterators.h"
+#define DEBUG_TYPE "iree-amdaie-insert-dma-bd-chain"
+
+namespace mlir::iree_compiler::AMDAIE {
+
+namespace {
+/// Identifies a DMA BD chain by the tile of its BD IDs and its connection.
+using DmaChain = std::pair<AMDAIE::TileOp, AMDAIE::ConnectionOp>;
+
+/// Rewrites `dmaOps` (given in program order) into a single BD chain: each op
+/// but the last gets `next_bd` and loses its token (the token's users are
+/// erased); every op gets `start_bd`. Fails, without rewriting, on mixed scopes.
+LogicalResult updateChainOperands(
+    IRRewriter &rewriter, SmallVector<AMDAIE::NpuHalfDmaCpyNdOp> &dmaOps) {
+  // Nothing to do if the DMA chain length is one or less.
+  if (dmaOps.size() < 2) return success();
+  Value startBdId = dmaOps[0].getBdId();
+  Operation *parentOp = dmaOps[0]->getParentOp();
+  for (AMDAIE::NpuHalfDmaCpyNdOp dmaOp : dmaOps) {
+    if (dmaOp->getParentOp() == parentOp) continue;
+    return dmaOp.emitError(
+        "DMA operations to be chained must belong to the same scope");
+  }
+  // Chain the DMA ops.
+  for (unsigned i = 0; i < dmaOps.size() - 1; ++i) {
+    AMDAIE::NpuHalfDmaCpyNdOp currDmaOp = dmaOps[i];
+    Value nextBdId = dmaOps[i + 1].getBdId();
+    rewriter.setInsertionPointAfter(currDmaOp);
+    // No token is produced at the beginning or middle of a chain.
+    rewriter.create<AMDAIE::NpuHalfDmaCpyNdOp>(
+        currDmaOp.getLoc(), TypeRange{}, currDmaOp.getConnection(),
+        currDmaOp.getInput(), currDmaOp.getMixedOffsets(),
+        currDmaOp.getMixedSizes(), currDmaOp.getMixedStrides(),
+        currDmaOp.getBdId(), currDmaOp.getChannel(), nextBdId, startBdId);
+    // Snapshot the users: erasing while walking the use list invalidates it.
+    SmallVector<Operation *> users;
+    for (Operation *user : currDmaOp->getUsers())
+      if (!llvm::is_contained(users, user)) users.push_back(user);
+    for (Operation *user : users) rewriter.eraseOp(user);
+    rewriter.eraseOp(currDmaOp);
+  }
+  // Last DMA op in the chain.
+  AMDAIE::NpuHalfDmaCpyNdOp lastDmaOp = dmaOps.back();
+  Value nextBdId = nullptr;
+  rewriter.setInsertionPointAfter(lastDmaOp);
+  auto lastDmaOpChained = rewriter.create<AMDAIE::NpuHalfDmaCpyNdOp>(
+      lastDmaOp.getLoc(), lastDmaOp.getResultTypes(), lastDmaOp.getConnection(),
+      lastDmaOp.getInput(), lastDmaOp.getMixedOffsets(),
+      lastDmaOp.getMixedSizes(), lastDmaOp.getMixedStrides(),
+      lastDmaOp.getBdId(), lastDmaOp.getChannel(), nextBdId, startBdId);
+  rewriter.replaceOp(lastDmaOp, lastDmaOpChained.getResults());
+  return success();
+}
+
+/// Collects into `chainsToBreak` the chains that must be terminated before
+/// `currBdId` can be recorded for `currDmaChain`.
+///
+/// A chain conflicts when it lives on the same tile as `currDmaChain` and
+/// already holds `currBdId`. Only the first conflicting chain found is
+/// reported. If that chain is not `currDmaChain` itself, `currDmaChain` is
+/// reported as well.
+///
+/// Example (the control code is traversed in reverse, so new BD IDs are
+/// logically prepended):
+/// - Chain X currently holds BD IDs: [4, 5, 6, 7]
+/// - Chain Y currently holds BD IDs: [0, 1, 2, 3]
+/// - BD ID 0 is about to be added to chain X.
+///
+/// BD ID 0 is still owned by chain Y, so Y must be broken to release it, and
+/// X is broken too so that its new BD ID 0 cannot invalidate chain Y.
+/// After the caller has processed both:
+///   - Chain X: [0] (the newly added BD ID).
+///   - Chain Y: [] (emptied after breaking).
+void checkForChainsToBeBroken(
+    uint32_t currBdId, const DmaChain &currDmaChain,
+    const DenseMap<DmaChain, DenseSet<uint32_t>> &dmaChainToBdIds,
+    SmallVector<DmaChain> &chainsToBreak) {
+  auto conflict = llvm::find_if(dmaChainToBdIds, [&](const auto &chainAndIds) {
+    return chainAndIds.first.first == currDmaChain.first &&
+           chainAndIds.second.contains(currBdId);
+  });
+  // No chain on this tile holds the BD ID yet: nothing has to be broken.
+  if (conflict == dmaChainToBdIds.end()) return;
+
+  // Break the chain that contains the duplicate BD ID.
+  chainsToBreak.push_back(conflict->first);
+  // Break the current chain as well, unless it is that very chain.
+  if (conflict->first != currDmaChain) chainsToBreak.push_back(currDmaChain);
+}
+
+/// Traverse the control code in reverse order to create DMA BD chains. Reverse
+/// traversal simplifies handling duplicate BD IDs, preventing the need to
+/// revisit and modify earlier operations after processing later ones.
+/// Only `amdaie.npu.half_dma_cpy_nd` ops on memory space 0, on a non-packet
+/// flow and with a repeat count of one, are chained. Returns failure if an op
+/// is malformed or if rewriting a chain fails.
+LogicalResult insertDmaBdChain(const AMDAIE::AMDAIEDeviceModel &deviceModel,
+                               AMDAIE::ControlCodeOp controlCodeOp) {
+  IRRewriter rewriter(controlCodeOp->getContext());
+
+  // Move all BdIdOps to the beginning of the control code.
+  // This is to avoid dominance issues when chaining BD IDs.
+  SmallVector<Operation *> bdIdOps;
+  WalkResult res = controlCodeOp->walk([&](Operation *op) {
+    if (isa<AMDAIE::BdIdOp>(op)) bdIdOps.push_back(op);
+    return WalkResult::advance();
+  });
+  for (Operation *op : llvm::reverse(bdIdOps)) {
+    op->moveBefore(&controlCodeOp.front());
+  }
+
+  // BD IDs that have been assigned in each tile.
+  DenseMap<DmaChain, DenseSet<uint32_t>> dmaChainToBdIds;
+  // Buffers the DMA ops that will be chained.
+  DenseMap<DmaChain, SmallVector<AMDAIE::NpuHalfDmaCpyNdOp>> dmaChainToDmaOps;
+
+  res = controlCodeOp->walk<WalkOrder::PostOrder,
+                            ReverseIterator>([&](Operation *op) {
+    if (auto npuHalfDmaCpyNdOp = dyn_cast<AMDAIE::NpuHalfDmaCpyNdOp>(op)) {
+      // Not shim, will be erased at ControlcodeLowering, ignore.
+      if (npuHalfDmaCpyNdOp.getMemorySpaceAsUInt() != 0) {
+        return WalkResult::advance();
+      }
+
+      // Get the connection op.
+      std::optional<AMDAIE::ConnectionOp> maybeConnectionOp =
+          npuHalfDmaCpyNdOp.getConnectionOp();
+      if (!maybeConnectionOp) {
+        npuHalfDmaCpyNdOp.emitOpError()
+            << "expected to operate on an `amdaie.connection`";
+        return WalkResult::interrupt();
+      }
+      AMDAIE::ConnectionOp connectionOp = maybeConnectionOp.value();
+
+      // Packet flow, do not chain BDs.
+      std::optional<AMDAIE::FlowOp> maybeFlowOp = connectionOp.getFlowOp();
+      if (!maybeFlowOp) {
+        connectionOp->emitOpError()
+            << "expected to operate on an `amdaie.flow`";
+        return WalkResult::interrupt();
+      }
+      AMDAIE::FlowOp flowOp = maybeFlowOp.value();
+      bool isPacketFlow = flowOp.getIsPacketFlow();
+      if (isPacketFlow) return WalkResult::advance();
+
+      // Repeat count > 1, do not chain BDs.
+      int32_t repeatCount = 1;
+      uint8_t numAddrDim = DmaDimConfig(deviceModel, 0).maxNbDims;
+      SmallVector<OpFoldResult> sizes = npuHalfDmaCpyNdOp.getMixedSizes();
+      SmallVector<OpFoldResult> strides = npuHalfDmaCpyNdOp.getMixedStrides();
+      if (!sizes.empty() && !strides.empty()) {
+        int64_t size = getConstantIndexOrAssert(sizes[0]);
+        int64_t stride = getConstantIndexOrAssert(strides[0]);
+        if (sizes.size() == numAddrDim || stride == 0) {
+          repeatCount = size;
+        }
+      }
+      if (repeatCount > 1) return WalkResult::advance();
+
+      // Get the BD ID and tile op.
+      std::optional<AMDAIE::BdIdOp> maybeBdIdOp = npuHalfDmaCpyNdOp.getBdIdOp();
+      if (!maybeBdIdOp) {
+        npuHalfDmaCpyNdOp.emitOpError() << "must have a BD ID op";
+        return WalkResult::interrupt();
+      }
+      AMDAIE::BdIdOp bdIdOp = maybeBdIdOp.value();
+      uint32_t bdId = getConstantIndexOrAssert(bdIdOp.getValue());
+      AMDAIE::TileOp tileOp =
+          dyn_cast_if_present<AMDAIE::TileOp>(bdIdOp.getTile().getDefiningOp());
+      if (!tileOp) {
+        bdIdOp.emitOpError() << "must operate on an `amdaie.tile`";
+        return WalkResult::interrupt();
+      }
+
+      // Any duplicate BD ID from the same tile indicates that the chain
+      // cannot grow further and requires breaking to release the
+      // conflicting BD ID.
+      SmallVector<DmaChain> chainsToBreak;
+      DmaChain currDmaChain = {tileOp, connectionOp};
+      checkForChainsToBeBroken(bdId, currDmaChain, dmaChainToBdIds,
+                               chainsToBreak);
+
+      // If the chains are not to be continued, update DMA operands using
+      // the `updateChainOperands` function.
+      for (auto &entry : chainsToBreak) {
+        // Since the controlcode is traversed in reverse order, we need to
+        // restore the original order of the DMA operations.
+        std::reverse(dmaChainToDmaOps[entry].begin(),
+                     dmaChainToDmaOps[entry].end());
+        if (failed(updateChainOperands(rewriter, dmaChainToDmaOps[entry])))
+          return WalkResult::interrupt();
+        dmaChainToBdIds[entry].clear();
+        dmaChainToDmaOps[entry].clear();
+      }
+      dmaChainToBdIds[currDmaChain].insert(bdId);
+      dmaChainToDmaOps[currDmaChain].push_back(npuHalfDmaCpyNdOp);
+    }
+    return WalkResult::advance();
+  });
+
+  // Bail out before rewriting the remaining chains if the walk failed.
+  if (res.wasInterrupted()) return failure();
+
+  // Build the remaining chains.
+  for (auto &[entry, _] : dmaChainToBdIds) {
+    // Since the controlcode is traversed in reverse order, we need to
+    // restore the original order of the DMA operations.
+    std::reverse(dmaChainToDmaOps[entry].begin(),
+                 dmaChainToDmaOps[entry].end());
+    if (failed(updateChainOperands(rewriter, dmaChainToDmaOps[entry])))
+      return failure();
+  }
+  return success();
+}
+
+/// Pass that inserts DMA BD chains; the work happens in `runOnOperation`.
+class AMDAIEInsertDmaBdChainPass
+    : public impl::AMDAIEInsertDmaBdChainBase<AMDAIEInsertDmaBdChainPass> {
+ public:
+  AMDAIEInsertDmaBdChainPass() = default;
+  AMDAIEInsertDmaBdChainPass(const AMDAIEInsertDmaBdChainPass &) {}
+  void getDependentDialects(DialectRegistry &registry) const override {
+    registry.insert<AMDAIEDialect>();
+  }
+  void runOnOperation() override;
+};
+
+void AMDAIEInsertDmaBdChainPass::runOnOperation() {
+  Operation *rootOp = getOperation();
+
+  // The device model is derived from the HAL executable target attribute.
+  auto targetAttr = IREE::HAL::ExecutableTargetAttr::lookup(rootOp);
+  std::optional<AMDAIEDevice> device = getConfigAMDAIEDevice(targetAttr);
+  if (!device.has_value()) {
+    rootOp->emitOpError()
+        << "has no AMDAIEDevice in the target attribute configuration. This "
+           "device-specific information is required to lower control code "
+           "ops.";
+    signalPassFailure();
+    return;
+  }
+  AMDAIE::AMDAIEDeviceModel deviceModel = AMDAIE::getDeviceModel(*device);
+
+  // Chain BDs in every workgroup's control code; stop at the first failure.
+  WalkResult status = rootOp->walk([&](AMDAIE::WorkgroupOp workgroupOp) {
+    LogicalResult chained =
+        insertDmaBdChain(deviceModel, workgroupOp.getControlCode());
+    return failed(chained) ? WalkResult::interrupt() : WalkResult::advance();
+  });
+  if (status.wasInterrupted()) signalPassFailure();
+}
+
+}  // namespace
+/// Creates an instance of the pass that inserts DMA BD chains.
+std::unique_ptr<Pass> createAMDAIEInsertDmaBdChainPass() {
+  return std::make_unique<AMDAIEInsertDmaBdChainPass>();
+}
+
+}  // namespace mlir::iree_compiler::AMDAIE
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/CMakeLists.txt b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/CMakeLists.txt
index 81f020f00..012c004c0 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/CMakeLists.txt
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/CMakeLists.txt
@@ -78,6 +78,7 @@ iree_cc_library(
     "AMDAIEHoistForAffineApply.cpp"
     "AMDAIEHoistLogicalObjFifo.cpp"
     "AMDAIEInsertCores.cpp"
+    "AMDAIEInsertDmaBdChain.cpp"
     "AMDAIEInsertInfiniteLoopAroundCoreBlock.cpp"
     "AMDAIEInsertLoopsForVectorization.cpp"
     "AMDAIELinkExecutables.cpp"
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h
index 6cdf14d1b..0fbed8c81 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h
@@ -60,6 +60,7 @@ namespace mlir::iree_compiler::AMDAIE {
 #define GEN_PASS_DEF_AMDAIEHOISTLOGICALOBJFIFO
 #define GEN_PASS_DEF_AMDAIEINSERTAIEWORKGROUP
 #define GEN_PASS_DEF_AMDAIEINSERTCORES
+#define GEN_PASS_DEF_AMDAIEINSERTDMABDCHAIN
 #define GEN_PASS_DEF_AMDAIEINSERTINFINITELOOPAROUNDCOREBLOCK
 #define GEN_PASS_DEF_AMDAIEINSERTLOOPSFORVECTORIZATION
 #define GEN_PASS_DEF_AMDAIELINKEXECUTABLES
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp
index c657cef51..55d2bd4d9 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp
@@ -660,6 +660,7 @@ void addAMDAIEObjectFifoLoweringPasses(
   passManager.addPass(createAMDAIEAssignPacketIdsPass());
 
   passManager.addPass(createAMDAIENpuDmaToHalfDmaCpyNdPass());
+  passManager.addPass(createAMDAIEInsertDmaBdChainPass());
   passManager.addPass(createAMDAIEFoldDmaWaitsPass());
   passManager.addPass(createAMDAIEControlCodeLoweringPass());
   passManager.addPass(createAMDAIEControlCodeToTransactionPass());
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h
index 5fefdf02f..c2ce74ac7 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h
@@ -199,6 +199,9 @@ std::unique_ptr<Pass> createAMDAIEHoistForLoopAffineApplyPass();
 /// operands.
 std::unique_ptr<Pass> createAMDAIEHoistLogicalObjFifoPass();
 
+/// Create a pass that chains DMA BDs by setting `next_bd`/`start_bd` operands.
+std::unique_ptr<Pass> createAMDAIEInsertDmaBdChainPass();
+
 /// Create a pass to transform linalg.generics into a form which benefits later
 /// vectorization passes (to vector and aievec dialects).
 std::unique_ptr<Pass> createAMDAIEInsertLoopsForVectorizationPass(
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td
index f7ac2c8a1..96d80a184 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td
@@ -336,6 +336,13 @@ def AMDAIEInsertCores :
   let constructor = "mlir::iree_compiler::AMDAIE::createAMDAIEInsertCoresPass()";
 }
 
+def AMDAIEInsertDmaBdChain :
+    Pass<"iree-amdaie-insert-dma-bd-chain"> {
+  let summary = "Chain DMA BD IDs by updating next_bd operands.";
+  let constructor = "mlir::iree_compiler::AMDAIE::createAMDAIEInsertDmaBdChainPass()";
+}
+
+
 def AMDAIEInsertInfiniteLoopAroundCoreBlock :
     Pass<"iree-amdaie-insert-infinite-loop-around-core-block", ""> {
   let summary = "Inserts an infinite loop around each `amdaie.core`'s block.";
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt
index 5191f1c50..943aa2dcc 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt
@@ -50,6 +50,7 @@ iree_lit_test_suite(
     "hoist_for_affine_apply.mlir"
     "hoist_logical_obj_fifo.mlir"
     "insert_cores.mlir"
+    "insert_dma_bd_chain.mlir"
     "insert_infinite_loop_around_core_block.mlir"
     "insert_loops_for_vectorization.mlir"
     "localize_logical_objectfifo.mlir"
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/controlcode_lowering.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/controlcode_lowering.mlir
index 6a9bfb85d..26bad8b3b 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/controlcode_lowering.mlir
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/controlcode_lowering.mlir
@@ -128,3 +128,53 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb}
     return
   }
 }
+
+// -----
+
+// CHECK-LABEL: @half_npu_dma_cpy_nd_chain
+// CHECK:       amdaie.controlcode
+#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}>
+#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
+module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} {
+  func.func @half_npu_dma_cpy_nd_chain() {
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %c2 = arith.constant 2 : index
+    amdaie.workgroup {
+      %tile = amdaie.tile(%c0, %c1)
+      %tile_0 = amdaie.tile(%c0, %c0)
+      %buffer = amdaie.buffer(%tile) : memref<2048xi32, 1 : i32>
+      %buffer_1 = amdaie.buffer(%tile) : memref<2048xi32, 1 : i32>
+      %lock = amdaie.lock(%tile(4), 4)
+      %lock_2 = amdaie.lock(%tile(5), 0)
+      %0 = amdaie.logicalobjectfifo.from_buffers({%buffer, %buffer_1}, {%lock}, {%lock_2}) : memref<2048xi32, 1 : i32>, memref<2048xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2048xi32, 1 : i32>, 2>
+      %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<64x32xi32>
+      %2 = amdaie.logicalobjectfifo.placeholder{%tile_0} : !amdaie.logicalobjectfifo<memref<64x32xi32>>
+      %channel = amdaie.channel(%tile_0, 0, port_type = DMA, direction = MM2S)
+      %channel_3 = amdaie.channel(%tile, 0, port_type = DMA, direction = S2MM)
+      %3 = amdaie.flow({%channel} -> {%channel_3}) {is_packet_flow = false}
+      %4 = amdaie.connection(%0 {%channel_3}, %2 {%channel}, flow = %3) {connection_type = #amdaie<connection_type Packet>} : (!amdaie.logicalobjectfifo<memref<2048xi32, 1 : i32>, 2>, !amdaie.logicalobjectfifo<memref<64x32xi32>>)
+      amdaie.controlcode {
+        %5 = amdaie.logicalobjectfifo.from_memref %1, {%tile_0} : memref<64x32xi32> -> !amdaie.logicalobjectfifo<memref<2048xi32>>
+        memref.assume_alignment %1, 64 : memref<64x32xi32>
+        %bd_id = amdaie.bd_id(%tile_0, %c0)
+        %bd_id_1 = amdaie.bd_id(%tile_0, %c1)
+        %bd_id_2 = amdaie.bd_id(%tile_0, %c2)
+// CHECK: amdaie.npu.write_bd {bd_id = 0 : ui32, buffer_length = 0 : ui32, buffer_offset = 0 : ui32, col = 0 : ui32, enable_packet = false, iteration_current = 0 : ui32, iteration_size = 0 : ui32, iteration_stride = 0 : ui32, lock_acq_enable = false, lock_acq_id = 0 : ui32, lock_acq_val = 0 : i32, lock_rel_id = 0 : ui32, lock_rel_val = 0 : i32, next_bd = 1 : ui32, out_of_order_id = 0 : ui32, packet_id = 0 : ui32, packet_type = 0 : ui32, paddings_after = array<i32>, paddings_before = array<i32>, row = 0 : ui32, sizes = array<i32: 0, 0, 0>, strides = array<i32: 0, 0, 0>, use_next_bd = true, valid_bd = true}
+// CHECK: amdaie.npu.address_patch {arg_idx = 0 : ui32, bd_id = 0 : ui32, col = 0 : ui32, offset = 0 : ui32}
+        amdaie.npu.half_dma_cpy_nd %4(%5[] [] [] bd_id = %bd_id channel = %channel next_bd = %bd_id_1 start_bd = %bd_id) : !amdaie.logicalobjectfifo<memref<2048xi32>>
+// CHECK: amdaie.npu.write_bd {bd_id = 1 : ui32, buffer_length = 2048 : ui32, buffer_offset = 0 : ui32, col = 0 : ui32, enable_packet = false, iteration_current = 0 : ui32, iteration_size = 0 : ui32, iteration_stride = 0 : ui32, lock_acq_enable = false, lock_acq_id = 0 : ui32, lock_acq_val = 0 : i32, lock_rel_id = 0 : ui32, lock_rel_val = 0 : i32, next_bd = 2 : ui32, out_of_order_id = 0 : ui32, packet_id = 0 : ui32, packet_type = 0 : ui32, paddings_after = array<i32>, paddings_before = array<i32>, row = 0 : ui32, sizes = array<i32: 0, 0, 2048>, strides = array<i32: 0, 0, 1>, use_next_bd = true, valid_bd = true}
+// CHECK: amdaie.npu.address_patch {arg_idx = 0 : ui32, bd_id = 1 : ui32, col = 0 : ui32, offset = 0 : ui32}
+        amdaie.npu.half_dma_cpy_nd async %4(%5[0] [2048] [1] bd_id = %bd_id_1 channel = %channel next_bd = %bd_id_2 start_bd = %bd_id) : !amdaie.logicalobjectfifo<memref<2048xi32>>
+// CHECK: amdaie.npu.write_bd {bd_id = 2 : ui32, buffer_length = 1024 : ui32, buffer_offset = 0 : ui32, col = 0 : ui32, enable_packet = false, iteration_current = 0 : ui32, iteration_size = 0 : ui32, iteration_stride = 0 : ui32, lock_acq_enable = false, lock_acq_id = 0 : ui32, lock_acq_val = 0 : i32, lock_rel_id = 0 : ui32, lock_rel_val = 0 : i32, next_bd = 0 : ui32, out_of_order_id = 0 : ui32, packet_id = 0 : ui32, packet_type = 0 : ui32, paddings_after = array<i32>, paddings_before = array<i32>, row = 0 : ui32, sizes = array<i32: 4, 16, 16>, strides = array<i32: 64, 8, 1>, use_next_bd = false, valid_bd = true}
+// CHECK: amdaie.npu.address_patch {arg_idx = 0 : ui32, bd_id = 2 : ui32, col = 0 : ui32, offset = 0 : ui32}
+// CHECK: %[[TOKEN_0:.+]] = amdaie.npu.push_to_queue async {bd_id = 0 : ui32, channel = 0 : ui32, col = 0 : ui32, direction = 1 : i32, repeat_count = 2 : ui32, row = 0 : ui32}
+// CHECK: amdaie.npu.dma_wait(%[[TOKEN_0]] : !amdaie.async_token)
+        %6 = amdaie.npu.half_dma_cpy_nd async %4(%5[0, 0, 0, 0] [2, 4, 16, 16] [0, 64, 8, 1] bd_id = %bd_id_2 channel = %channel start_bd = %bd_id) : !amdaie.logicalobjectfifo<memref<2048xi32>>
+        amdaie.npu.dma_wait(%6 : !amdaie.async_token)
+        amdaie.end
+      }
+    }
+    return
+  }
+}
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/insert_dma_bd_chain.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/insert_dma_bd_chain.mlir
new file mode 100644
index 000000000..b3e85ab1f
--- /dev/null
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/insert_dma_bd_chain.mlir
@@ -0,0 +1,284 @@
+// RUN: iree-opt --pass-pipeline="builtin.module(iree-amdaie-insert-dma-bd-chain)" --split-input-file --verify-diagnostics %s | FileCheck %s
+
+// Expect a single DMA BD chain, containing the IDs: [0, 1].
+// CHECK-LABEL: @single_bd_chain
+// CHECK:       %[[CHANNEL:.+]] = amdaie.channel
+// CHECK:       %[[CONNECTION:.+]] = amdaie.connection
+// CHECK:       amdaie.controlcode
+// CHECK:         %[[BD_ID_0:.+]] = amdaie.bd_id
+// CHECK:         %[[BD_ID_1:.+]] = amdaie.bd_id
+// CHECK:         %[[OBJECT_FIFO:.+]] = amdaie.logicalobjectfifo.from_memref
+// CHECK:         amdaie.npu.half_dma_cpy_nd  %[[CONNECTION]](%[[OBJECT_FIFO]] [] [] [] bd_id = %[[BD_ID_0]] channel = %[[CHANNEL]] next_bd = %[[BD_ID_1]] start_bd = %[[BD_ID_0]])
+// CHECK:         %[[TOKEN_0:.+]] = amdaie.npu.half_dma_cpy_nd async %[[CONNECTION]](%[[OBJECT_FIFO]] [] [] [] bd_id = %[[BD_ID_1]] channel = %[[CHANNEL]] start_bd = %[[BD_ID_0]])
+// CHECK:         amdaie.npu.dma_wait(%[[TOKEN_0]] : !amdaie.async_token)
+#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}>
+#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
+module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} {
+  func.func @single_bd_chain() {
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    amdaie.workgroup {
+      %tile = amdaie.tile(%c0, %c0)
+      %tile_0 = amdaie.tile(%c0, %c1)
+      %buffer = amdaie.buffer(%tile_0) : memref<1024xbf16, 1 : i32>
+      %buffer_2 = amdaie.buffer(%tile_0) : memref<1024xbf16, 1 : i32>
+      %lock = amdaie.lock(%tile_0(0), 0)
+      %lock_3 = amdaie.lock(%tile_0(1), 0)
+      %channel = amdaie.channel(%tile, 0, port_type = DMA, direction = MM2S)
+      %channel_4 = amdaie.channel(%tile_0, 0, port_type = DMA, direction = S2MM)
+      %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<512x512xbf16>
+      %1 = amdaie.logicalobjectfifo.from_buffers({%buffer, %buffer_2}, {%lock}, {%lock_3}) : memref<1024xbf16, 1 : i32>, memref<1024xbf16, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1024xbf16, 1 : i32>, 2>
+      %2 = amdaie.logicalobjectfifo.placeholder{%tile} : !amdaie.logicalobjectfifo<memref<512x512xbf16>>
+      %3 = amdaie.flow({%channel} -> {%channel_4}) {is_packet_flow = false}
+      %4 = amdaie.connection(%1 {%channel_4}, %2 {%channel}, flow = %3) {connection_type = #amdaie<connection_type Circuit>} : (!amdaie.logicalobjectfifo<memref<1024xbf16, 1 : i32>, 2>, !amdaie.logicalobjectfifo<memref<512x512xbf16>>)
+      amdaie.controlcode {
+        memref.assume_alignment %0, 64 : memref<512x512xbf16>
+        %5 = amdaie.logicalobjectfifo.from_memref %0, {%tile} : memref<512x512xbf16> -> !amdaie.logicalobjectfifo<memref<262144xbf16>>
+        %bd_id = amdaie.bd_id(%tile, %c0)
+        %6 = amdaie.npu.half_dma_cpy_nd async %4(%5 [] [] [] bd_id = %bd_id channel = %channel start_bd = %bd_id) : !amdaie.logicalobjectfifo<memref<262144xbf16>>
+        amdaie.npu.dma_wait(%6 : !amdaie.async_token)
+        %bd_id_1 = amdaie.bd_id(%tile, %c1)
+        %7 = amdaie.npu.half_dma_cpy_nd async %4(%5 [] [] [] bd_id = %bd_id_1 channel = %channel start_bd = %bd_id_1) : !amdaie.logicalobjectfifo<memref<262144xbf16>>
+        amdaie.npu.dma_wait(%7 : !amdaie.async_token)
+        amdaie.end
+      }
+    }
+    return
+  }
+}
+
+// -----
+
+// Expect that no chaining happens when repeat_count > 1.
+// CHECK-LABEL: @no_bd_chain_repeat_count
+// CHECK:       %[[CHANNEL:.+]] = amdaie.channel
+// CHECK:       %[[CONNECTION:.+]] = amdaie.connection
+// CHECK:       amdaie.controlcode
+// CHECK:         %[[BD_ID_0:.+]] = amdaie.bd_id
+// CHECK:         %[[BD_ID_1:.+]] = amdaie.bd_id
+// CHECK:         %[[OBJECT_FIFO:.+]] = amdaie.logicalobjectfifo.from_memref
+// CHECK:         %[[TOKEN_0:.+]] = amdaie.npu.half_dma_cpy_nd async %[[CONNECTION]](%[[OBJECT_FIFO]] [0, 0] [2, 1] [0, 1] bd_id = %[[BD_ID_0]] channel = %[[CHANNEL]] start_bd = %[[BD_ID_0]])
+// CHECK:         amdaie.npu.dma_wait(%[[TOKEN_0]] : !amdaie.async_token)
+// CHECK:         %[[TOKEN_1:.+]] = amdaie.npu.half_dma_cpy_nd async %[[CONNECTION]](%[[OBJECT_FIFO]] [0, 0] [2, 1] [0, 1] bd_id = %[[BD_ID_1]] channel = %[[CHANNEL]] start_bd = %[[BD_ID_1]])
+// CHECK:         amdaie.npu.dma_wait(%[[TOKEN_1]] : !amdaie.async_token)
+#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}>
+#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
+module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} {
+  func.func @no_bd_chain_repeat_count() {
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    amdaie.workgroup {
+      %tile = amdaie.tile(%c0, %c0)
+      %tile_0 = amdaie.tile(%c0, %c1)
+      %buffer = amdaie.buffer(%tile_0) : memref<1024xbf16, 1 : i32>
+      %buffer_2 = amdaie.buffer(%tile_0) : memref<1024xbf16, 1 : i32>
+      %lock = amdaie.lock(%tile_0(0), 0)
+      %lock_3 = amdaie.lock(%tile_0(1), 0)
+      %channel = amdaie.channel(%tile, 0, port_type = DMA, direction = MM2S)
+      %channel_4 = amdaie.channel(%tile_0, 0, port_type = DMA, direction = S2MM)
+      %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<512x512xbf16>
+      %1 = amdaie.logicalobjectfifo.from_buffers({%buffer, %buffer_2}, {%lock}, {%lock_3}) : memref<1024xbf16, 1 : i32>, memref<1024xbf16, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1024xbf16, 1 : i32>, 2>
+      %2 = amdaie.logicalobjectfifo.placeholder{%tile} : !amdaie.logicalobjectfifo<memref<512x512xbf16>>
+      %3 = amdaie.flow({%channel} -> {%channel_4}) {is_packet_flow = false}
+      %4 = amdaie.connection(%1 {%channel_4}, %2 {%channel}, flow = %3) {connection_type = #amdaie<connection_type Circuit>} : (!amdaie.logicalobjectfifo<memref<1024xbf16, 1 : i32>, 2>, !amdaie.logicalobjectfifo<memref<512x512xbf16>>)
+      amdaie.controlcode {
+        memref.assume_alignment %0, 64 : memref<512x512xbf16>
+        %5 = amdaie.logicalobjectfifo.from_memref %0, {%tile} : memref<512x512xbf16> -> !amdaie.logicalobjectfifo<memref<262144xbf16>>
+        %bd_id = amdaie.bd_id(%tile, %c0)
+        %6 = amdaie.npu.half_dma_cpy_nd async %4(%5 [0, 0] [2, 1] [0, 1] bd_id = %bd_id channel = %channel start_bd = %bd_id) : !amdaie.logicalobjectfifo<memref<262144xbf16>>
+        amdaie.npu.dma_wait(%6 : !amdaie.async_token)
+        %bd_id_1 = amdaie.bd_id(%tile, %c1)
+        %7 = amdaie.npu.half_dma_cpy_nd async %4(%5 [0, 0] [2, 1] [0, 1] bd_id = %bd_id_1 channel = %channel start_bd = %bd_id_1) : !amdaie.logicalobjectfifo<memref<262144xbf16>>
+        amdaie.npu.dma_wait(%7 : !amdaie.async_token)
+        amdaie.end
+      }
+    }
+    return
+  }
+}
+
+// -----
+
+// Expect an error: the controlcode loop is not unrolled, so the second DMA op lives in a different scope than the first.
+#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}>
+#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
+module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} {
+  func.func @error_different_scopes() {
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %c8 = arith.constant 8 : index
+    amdaie.workgroup {
+      %tile = amdaie.tile(%c0, %c0)
+      %tile_0 = amdaie.tile(%c0, %c1)
+      %buffer = amdaie.buffer(%tile_0) : memref<1024xbf16, 1 : i32>
+      %buffer_2 = amdaie.buffer(%tile_0) : memref<1024xbf16, 1 : i32>
+      %lock = amdaie.lock(%tile_0(0), 0)
+      %lock_3 = amdaie.lock(%tile_0(1), 0)
+      %channel = amdaie.channel(%tile, 0, port_type = DMA, direction = MM2S)
+      %channel_4 = amdaie.channel(%tile_0, 0, port_type = DMA, direction = S2MM)
+      %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<512x512xbf16>
+      %1 = amdaie.logicalobjectfifo.from_buffers({%buffer, %buffer_2}, {%lock}, {%lock_3}) : memref<1024xbf16, 1 : i32>, memref<1024xbf16, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1024xbf16, 1 : i32>, 2>
+      %2 = amdaie.logicalobjectfifo.placeholder{%tile} : !amdaie.logicalobjectfifo<memref<512x512xbf16>>
+      %3 = amdaie.flow({%channel} -> {%channel_4}) {is_packet_flow = false}
+      %4 = amdaie.connection(%1 {%channel_4}, %2 {%channel}, flow = %3) {connection_type = #amdaie<connection_type Circuit>} : (!amdaie.logicalobjectfifo<memref<1024xbf16, 1 : i32>, 2>, !amdaie.logicalobjectfifo<memref<512x512xbf16>>)
+      amdaie.controlcode {
+        memref.assume_alignment %0, 64 : memref<512x512xbf16>
+        %5 = amdaie.logicalobjectfifo.from_memref %0, {%tile} : memref<512x512xbf16> -> !amdaie.logicalobjectfifo<memref<262144xbf16>>
+        %bd_id = amdaie.bd_id(%tile, %c0)
+        %6 = amdaie.npu.half_dma_cpy_nd async %4(%5 [] [] [] bd_id = %bd_id channel = %channel start_bd = %bd_id) : !amdaie.logicalobjectfifo<memref<262144xbf16>>
+        amdaie.npu.dma_wait(%6 : !amdaie.async_token)
+        scf.for %i = %c0 to %c1 step %c8 {
+          %bd_id_1 = amdaie.bd_id(%tile, %c1)
+          // expected-error @+1 {{DMA operations to be chained must belong to the same scope}}
+          %7 = amdaie.npu.half_dma_cpy_nd async %4(%5 [] [] [] bd_id = %bd_id_1 channel = %channel start_bd = %bd_id_1) : !amdaie.logicalobjectfifo<memref<262144xbf16>>
+          amdaie.npu.dma_wait(%7 : !amdaie.async_token)
+        }
+        amdaie.end
+      }
+    }
+    return
+  }
+}
+
+// -----
+
+// Expect two BD ID chains, as the chain breaks whenever a duplicate BD ID occurs.
+// The first chain: [0, 1, 2]. The second chain: [1, 2].
+// CHECK-LABEL: @duplicate_bd_id
+// CHECK:       %[[CHANNEL:.+]] = amdaie.channel
+// CHECK:       %[[CONNECTION:.+]] = amdaie.connection
+// CHECK:       amdaie.controlcode
+// CHECK:         %[[BD_ID_0:.+]] = amdaie.bd_id
+// CHECK:         %[[BD_ID_1:.+]] = amdaie.bd_id
+// CHECK:         %[[BD_ID_2:.+]] = amdaie.bd_id
+// CHECK:         %[[OBJECT_FIFO:.+]] = amdaie.logicalobjectfifo.from_memref
+// CHECK:         amdaie.npu.half_dma_cpy_nd  %[[CONNECTION]](%[[OBJECT_FIFO]] [] [] [] bd_id = %[[BD_ID_0]] channel = %[[CHANNEL]]  next_bd = %[[BD_ID_1]] start_bd = %[[BD_ID_0]])
+// CHECK:         amdaie.npu.half_dma_cpy_nd  %[[CONNECTION]](%[[OBJECT_FIFO]] [] [] [] bd_id = %[[BD_ID_1]] channel = %[[CHANNEL]]  next_bd = %[[BD_ID_2]] start_bd = %[[BD_ID_0]])
+// CHECK:         %[[TOKEN_0:.+]] = amdaie.npu.half_dma_cpy_nd async %[[CONNECTION]](%[[OBJECT_FIFO]] [] [] [] bd_id = %[[BD_ID_2]] channel = %[[CHANNEL]] start_bd = %[[BD_ID_0]])
+// CHECK:         amdaie.npu.dma_wait(%[[TOKEN_0]] : !amdaie.async_token)
+// CHECK:         amdaie.npu.half_dma_cpy_nd  %[[CONNECTION]](%[[OBJECT_FIFO]] [] [] [] bd_id = %[[BD_ID_1]] channel = %[[CHANNEL]]  next_bd = %[[BD_ID_2]] start_bd = %[[BD_ID_1]])
+// CHECK:         %[[TOKEN_1:.+]] = amdaie.npu.half_dma_cpy_nd async %[[CONNECTION]](%[[OBJECT_FIFO]] [] [] [] bd_id = %[[BD_ID_2]] channel = %[[CHANNEL]] start_bd = %[[BD_ID_1]])
+// CHECK:         amdaie.npu.dma_wait(%[[TOKEN_1]] : !amdaie.async_token)
+#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}>
+#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
+module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} {
+  func.func @duplicate_bd_id() {
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %c2 = arith.constant 2 : index
+    amdaie.workgroup {
+      %tile = amdaie.tile(%c0, %c0)
+      %tile_0 = amdaie.tile(%c0, %c1)
+      %buffer = amdaie.buffer(%tile_0) : memref<1024xbf16, 1 : i32>
+      %buffer_2 = amdaie.buffer(%tile_0) : memref<1024xbf16, 1 : i32>
+      %lock = amdaie.lock(%tile_0(0), 0)
+      %lock_3 = amdaie.lock(%tile_0(1), 0)
+      %channel = amdaie.channel(%tile, 0, port_type = DMA, direction = MM2S)
+      %channel_4 = amdaie.channel(%tile_0, 0, port_type = DMA, direction = S2MM)
+      %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<512x512xbf16>
+      %1 = amdaie.logicalobjectfifo.from_buffers({%buffer, %buffer_2}, {%lock}, {%lock_3}) : memref<1024xbf16, 1 : i32>, memref<1024xbf16, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1024xbf16, 1 : i32>, 2>
+      %2 = amdaie.logicalobjectfifo.placeholder{%tile} : !amdaie.logicalobjectfifo<memref<512x512xbf16>>
+      %3 = amdaie.flow({%channel} -> {%channel_4}) {is_packet_flow = false}
+      %4 = amdaie.connection(%1 {%channel_4}, %2 {%channel}, flow = %3) {connection_type = #amdaie<connection_type Circuit>} : (!amdaie.logicalobjectfifo<memref<1024xbf16, 1 : i32>, 2>, !amdaie.logicalobjectfifo<memref<512x512xbf16>>)
+      amdaie.controlcode {
+        memref.assume_alignment %0, 64 : memref<512x512xbf16>
+        %5 = amdaie.logicalobjectfifo.from_memref %0, {%tile} : memref<512x512xbf16> -> !amdaie.logicalobjectfifo<memref<262144xbf16>>
+        %bd_id = amdaie.bd_id(%tile, %c0)
+        %6 = amdaie.npu.half_dma_cpy_nd async %4(%5 [] [] [] bd_id = %bd_id channel = %channel start_bd = %bd_id) : !amdaie.logicalobjectfifo<memref<262144xbf16>>
+        amdaie.npu.dma_wait(%6 : !amdaie.async_token)
+        %bd_id_1 = amdaie.bd_id(%tile, %c1)
+        %7 = amdaie.npu.half_dma_cpy_nd async %4(%5 [] [] [] bd_id = %bd_id_1 channel = %channel start_bd = %bd_id_1) : !amdaie.logicalobjectfifo<memref<262144xbf16>>
+        amdaie.npu.dma_wait(%7 : !amdaie.async_token)
+        %bd_id_2 = amdaie.bd_id(%tile, %c2)
+        %8 = amdaie.npu.half_dma_cpy_nd async %4(%5 [] [] [] bd_id = %bd_id_2 channel = %channel start_bd = %bd_id_2) : !amdaie.logicalobjectfifo<memref<262144xbf16>>
+        amdaie.npu.dma_wait(%8 : !amdaie.async_token)
+        %9 = amdaie.npu.half_dma_cpy_nd async %4(%5 [] [] [] bd_id = %bd_id_1 channel = %channel start_bd = %bd_id_1) : !amdaie.logicalobjectfifo<memref<262144xbf16>>
+        amdaie.npu.dma_wait(%9 : !amdaie.async_token)
+        %10 = amdaie.npu.half_dma_cpy_nd async %4(%5 [] [] [] bd_id = %bd_id_2 channel = %channel start_bd = %bd_id_2) : !amdaie.logicalobjectfifo<memref<262144xbf16>>
+        amdaie.npu.dma_wait(%10 : !amdaie.async_token)
+        amdaie.end
+      }
+    }
+    return
+  }
+}
+
+// -----
+
+// Expect two DMA BD chains interleaved, as they belong to different connections.
+// One chain contains the IDs: [0, 2], the other chain contains: [1, 3].
+// CHECK-LABEL: @two_connections
+// CHECK:       %[[CHANNEL_0:.+]] = amdaie.channel
+// CHECK:       %[[CHANNEL_1:.+]] = amdaie.channel
+// CHECK:       %[[CHANNEL_2:.+]] = amdaie.channel
+// CHECK:       %[[CHANNEL_3:.+]] = amdaie.channel
+// CHECK:       %[[CONNECTION_0:.+]] = amdaie.connection
+// CHECK:       %[[CONNECTION_1:.+]] = amdaie.connection
+// CHECK:       amdaie.controlcode
+// CHECK:         %[[BD_ID_0:.+]] = amdaie.bd_id
+// CHECK:         %[[BD_ID_1:.+]] = amdaie.bd_id
+// CHECK:         %[[BD_ID_2:.+]] = amdaie.bd_id
+// CHECK:         %[[BD_ID_3:.+]] = amdaie.bd_id
+// CHECK:         %[[OBJECT_FIFO_0:.+]] = amdaie.logicalobjectfifo.from_memref
+// CHECK:         %[[OBJECT_FIFO_1:.+]] = amdaie.logicalobjectfifo.from_memref
+// CHECK:         amdaie.npu.half_dma_cpy_nd  %[[CONNECTION_0]](%[[OBJECT_FIFO_0]] [] [] [] bd_id = %[[BD_ID_0]] channel = %[[CHANNEL_0]] next_bd = %[[BD_ID_2]] start_bd = %[[BD_ID_0]])
+// CHECK:         amdaie.npu.half_dma_cpy_nd  %[[CONNECTION_1]](%[[OBJECT_FIFO_1]] [] [] [] bd_id = %[[BD_ID_1]] channel = %[[CHANNEL_2]] next_bd = %[[BD_ID_3]] start_bd = %[[BD_ID_1]])
+// CHECK:         %[[TOKEN_0:.+]] = amdaie.npu.half_dma_cpy_nd async %[[CONNECTION_0]](%[[OBJECT_FIFO_0]] [] [] [] bd_id = %[[BD_ID_2]] channel = %[[CHANNEL_0]] start_bd = %[[BD_ID_0]])
+// CHECK:         %[[TOKEN_1:.+]] = amdaie.npu.half_dma_cpy_nd async %[[CONNECTION_1]](%[[OBJECT_FIFO_1]] [] [] [] bd_id = %[[BD_ID_3]] channel = %[[CHANNEL_2]] start_bd = %[[BD_ID_1]])
+// CHECK:         amdaie.npu.dma_wait(%[[TOKEN_0]] : !amdaie.async_token)
+// CHECK:         amdaie.npu.dma_wait(%[[TOKEN_1]] : !amdaie.async_token)
+#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}>
+#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
+module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} {
+  func.func @two_connections() {
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %c2 = arith.constant 2 : index
+    %c3 = arith.constant 3 : index
+    amdaie.workgroup {
+      %tile = amdaie.tile(%c0, %c0)
+      %tile_0 = amdaie.tile(%c0, %c1)
+      %buffer = amdaie.buffer(%tile_0) : memref<1024xbf16, 1 : i32>
+      %buffer_4 = amdaie.buffer(%tile_0) : memref<1024xbf16, 1 : i32>
+      %buffer_5 = amdaie.buffer(%tile_0) : memref<1024xbf16, 1 : i32>
+      %buffer_6 = amdaie.buffer(%tile_0) : memref<1024xbf16, 1 : i32>
+      %lock = amdaie.lock(%tile_0(0), 0)
+      %lock_7 = amdaie.lock(%tile_0(1), 0)
+      %lock_8 = amdaie.lock(%tile_0(2), 0)
+      %lock_9 = amdaie.lock(%tile_0(3), 0)
+      %channel = amdaie.channel(%tile, 0, port_type = DMA, direction = MM2S)
+      %channel_10 = amdaie.channel(%tile_0, 0, port_type = DMA, direction = S2MM)
+      %channel_11 = amdaie.channel(%tile, 1, port_type = DMA, direction = MM2S)
+      %channel_12 = amdaie.channel(%tile_0, 1, port_type = DMA, direction = S2MM)
+      %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<512x512xbf16>
+      %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<512x512xbf16>
+      %2 = amdaie.logicalobjectfifo.from_buffers({%buffer, %buffer_4}, {%lock}, {%lock_7}) : memref<1024xbf16, 1 : i32>, memref<1024xbf16, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1024xbf16, 1 : i32>, 2>
+      %3 = amdaie.logicalobjectfifo.placeholder{%tile} : !amdaie.logicalobjectfifo<memref<512x512xbf16>>
+      %4 = amdaie.flow({%channel} -> {%channel_10}) {is_packet_flow = false}
+      %5 = amdaie.connection(%2 {%channel_10}, %3 {%channel}, flow = %4) {connection_type = #amdaie<connection_type Circuit>} : (!amdaie.logicalobjectfifo<memref<1024xbf16, 1 : i32>, 2>, !amdaie.logicalobjectfifo<memref<512x512xbf16>>)
+      %6 = amdaie.logicalobjectfifo.from_buffers({%buffer_5, %buffer_6}, {%lock_8}, {%lock_9}) : memref<1024xbf16, 1 : i32>, memref<1024xbf16, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1024xbf16, 1 : i32>, 2>
+      %7 = amdaie.logicalobjectfifo.placeholder{%tile} : !amdaie.logicalobjectfifo<memref<512x512xbf16>>
+      %8 = amdaie.flow({%channel_11} -> {%channel_12}) {is_packet_flow = false}
+      %9 = amdaie.connection(%6 {%channel_12}, %7 {%channel_11}, flow = %8) {connection_type = #amdaie<connection_type Circuit>} : (!amdaie.logicalobjectfifo<memref<1024xbf16, 1 : i32>, 2>, !amdaie.logicalobjectfifo<memref<512x512xbf16>>) // target takes the S2MM channel on %tile_0, source the MM2S channel on %tile, matching flow %8 and connection %5
+      amdaie.controlcode {
+        memref.assume_alignment %0, 64 : memref<512x512xbf16>
+        %10 = amdaie.logicalobjectfifo.from_memref %0, {%tile} : memref<512x512xbf16> -> !amdaie.logicalobjectfifo<memref<262144xbf16>>
+        %11 = amdaie.logicalobjectfifo.from_memref %1, {%tile} : memref<512x512xbf16> -> !amdaie.logicalobjectfifo<memref<262144xbf16>>
+        %bd_id = amdaie.bd_id(%tile, %c0)
+        %12 = amdaie.npu.half_dma_cpy_nd async %5(%10 [] [] [] bd_id = %bd_id channel = %channel start_bd = %bd_id) : !amdaie.logicalobjectfifo<memref<262144xbf16>> // connection %5, BD 0: expected head of chain [0, 2]
+        %bd_id_1 = amdaie.bd_id(%tile, %c1)
+        %13 = amdaie.npu.half_dma_cpy_nd async %9(%11 [] [] [] bd_id = %bd_id_1 channel = %channel_11 start_bd = %bd_id_1) : !amdaie.logicalobjectfifo<memref<262144xbf16>> // connection %9, BD 1: expected head of chain [1, 3]
+        amdaie.npu.dma_wait(%12 : !amdaie.async_token)
+        amdaie.npu.dma_wait(%13 : !amdaie.async_token)
+        %bd_id_2 = amdaie.bd_id(%tile, %c2)
+        %14 = amdaie.npu.half_dma_cpy_nd async %5(%10 [] [] [] bd_id = %bd_id_2 channel = %channel start_bd = %bd_id_2) : !amdaie.logicalobjectfifo<memref<262144xbf16>> // connection %5, BD 2: expected tail of chain [0, 2]
+        %bd_id_3 = amdaie.bd_id(%tile, %c3)
+        %15 = amdaie.npu.half_dma_cpy_nd async %9(%11 [] [] [] bd_id = %bd_id_3 channel = %channel_11 start_bd = %bd_id_3) : !amdaie.logicalobjectfifo<memref<262144xbf16>> // connection %9, BD 3: expected tail of chain [1, 3]
+        amdaie.npu.dma_wait(%14 : !amdaie.async_token)
+        amdaie.npu.dma_wait(%15 : !amdaie.async_token)
+        amdaie.end
+      }
+    }
+    return
+  }
+}