From 71e17ed31f0182801894d351a6ad5f112be8efc9 Mon Sep 17 00:00:00 2001
From: Zhewen Yu <zhewenyu@amd.com>
Date: Wed, 11 Dec 2024 11:54:11 +0000
Subject: [PATCH] Move AMDAIEAssignChannelsPass before
 AMDAIEAssignNpuDmaBdIdsPass (#980)

This PR is to resolve the following comment:


https://github.com/nod-ai/iree-amd-aie/blob/e623a6a36e7b92b46501f7f1c952b114da990a6a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEAssignNpuDmaBdIds.cpp#L269-L274

- passes are now reordered
- add a utility function to retrieve `ChannelOp` from given
`NpuDmaCpyNdOp`
- unit test `assign_npu_dma_bd_ids.mlir` is refactored
---
 .../AMD-AIE/iree-amd-aie/IR/AMDAIEOps.cpp     |  22 ++
 .../AMD-AIE/iree-amd-aie/IR/AMDAIEOps.td      |   4 +
 .../Transforms/AMDAIEAssignNpuDmaBdIds.cpp    |  43 +--
 .../iree-amd-aie/Transforms/Passes.cpp        |   8 +-
 .../test/assign_npu_dma_bd_ids.mlir           | 313 ++++++++++--------
 5 files changed, 225 insertions(+), 165 deletions(-)
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.cpp
index d98607dfa..72ff124af 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.cpp
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.cpp
@@ -1066,6 +1066,46 @@ bool NpuDmaCpyNdOp::hasDmaWaitOpUser() {
                       [](auto userOp) { return isa<NpuDmaWaitOp>(userOp); });
 }
 
+/// Returns the `amdaie.channel` op that defines the single source channel of
+/// the `amdaie.connection` op this DMA operates on. Emits an op error and
+/// returns failure if there is no connection op, if the connection does not
+/// have exactly one source channel, or if that channel is not defined by an
+/// `amdaie.channel` op.
+FailureOr<AMDAIE::ChannelOp> NpuDmaCpyNdOp::getSourceChannelOp() {
+  AMDAIE::ConnectionOp connectionOp = getConnectionOp();
+  if (!connectionOp)
+    return emitOpError() << "should operate on an `amdaie.connection` op";
+  if (connectionOp.getSourceChannels().size() != 1)
+    return emitOpError() << "expected a single source channel";
+  // `getDefiningOp()` is null for block arguments, so use the null-tolerant
+  // cast and report a failure instead of handing back a null `ChannelOp`.
+  auto sourceChannelOp = dyn_cast_if_present<AMDAIE::ChannelOp>(
+      connectionOp.getSourceChannels()[0].getDefiningOp());
+  if (!sourceChannelOp)
+    return emitOpError() << "expected an `amdaie.channel` op as source channel";
+  return sourceChannelOp;
+}
+
+/// Returns the `amdaie.channel` op that defines the single target channel of
+/// the `amdaie.connection` op this DMA operates on. Emits an op error and
+/// returns failure if there is no connection op, if the connection does not
+/// have exactly one target channel, or if that channel is not defined by an
+/// `amdaie.channel` op.
+FailureOr<AMDAIE::ChannelOp> NpuDmaCpyNdOp::getTargetChannelOp() {
+  AMDAIE::ConnectionOp connectionOp = getConnectionOp();
+  if (!connectionOp)
+    return emitOpError() << "should operate on an `amdaie.connection` op";
+  if (connectionOp.getTargetChannels().size() != 1)
+    return emitOpError() << "expected a single target channel";
+  // `getDefiningOp()` is null for block arguments, so use the null-tolerant
+  // cast and report a failure instead of handing back a null `ChannelOp`.
+  auto targetChannelOp = dyn_cast_if_present<AMDAIE::ChannelOp>(
+      connectionOp.getTargetChannels()[0].getDefiningOp());
+  if (!targetChannelOp)
+    return emitOpError() << "expected an `amdaie.channel` op as target channel";
+  return targetChannelOp;
+}
+
 namespace {
 struct NpuDmaCpyNdOpReplacementBuilder {
   static void replace(NpuDmaCpyNdOp dmaOp, PatternRewriter &rewriter,
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.td b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.td
index fa45c468e..36985c8b9 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.td
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.td
@@ -558,6 +558,14 @@ def AMDAIE_NpuDmaCpyNdOp: AMDAIE_Op<"npu.dma_cpy_nd", [
       if (!bdIdValue) return nullptr;
       return dyn_cast_if_present<BdIdOp>(bdIdValue.getDefiningOp());
     }
+
+    // Returns the channel op of the connection's single source channel. Emits
+    // an error and returns failure when that channel cannot be resolved (e.g.
+    // no connection op, or not exactly one source channel).
+    FailureOr<AMDAIE::ChannelOp> getSourceChannelOp();
+
+    // Returns the channel op of the connection's single target channel. Emits
+    // an error and returns failure when that channel cannot be resolved (e.g.
+    // no connection op, or not exactly one target channel).
+    FailureOr<AMDAIE::ChannelOp> getTargetChannelOp();
 
     // A utility to create a new doubly strided operation from this one with a
     // new set of source and target offsets, sizes and strides.
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEAssignNpuDmaBdIds.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEAssignNpuDmaBdIds.cpp
index 872eceac2..4c751d6d6 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEAssignNpuDmaBdIds.cpp
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEAssignNpuDmaBdIds.cpp
@@ -146,13 +146,27 @@ template <CopyOpOperateOn OperateOn>
 FailureOr<AMDAIE::BdIdOp> getBdIdOp(
     IRRewriter &rewriter, AMDAIE::NpuDmaCpyNdOp &npuDmaOp,
     DenseMap<Value, ChannelBdIdGenerator> &shimTileToGeneratorMap,
-    DenseMap<AMDAIE::BdIdOp, SmallVector<uint32_t>> &bdIdOpToBdIdsMap,
-    uint32_t channel) {
-  FailureOr<AMDAIE::TileOp> tileOp =
+    DenseMap<AMDAIE::BdIdOp, SmallVector<uint32_t>> &bdIdOpToBdIdsMap) {
+  // Get the TileOp.
+  FailureOr<AMDAIE::TileOp> maybeTileOp =
       getGeneratorTileOp<OperateOn>(npuDmaOp, shimTileToGeneratorMap);
-  if (failed(tileOp)) return failure();
+  if (failed(maybeTileOp)) return failure();
+  AMDAIE::TileOp tileOp = maybeTileOp.value();
+
+  // Get the channel.
+  FailureOr<AMDAIE::ChannelOp> maybeChannelOp;
+  if constexpr (OperateOn == CopyOpOperateOn::Source) {
+    maybeChannelOp = npuDmaOp.getSourceChannelOp();
+  } else if constexpr (OperateOn == CopyOpOperateOn::Target) {
+    maybeChannelOp = npuDmaOp.getTargetChannelOp();
+  } else {
+    return npuDmaOp.emitOpError()
+           << "Function can only operate on Source or Target";
+  }
+  if (failed(maybeChannelOp)) return failure();
+  uint32_t channel = maybeChannelOp.value().getValue();
 
-  ChannelBdIdGenerator &generator = shimTileToGeneratorMap[tileOp->getResult()];
+  ChannelBdIdGenerator &generator = shimTileToGeneratorMap[tileOp.getResult()];
   rewriter.setInsertionPoint(npuDmaOp);
   if (scf::ForOp loop = npuDmaOp->getParentOfType<scf::ForOp>();
       loop && getNumberIterations(loop)) {
@@ -165,7 +179,7 @@ FailureOr<AMDAIE::BdIdOp> getBdIdOp(
 
     // Get the number of BD IDs will be assigned to current DMA op.
     uint32_t numRequired = 0;
-    getNumRequiredBdIds(loop, npuDmaOp, *tileOp, shimTileToGeneratorMap,
+    getNumRequiredBdIds(loop, npuDmaOp, tileOp, shimTileToGeneratorMap,
                         numRequired);
     uint32_t numAvailable = generator.getNumAvailableBdIds(channel);
     uint32_t size = std::max(numAvailable / numRequired, 1u);
@@ -193,7 +207,7 @@ FailureOr<AMDAIE::BdIdOp> getBdIdOp(
               iv,
           });
       AMDAIE::BdIdOp bdIdOp = rewriter.create<AMDAIE::BdIdOp>(
-          rewriter.getUnknownLoc(), *tileOp, affineApply.getResult());
+          rewriter.getUnknownLoc(), tileOp, affineApply.getResult());
       bdIdOpToBdIdsMap[bdIdOp] = bdIds;
       return bdIdOp;
     }
@@ -206,7 +220,7 @@ FailureOr<AMDAIE::BdIdOp> getBdIdOp(
   auto constant = rewriter.create<arith::ConstantOp>(
       rewriter.getUnknownLoc(), rewriter.getIndexAttr(bdId.value()));
   AMDAIE::BdIdOp bdIdOp = rewriter.create<AMDAIE::BdIdOp>(
-      rewriter.getUnknownLoc(), *tileOp, constant.getResult());
+      rewriter.getUnknownLoc(), tileOp, constant.getResult());
   return bdIdOp;
 };
 
@@ -266,13 +280,6 @@ LogicalResult assignNpuDmaBdIds(AMDAIE::WorkgroupOp workgroupOp) {
     }
   });
 
-  // TODO(jornt): Temporarily use channel 0 for all DMAs. This should
-  // return correct results for Shim channels, however, for generality
-  // towards other DMAs and future hardware generations, channel
-  // assignment should happen before BD assignemnt. This requires more
-  // refactoring.
-  const uint32_t channel = 0;
-
   DenseMap<AMDAIE::BdIdOp, SmallVector<uint32_t>> bdIdOpToBdIdsMap;
   // Walk `amdaie.npu_dma_cpy_nd` and  `amdaie.dma_wait` operations and assign
   // and release BD IDs when encountering the respective operations using the
@@ -282,8 +289,7 @@ LogicalResult assignNpuDmaBdIds(AMDAIE::WorkgroupOp workgroupOp) {
     if (auto npuDmaOp = dyn_cast<AMDAIE::NpuDmaCpyNdOp>(op)) {
       if (npuDmaOp.getSource()) {
         FailureOr<AMDAIE::BdIdOp> bdIdOp = getBdIdOp<CopyOpOperateOn::Source>(
-            rewriter, npuDmaOp, shimTileToGeneratorMap, bdIdOpToBdIdsMap,
-            channel);
+            rewriter, npuDmaOp, shimTileToGeneratorMap, bdIdOpToBdIdsMap);
         if (failed(bdIdOp)) return WalkResult::interrupt();
         rewriter.setInsertionPoint(npuDmaOp);
         npuDmaOp = rewriter.replaceOpWithNewOp<AMDAIE::NpuDmaCpyNdOp>(
@@ -296,8 +302,7 @@ LogicalResult assignNpuDmaBdIds(AMDAIE::WorkgroupOp workgroupOp) {
       }
       if (npuDmaOp.getTarget()) {
         FailureOr<AMDAIE::BdIdOp> bdIdOp = getBdIdOp<CopyOpOperateOn::Target>(
-            rewriter, npuDmaOp, shimTileToGeneratorMap, bdIdOpToBdIdsMap,
-            channel);
+            rewriter, npuDmaOp, shimTileToGeneratorMap, bdIdOpToBdIdsMap);
         if (failed(bdIdOp)) return WalkResult::interrupt();
         rewriter.setInsertionPoint(npuDmaOp);
         (void)rewriter.replaceOpWithNewOp<AMDAIE::NpuDmaCpyNdOp>(
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp
index 9ece915fe..4d91808c4 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp
@@ -635,6 +635,10 @@ void addAMDAIEObjectFifoLoweringPasses(
   passManager.addPass(createCanonicalizerPass());
   passManager.addPass(createAMDAIEDmaCSEPass());
 
+  passManager.addPass(createAMDAIEAssignChannelsPass());
+  passManager.addPass(createCSEPass());
+  passManager.addPass(createCanonicalizerPass());
+
   passManager.addPass(createAMDAIEAssignNpuDmaBdIdsPass());
   passManager.addPass(createCSEPass());
   passManager.addPass(createCanonicalizerPass());
@@ -650,10 +654,6 @@ void addAMDAIEObjectFifoLoweringPasses(
   passManager.addPass(createAMDAIEConvertCoreForallToForPass());
   passManager.addPass(createCanonicalizerPass());
 
-  passManager.addPass(createAMDAIEAssignChannelsPass());
-  passManager.addPass(createCSEPass());
-  passManager.addPass(createCanonicalizerPass());
-
   passManager.addPass(createAMDAIEObjFifoBufferizationPass());
   passManager.addPass(createAMDAIETemporaryAllocBufferizationPass());
   passManager.addPass(createAMDAIEConnectionToFlowPass());
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/assign_npu_dma_bd_ids.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/assign_npu_dma_bd_ids.mlir
index ed3464aaa..785a20ff7 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/assign_npu_dma_bd_ids.mlir
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/assign_npu_dma_bd_ids.mlir
@@ -11,15 +11,15 @@ module {
 
 // -----
 
+// Expect constant BD ID 0 is assigned to the DMA copy operation.
+
 // CHECK-LABEL: @single_dma_cpy_nd_on_source
 // CHECK:       %[[C0:.+]] = arith.constant 0 : index
 // CHECK:       amdaie.workgroup
-// CHECK-DAG:     %[[TILE_0_0:.+]] = amdaie.tile(%[[C0]], %[[C0]])
-// CHECK-DAG:     %[[FROM_MEMREF:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {%[[TILE_0_0]]} : memref<8x16xi32> -> !amdaie.logicalobjectfifo<memref<8x16xi32>>
-// CHECK:         %[[CIRC_DMA:.+]] = amdaie.circular_dma_cpy_nd
+// CHECK:         %[[TILE_0_0:.+]] = amdaie.tile(%[[C0]], %[[C0]])
 // CHECK:         amdaie.controlcode
 // CHECK:           %[[BD_ID_0:.+]] = amdaie.bd_id(%[[TILE_0_0]], %[[C0]])
-// CHECK:           %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd async_source %[[CIRC_DMA]]([] [] [], %[[FROM_MEMREF]][0, 0, 0] [1, 8, 16] [128, 16, 1] bd_id = %[[BD_ID_0]])
+// CHECK:           %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd async_source %{{.+}}([] [] [], %{{.+}}[0, 0, 0] [1, 8, 16] [128, 16, 1] bd_id = %[[BD_ID_0]])
 // CHECK:           amdaie.npu.dma_wait(%[[NPU_DMA]] : !amdaie.async_source_token)
 #executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}>
 module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} {
@@ -29,13 +29,15 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb}
     amdaie.workgroup {
       %tile_0_0 = amdaie.tile(%c0, %c0)
       %tile_0_1 = amdaie.tile(%c0, %c1)
+      %channel_0 = amdaie.channel(%tile_0_0, 0, port_type = DMA, direction = MM2S)
+      %channel_1 = amdaie.channel(%tile_0_1, 0, port_type = DMA, direction = S2MM)
+      %from_memref_0 = amdaie.logicalobjectfifo.from_memref %arg1, {%tile_0_1} : memref<1x1x8x16xi32, 1> -> !amdaie.logicalobjectfifo<memref<128xi32, 1>, 2>
       %placeholder = amdaie.logicalobjectfifo.placeholder{%tile_0_0} : !amdaie.logicalobjectfifo<memref<8x16xi32>>
-      %from_memref_0 = amdaie.logicalobjectfifo.from_memref %arg0, {%tile_0_0} : memref<8x16xi32> -> !amdaie.logicalobjectfifo<memref<8x16xi32>>
-      %from_memref_1 = amdaie.logicalobjectfifo.from_memref %arg1, {%tile_0_1} : memref<1x1x8x16xi32, 1> -> !amdaie.logicalobjectfifo<memref<1x1x8x16xi32, 1>>
-      %0 = amdaie.circular_dma_cpy_nd(%from_memref_1[] [] [], %placeholder[] [] []) : (!amdaie.logicalobjectfifo<memref<1x1x8x16xi32, 1>>, !amdaie.logicalobjectfifo<memref<8x16xi32>>)
+      %connection = amdaie.connection(%from_memref_0 {%channel_1}, %placeholder {%channel_0}) {connection_type = #amdaie<connection_type Circuit>} : (!amdaie.logicalobjectfifo<memref<128xi32, 1>, 2>, !amdaie.logicalobjectfifo<memref<8x16xi32>>)
       amdaie.controlcode {
-        %1 = amdaie.npu.dma_cpy_nd async_source %0([] [] [], %from_memref_0[0, 0, 0] [1, 8, 16] [128, 16, 1]) : source_type = !amdaie.logicalobjectfifo<memref<8x16xi32>>
-        amdaie.npu.dma_wait(%1 : !amdaie.async_source_token)
+        %from_memref_1 = amdaie.logicalobjectfifo.from_memref %arg0, {%tile_0_0} : memref<8x16xi32> -> !amdaie.logicalobjectfifo<memref<8x16xi32>>
+        %0 = amdaie.npu.dma_cpy_nd async_source %connection([] [] [], %from_memref_1[0, 0, 0] [1, 8, 16] [128, 16, 1]) : source_type = !amdaie.logicalobjectfifo<memref<8x16xi32>>
+        amdaie.npu.dma_wait(%0 : !amdaie.async_source_token)
         amdaie.end
       }
     }
@@ -45,15 +47,15 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb}
 
 // -----
 
+// Expect constant BD ID 0 is assigned to the DMA copy operation.
+
 // CHECK-LABEL: @single_dma_cpy_nd_on_target
 // CHECK:       %[[C0:.+]] = arith.constant 0 : index
 // CHECK:       amdaie.workgroup
-// CHECK-DAG:     %[[TILE_0_0:.+]] = amdaie.tile(%[[C0]], %[[C0]])
-// CHECK-DAG:     %[[FROM_MEMREF:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {%[[TILE_0_0]]} : memref<8x16xi32> -> !amdaie.logicalobjectfifo<memref<8x16xi32>>
-// CHECK:         %[[CIRC_DMA:.+]] = amdaie.circular_dma_cpy_nd
+// CHECK:         %[[TILE_0_0:.+]] = amdaie.tile(%[[C0]], %[[C0]])
 // CHECK:         amdaie.controlcode
 // CHECK:           %[[BD_ID_0:.+]] = amdaie.bd_id(%[[TILE_0_0]], %[[C0]])
-// CHECK:           %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd async_target %[[CIRC_DMA]](%[[FROM_MEMREF]][0, 0, 0] [1, 8, 16] [128, 16, 1] bd_id = %[[BD_ID_0]], [] [] [])
+// CHECK:           %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd async_target %{{.+}}(%{{.+}}[0, 0, 0] [1, 8, 16] [128, 16, 1] bd_id = %[[BD_ID_0]], [] [] [])
 // CHECK:           amdaie.npu.dma_wait(%[[NPU_DMA]] : !amdaie.async_target_token)
 #executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}>
 module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} {
@@ -63,13 +65,15 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb}
     amdaie.workgroup {
       %tile_0_0 = amdaie.tile(%c0, %c0)
       %tile_0_1 = amdaie.tile(%c0, %c1)
+      %channel_0 = amdaie.channel(%tile_0_0, 0, port_type = DMA, direction = S2MM)
+      %channel_1 = amdaie.channel(%tile_0_1, 0, port_type = DMA, direction = MM2S)
+      %from_memref_0 = amdaie.logicalobjectfifo.from_memref %arg1, {%tile_0_1} : memref<1x1x8x16xi32, 1> -> !amdaie.logicalobjectfifo<memref<128xi32, 1>, 2>
       %placeholder = amdaie.logicalobjectfifo.placeholder{%tile_0_0} : !amdaie.logicalobjectfifo<memref<8x16xi32>>
-      %from_memref_0 = amdaie.logicalobjectfifo.from_memref %arg0, {%tile_0_0} : memref<8x16xi32> -> !amdaie.logicalobjectfifo<memref<8x16xi32>>
-      %from_memref_1 = amdaie.logicalobjectfifo.from_memref %arg1, {%tile_0_1} : memref<1x1x8x16xi32, 1> -> !amdaie.logicalobjectfifo<memref<1x1x8x16xi32, 1>>
-      %0 = amdaie.circular_dma_cpy_nd(%placeholder[] [] [], %from_memref_1[] [] []) : (!amdaie.logicalobjectfifo<memref<8x16xi32>>, !amdaie.logicalobjectfifo<memref<1x1x8x16xi32, 1>>)
+      %connection = amdaie.connection(%placeholder {%channel_0}, %from_memref_0 {%channel_1}) {connection_type = #amdaie<connection_type Circuit>} : (!amdaie.logicalobjectfifo<memref<8x16xi32>>, !amdaie.logicalobjectfifo<memref<128xi32, 1>, 2>)
       amdaie.controlcode {
-        %1 = amdaie.npu.dma_cpy_nd async_target %0(%from_memref_0[0, 0, 0] [1, 8, 16] [128, 16, 1], [] [] []) : target_type = !amdaie.logicalobjectfifo<memref<8x16xi32>>
-        amdaie.npu.dma_wait(%1 : !amdaie.async_target_token)
+        %from_memref_1 = amdaie.logicalobjectfifo.from_memref %arg0, {%tile_0_0} : memref<8x16xi32> -> !amdaie.logicalobjectfifo<memref<8x16xi32>>
+        %0 = amdaie.npu.dma_cpy_nd async_target %connection(%from_memref_1[0, 0, 0] [1, 8, 16] [128, 16, 1], [] [] []) : target_type = !amdaie.logicalobjectfifo<memref<8x16xi32>>
+        amdaie.npu.dma_wait(%0 : !amdaie.async_target_token)
         amdaie.end
       }
     }
@@ -79,27 +83,23 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb}
 
 // -----
 
+// Expect all DMA copy operations are assigned with constant BD ID 0, because they are all on different shim tiles.
+
 // CHECK-LABEL: @multiple_dma_cpy_on_diff_tiles
-// CHECK:       %[[C0:.+]] = arith.constant 0 : index
-// CHECK:       %[[C1:.+]] = arith.constant 1 : index
-// CHECK:       %[[C2:.+]] = arith.constant 2 : index
+// CHECK-DAG:   %[[C0:.+]] = arith.constant 0 : index
+// CHECK-DAG:   %[[C1:.+]] = arith.constant 1 : index
+// CHECK-DAG:   %[[C2:.+]] = arith.constant 2 : index
 // CHECK:       amdaie.workgroup
 // CHECK-DAG:     %[[TILE_0_0:.+]] = amdaie.tile(%[[C0]], %[[C0]])
 // CHECK-DAG:     %[[TILE_1_0:.+]] = amdaie.tile(%[[C1]], %[[C0]])
 // CHECK-DAG:     %[[TILE_2_0:.+]] = amdaie.tile(%[[C2]], %[[C0]])
-// CHECK-DAG:     %[[FROM_MEMREF_0:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {%[[TILE_0_0]]} : memref<8x16xi32> -> !amdaie.logicalobjectfifo<memref<8x16xi32>>
-// CHECK-DAG:     %[[FROM_MEMREF_1:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {%[[TILE_1_0]]} : memref<8x16xi32> -> !amdaie.logicalobjectfifo<memref<8x16xi32>>
-// CHECK-DAG:     %[[FROM_MEMREF_2:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {%[[TILE_2_0]]} : memref<8x16xi32> -> !amdaie.logicalobjectfifo<memref<8x16xi32>>
-// CHECK:         %[[CIRC_DMA_0:.+]] = amdaie.circular_dma_cpy_nd
-// CHECK:         %[[CIRC_DMA_1:.+]] = amdaie.circular_dma_cpy_nd
-// CHECK:         %[[CIRC_DMA_2:.+]] = amdaie.circular_dma_cpy_nd
 // CHECK:         amdaie.controlcode
 // CHECK:           %[[BD_ID_0:.+]] = amdaie.bd_id(%[[TILE_0_0]], %[[C0]])
-// CHECK:           %[[NPU_DMA_0:.+]] = amdaie.npu.dma_cpy_nd async_source %[[CIRC_DMA_0]]([] [] [], %[[FROM_MEMREF_0]][0, 0, 0] [1, 8, 16] [128, 16, 1] bd_id = %[[BD_ID_0]])
+// CHECK:           %[[NPU_DMA_0:.+]] = amdaie.npu.dma_cpy_nd async_source %{{.+}}([] [] [], %{{.+}}[0, 0, 0] [1, 8, 16] [128, 16, 1] bd_id = %[[BD_ID_0]])
 // CHECK:           %[[BD_ID_1:.+]] = amdaie.bd_id(%[[TILE_1_0]], %[[C0]])
-// CHECK:           %[[NPU_DMA_1:.+]] = amdaie.npu.dma_cpy_nd async_source %[[CIRC_DMA_1]]([] [] [], %[[FROM_MEMREF_1]][0, 0] [8, 16] [16, 1] bd_id = %[[BD_ID_1]])
+// CHECK:           %[[NPU_DMA_1:.+]] = amdaie.npu.dma_cpy_nd async_source %{{.+}}([] [] [], %{{.+}}[0, 0] [8, 16] [16, 1] bd_id = %[[BD_ID_1]])
 // CHECK:           %[[BD_ID_2:.+]] = amdaie.bd_id(%[[TILE_2_0]], %[[C0]])
-// CHECK:           %[[NPU_DMA_2:.+]] = amdaie.npu.dma_cpy_nd async_source %[[CIRC_DMA_2]]([] [] [], %[[FROM_MEMREF_2]][0] [128] [1] bd_id = %[[BD_ID_2]])
+// CHECK:           %[[NPU_DMA_2:.+]] = amdaie.npu.dma_cpy_nd async_source %{{.+}}([] [] [], %{{.+}}[0] [128] [1] bd_id = %[[BD_ID_2]])
 // CHECK:           amdaie.npu.dma_wait(%[[NPU_DMA_0]] : !amdaie.async_source_token)
 // CHECK:           amdaie.npu.dma_wait(%[[NPU_DMA_1]] : !amdaie.async_source_token)
 // CHECK:           amdaie.npu.dma_wait(%[[NPU_DMA_2]] : !amdaie.async_source_token)
@@ -114,20 +114,26 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb}
       %tile_1_0 = amdaie.tile(%c1, %c0)
       %tile_2_0 = amdaie.tile(%c2, %c0)
       %tile_0_1 = amdaie.tile(%c0, %c1)
-      %placeholder0 = amdaie.logicalobjectfifo.placeholder{%tile_0_0} : !amdaie.logicalobjectfifo<memref<8x16xi32>>
-      %placeholder1 = amdaie.logicalobjectfifo.placeholder{%tile_1_0} : !amdaie.logicalobjectfifo<memref<8x16xi32>>
-      %placeholder2 = amdaie.logicalobjectfifo.placeholder{%tile_2_0} : !amdaie.logicalobjectfifo<memref<8x16xi32>>
-      %from_memref_0 = amdaie.logicalobjectfifo.from_memref %arg0, {%tile_0_0} : memref<8x16xi32> -> !amdaie.logicalobjectfifo<memref<8x16xi32>>
-      %from_memref_1 = amdaie.logicalobjectfifo.from_memref %arg1, {%tile_1_0} : memref<8x16xi32> -> !amdaie.logicalobjectfifo<memref<8x16xi32>>
-      %from_memref_2 = amdaie.logicalobjectfifo.from_memref %arg2, {%tile_2_0} : memref<8x16xi32> -> !amdaie.logicalobjectfifo<memref<8x16xi32>>
-      %from_memref_3 = amdaie.logicalobjectfifo.from_memref %arg3, {%tile_0_1} : memref<1x1x8x16xi32, 1> -> !amdaie.logicalobjectfifo<memref<1x1x8x16xi32, 1>>
-      %dma0 = amdaie.circular_dma_cpy_nd(%from_memref_3[] [] [], %placeholder0[] [] []) : (!amdaie.logicalobjectfifo<memref<1x1x8x16xi32, 1>>, !amdaie.logicalobjectfifo<memref<8x16xi32>>)
-      %dma1 = amdaie.circular_dma_cpy_nd(%from_memref_3[] [] [], %placeholder1[] [] []) : (!amdaie.logicalobjectfifo<memref<1x1x8x16xi32, 1>>, !amdaie.logicalobjectfifo<memref<8x16xi32>>)
-      %dma2 = amdaie.circular_dma_cpy_nd(%from_memref_3[] [] [], %placeholder2[] [] []) : (!amdaie.logicalobjectfifo<memref<1x1x8x16xi32, 1>>, !amdaie.logicalobjectfifo<memref<8x16xi32>>)
+      %channel_0 = amdaie.channel(%tile_0_0, 0, port_type = DMA, direction = MM2S)
+      %channel_1 = amdaie.channel(%tile_1_0, 0, port_type = DMA, direction = MM2S)
+      %channel_2 = amdaie.channel(%tile_2_0, 0, port_type = DMA, direction = MM2S)
+      %channel_3 = amdaie.channel(%tile_0_1, 0, port_type = DMA, direction = S2MM)
+      %channel_4 = amdaie.channel(%tile_0_1, 1, port_type = DMA, direction = S2MM)
+      %channel_5 = amdaie.channel(%tile_0_1, 2, port_type = DMA, direction = S2MM)
+      %from_memref_0 = amdaie.logicalobjectfifo.from_memref %arg3, {%tile_0_1} : memref<1x1x8x16xi32, 1> -> !amdaie.logicalobjectfifo<memref<128xi32, 1>, 2>
+      %placeholder_0 = amdaie.logicalobjectfifo.placeholder{%tile_0_0} : !amdaie.logicalobjectfifo<memref<8x16xi32>>
+      %placeholder_1 = amdaie.logicalobjectfifo.placeholder{%tile_1_0} : !amdaie.logicalobjectfifo<memref<8x16xi32>>
+      %placeholder_2 = amdaie.logicalobjectfifo.placeholder{%tile_2_0} : !amdaie.logicalobjectfifo<memref<8x16xi32>>
+      %connection_0 = amdaie.connection(%from_memref_0 {%channel_3}, %placeholder_0 {%channel_0}) {connection_type = #amdaie<connection_type Circuit>} : (!amdaie.logicalobjectfifo<memref<128xi32, 1>, 2>, !amdaie.logicalobjectfifo<memref<8x16xi32>>)
+      %connection_1 = amdaie.connection(%from_memref_0 {%channel_4}, %placeholder_1 {%channel_1}) {connection_type = #amdaie<connection_type Circuit>} : (!amdaie.logicalobjectfifo<memref<128xi32, 1>, 2>, !amdaie.logicalobjectfifo<memref<8x16xi32>>)
+      %connection_2 = amdaie.connection(%from_memref_0 {%channel_5}, %placeholder_2 {%channel_2}) {connection_type = #amdaie<connection_type Circuit>} : (!amdaie.logicalobjectfifo<memref<128xi32, 1>, 2>, !amdaie.logicalobjectfifo<memref<8x16xi32>>)
       amdaie.controlcode {
-        %0 = amdaie.npu.dma_cpy_nd async_source %dma0([] [] [], %from_memref_0[0, 0, 0] [1, 8, 16] [128, 16, 1]) : source_type = !amdaie.logicalobjectfifo<memref<8x16xi32>>
-        %1 = amdaie.npu.dma_cpy_nd async_source %dma1([] [] [], %from_memref_1[0, 0] [8, 16] [16, 1]) : source_type = !amdaie.logicalobjectfifo<memref<8x16xi32>>
-        %2 = amdaie.npu.dma_cpy_nd async_source %dma2([] [] [], %from_memref_2[0] [128] [1]) : source_type = !amdaie.logicalobjectfifo<memref<8x16xi32>>
+        %from_memref_1 = amdaie.logicalobjectfifo.from_memref %arg0, {%tile_0_0} : memref<8x16xi32> -> !amdaie.logicalobjectfifo<memref<8x16xi32>>
+        %from_memref_2 = amdaie.logicalobjectfifo.from_memref %arg1, {%tile_1_0} : memref<8x16xi32> -> !amdaie.logicalobjectfifo<memref<8x16xi32>>
+        %from_memref_3 = amdaie.logicalobjectfifo.from_memref %arg2, {%tile_2_0} : memref<8x16xi32> -> !amdaie.logicalobjectfifo<memref<8x16xi32>>
+        %0 = amdaie.npu.dma_cpy_nd async_source %connection_0([] [] [], %from_memref_1[0, 0, 0] [1, 8, 16] [128, 16, 1]) : source_type = !amdaie.logicalobjectfifo<memref<8x16xi32>>
+        %1 = amdaie.npu.dma_cpy_nd async_source %connection_1([] [] [], %from_memref_2[0, 0] [8, 16] [16, 1]) : source_type = !amdaie.logicalobjectfifo<memref<8x16xi32>>
+        %2 = amdaie.npu.dma_cpy_nd async_source %connection_2([] [] [], %from_memref_3[0] [128] [1]) : source_type = !amdaie.logicalobjectfifo<memref<8x16xi32>>
         amdaie.npu.dma_wait(%0 : !amdaie.async_source_token)
         amdaie.npu.dma_wait(%1 : !amdaie.async_source_token)
         amdaie.npu.dma_wait(%2 : !amdaie.async_source_token)
@@ -140,23 +146,23 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb}
 
 // -----
 
+// Expect BD IDs: 0, 1, 2 are assigned to the DMA copy operations, as incremental assignment is used.
+
 // CHECK-LABEL: @multiple_dma_cpy_with_wait_after_each
 // CHECK-DAG:   %[[C0:.+]] = arith.constant 0 : index
 // CHECK-DAG:   %[[C1:.+]] = arith.constant 1 : index
 // CHECK-DAG:   %[[C2:.+]] = arith.constant 2 : index
 // CHECK:       amdaie.workgroup
-// CHECK-DAG:     %[[TILE_0_0:.+]] = amdaie.tile(%[[C0]], %[[C0]])
-// CHECK-DAG:     %[[FROM_MEMREF_0:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {%[[TILE_0_0]]} : memref<8x16xi32> -> !amdaie.logicalobjectfifo<memref<8x16xi32>>
-// CHECK:         %[[CIRC_DMA:.+]] = amdaie.circular_dma_cpy_nd
+// CHECK:         %[[TILE_0_0:.+]] = amdaie.tile(%[[C0]], %[[C0]])
 // CHECK:         amdaie.controlcode
 // CHECK:           %[[BD_ID_0:.+]] = amdaie.bd_id(%[[TILE_0_0]], %[[C0]])
-// CHECK:           %[[NPU_DMA_0:.+]] = amdaie.npu.dma_cpy_nd async_source %[[CIRC_DMA]]([] [] [], %[[FROM_MEMREF_0]][0, 0, 0] [1, 8, 16] [128, 16, 1] bd_id = %[[BD_ID_0]])
+// CHECK:           %[[NPU_DMA_0:.+]] = amdaie.npu.dma_cpy_nd async_source %{{.+}}([] [] [], %{{.+}}[0, 0, 0] [1, 8, 16] [128, 16, 1] bd_id = %[[BD_ID_0]])
 // CHECK:           amdaie.npu.dma_wait(%[[NPU_DMA_0]] : !amdaie.async_source_token)
 // CHECK:           %[[BD_ID_1:.+]] = amdaie.bd_id(%[[TILE_0_0]], %[[C1]])
-// CHECK:           %[[NPU_DMA_1:.+]] = amdaie.npu.dma_cpy_nd async_source %[[CIRC_DMA]]([] [] [], %[[FROM_MEMREF_0]][0, 0] [8, 16] [16, 1] bd_id = %[[BD_ID_1]])
+// CHECK:           %[[NPU_DMA_1:.+]] = amdaie.npu.dma_cpy_nd async_source %{{.+}}([] [] [], %{{.+}}[0, 0] [8, 16] [16, 1] bd_id = %[[BD_ID_1]])
 // CHECK:           amdaie.npu.dma_wait(%[[NPU_DMA_1]] : !amdaie.async_source_token)
 // CHECK:           %[[BD_ID_2:.+]] = amdaie.bd_id(%[[TILE_0_0]], %[[C2]])
-// CHECK:           %[[NPU_DMA_2:.+]] = amdaie.npu.dma_cpy_nd async_source %[[CIRC_DMA]]([] [] [], %[[FROM_MEMREF_0]][0] [128] [1] bd_id = %[[BD_ID_2]])
+// CHECK:           %[[NPU_DMA_2:.+]] = amdaie.npu.dma_cpy_nd async_source %{{.+}}([] [] [], %{{.+}}[0] [128] [1] bd_id = %[[BD_ID_2]])
 // CHECK:           amdaie.npu.dma_wait(%[[NPU_DMA_2]] : !amdaie.async_source_token)
 #executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}>
 module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} {
@@ -166,17 +172,19 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb}
     amdaie.workgroup {
       %tile_0_0 = amdaie.tile(%c0, %c0)
       %tile_0_1 = amdaie.tile(%c0, %c1)
-      %placeholder0 = amdaie.logicalobjectfifo.placeholder{%tile_0_0} : !amdaie.logicalobjectfifo<memref<8x16xi32>>
-      %from_memref_0 = amdaie.logicalobjectfifo.from_memref %arg0, {%tile_0_0} : memref<8x16xi32> -> !amdaie.logicalobjectfifo<memref<8x16xi32>>
-      %from_memref_1 = amdaie.logicalobjectfifo.from_memref %arg1, {%tile_0_1} : memref<1x1x8x16xi32, 1> -> !amdaie.logicalobjectfifo<memref<1x1x8x16xi32, 1>>
-      %0 = amdaie.circular_dma_cpy_nd(%from_memref_1[] [] [], %placeholder0[] [] []) : (!amdaie.logicalobjectfifo<memref<1x1x8x16xi32, 1>>, !amdaie.logicalobjectfifo<memref<8x16xi32>>)
+      %channel_0 = amdaie.channel(%tile_0_0, 0, port_type = DMA, direction = MM2S)
+      %channel_1 = amdaie.channel(%tile_0_1, 0, port_type = DMA, direction = S2MM)
+      %from_memref_0 = amdaie.logicalobjectfifo.from_memref %arg1, {%tile_0_1} : memref<1x1x8x16xi32, 1> -> !amdaie.logicalobjectfifo<memref<128xi32, 1>, 2>
+      %placeholder = amdaie.logicalobjectfifo.placeholder{%tile_0_0} : !amdaie.logicalobjectfifo<memref<8x16xi32>>
+      %connection = amdaie.connection(%from_memref_0 {%channel_1}, %placeholder {%channel_0}) {connection_type = #amdaie<connection_type Circuit>} : (!amdaie.logicalobjectfifo<memref<128xi32, 1>, 2>, !amdaie.logicalobjectfifo<memref<8x16xi32>>)
       amdaie.controlcode {
-        %1 = amdaie.npu.dma_cpy_nd async_source %0([] [] [], %from_memref_0[0, 0, 0] [1, 8, 16] [128, 16, 1]) : source_type = !amdaie.logicalobjectfifo<memref<8x16xi32>>
+        %from_memref_1 = amdaie.logicalobjectfifo.from_memref %arg0, {%tile_0_0} : memref<8x16xi32> -> !amdaie.logicalobjectfifo<memref<8x16xi32>>
+        %0 = amdaie.npu.dma_cpy_nd async_source %connection([] [] [], %from_memref_1[0, 0, 0] [1, 8, 16] [128, 16, 1]) : source_type = !amdaie.logicalobjectfifo<memref<8x16xi32>>
+        amdaie.npu.dma_wait(%0 : !amdaie.async_source_token)
+        %1 = amdaie.npu.dma_cpy_nd async_source %connection([] [] [], %from_memref_1[0, 0] [8, 16] [16, 1]) : source_type = !amdaie.logicalobjectfifo<memref<8x16xi32>>
         amdaie.npu.dma_wait(%1 : !amdaie.async_source_token)
-        %2 = amdaie.npu.dma_cpy_nd async_source %0([] [] [], %from_memref_0[0, 0] [8, 16] [16, 1]) : source_type = !amdaie.logicalobjectfifo<memref<8x16xi32>>
+        %2 = amdaie.npu.dma_cpy_nd async_source %connection([] [] [], %from_memref_1[0] [128] [1]) : source_type = !amdaie.logicalobjectfifo<memref<8x16xi32>>
         amdaie.npu.dma_wait(%2 : !amdaie.async_source_token)
-        %3 = amdaie.npu.dma_cpy_nd async_source %0([] [] [], %from_memref_0[0] [128] [1]) : source_type = !amdaie.logicalobjectfifo<memref<8x16xi32>>
-        amdaie.npu.dma_wait(%3 : !amdaie.async_source_token)
         amdaie.end
       }
     }
@@ -186,21 +194,21 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb}
 
 // -----
 
+// Expect BD IDs: 0, 1, 2 are assigned to the DMA copy operations, as incremental assignment is used and IDs are only released after waits.
+
 // CHECK-LABEL: @multiple_dma_cpy_with_wait_after_all
 // CHECK-DAG:   %[[C0:.+]] = arith.constant 0 : index
 // CHECK-DAG:   %[[C1:.+]] = arith.constant 1 : index
 // CHECK-DAG:   %[[C2:.+]] = arith.constant 2 : index
 // CHECK:       amdaie.workgroup
 // CHECK:         %[[TILE_0_0:.+]] = amdaie.tile(%[[C0]], %[[C0]])
-// CHECK-DAG:     %[[FROM_MEMREF_0:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {%[[TILE_0_0]]} : memref<8x16xi32> -> !amdaie.logicalobjectfifo<memref<8x16xi32>>
-// CHECK:         %[[CIRC_DMA:.+]] = amdaie.circular_dma_cpy_nd
 // CHECK:         amdaie.controlcode
 // CHECK:           %[[BD_ID_0:.+]] = amdaie.bd_id(%[[TILE_0_0]], %[[C0]])
-// CHECK:           %[[NPU_DMA_0:.+]] = amdaie.npu.dma_cpy_nd async_source %[[CIRC_DMA]]([] [] [], %[[FROM_MEMREF_0]][0, 0, 0] [1, 8, 16] [128, 16, 1] bd_id = %[[BD_ID_0]])
+// CHECK:           %[[NPU_DMA_0:.+]] = amdaie.npu.dma_cpy_nd async_source %{{.+}}([] [] [], %{{.+}}[0, 0, 0] [1, 8, 16] [128, 16, 1] bd_id = %[[BD_ID_0]])
 // CHECK:           %[[BD_ID_1:.+]] = amdaie.bd_id(%[[TILE_0_0]], %[[C1]])
-// CHECK:           %[[NPU_DMA_1:.+]] = amdaie.npu.dma_cpy_nd async_source %[[CIRC_DMA]]([] [] [], %[[FROM_MEMREF_0]][0, 0] [8, 16] [16, 1] bd_id = %[[BD_ID_1]])
+// CHECK:           %[[NPU_DMA_1:.+]] = amdaie.npu.dma_cpy_nd async_source %{{.+}}([] [] [], %{{.+}}[0, 0] [8, 16] [16, 1] bd_id = %[[BD_ID_1]])
 // CHECK:           %[[BD_ID_2:.+]] = amdaie.bd_id(%[[TILE_0_0]], %[[C2]])
-// CHECK:           %[[NPU_DMA_2:.+]] = amdaie.npu.dma_cpy_nd async_source %[[CIRC_DMA]]([] [] [], %[[FROM_MEMREF_0]][0] [128] [1] bd_id = %[[BD_ID_2]])
+// CHECK:           %[[NPU_DMA_2:.+]] = amdaie.npu.dma_cpy_nd async_source %{{.+}}([] [] [], %{{.+}}[0] [128] [1] bd_id = %[[BD_ID_2]])
 // CHECK:           amdaie.npu.dma_wait(%[[NPU_DMA_0]] : !amdaie.async_source_token)
 // CHECK:           amdaie.npu.dma_wait(%[[NPU_DMA_1]] : !amdaie.async_source_token)
 // CHECK:           amdaie.npu.dma_wait(%[[NPU_DMA_2]] : !amdaie.async_source_token)
@@ -212,17 +220,19 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb}
     amdaie.workgroup {
       %tile_0_0 = amdaie.tile(%c0, %c0)
       %tile_0_1 = amdaie.tile(%c0, %c1)
-      %placeholder0 = amdaie.logicalobjectfifo.placeholder{%tile_0_0} : !amdaie.logicalobjectfifo<memref<8x16xi32>>
-      %from_memref_0 = amdaie.logicalobjectfifo.from_memref %arg0, {%tile_0_0} : memref<8x16xi32> -> !amdaie.logicalobjectfifo<memref<8x16xi32>>
-      %from_memref_1 = amdaie.logicalobjectfifo.from_memref %arg1, {%tile_0_1} : memref<1x1x8x16xi32, 1> -> !amdaie.logicalobjectfifo<memref<1x1x8x16xi32, 1>>
-      %0 = amdaie.circular_dma_cpy_nd(%from_memref_1[] [] [], %placeholder0[] [] []) : (!amdaie.logicalobjectfifo<memref<1x1x8x16xi32, 1>>, !amdaie.logicalobjectfifo<memref<8x16xi32>>)
+      %channel_0 = amdaie.channel(%tile_0_0, 0, port_type = DMA, direction = MM2S)
+      %channel_1 = amdaie.channel(%tile_0_1, 0, port_type = DMA, direction = S2MM)
+      %from_memref_0 = amdaie.logicalobjectfifo.from_memref %arg1, {%tile_0_1} : memref<1x1x8x16xi32, 1> -> !amdaie.logicalobjectfifo<memref<128xi32, 1>, 2>
+      %placeholder = amdaie.logicalobjectfifo.placeholder{%tile_0_0} : !amdaie.logicalobjectfifo<memref<8x16xi32>>
+      %connection = amdaie.connection(%from_memref_0 {%channel_1}, %placeholder {%channel_0}) {connection_type = #amdaie<connection_type Circuit>} : (!amdaie.logicalobjectfifo<memref<128xi32, 1>, 2>, !amdaie.logicalobjectfifo<memref<8x16xi32>>)
       amdaie.controlcode {
-        %1 = amdaie.npu.dma_cpy_nd async_source %0([] [] [], %from_memref_0[0, 0, 0] [1, 8, 16] [128, 16, 1]) : source_type = !amdaie.logicalobjectfifo<memref<8x16xi32>>
-        %2 = amdaie.npu.dma_cpy_nd async_source %0([] [] [], %from_memref_0[0, 0] [8, 16] [16, 1]) : source_type = !amdaie.logicalobjectfifo<memref<8x16xi32>>
-        %3 = amdaie.npu.dma_cpy_nd async_source %0([] [] [], %from_memref_0[0] [128] [1]) : source_type = !amdaie.logicalobjectfifo<memref<8x16xi32>>
+        %from_memref_1 = amdaie.logicalobjectfifo.from_memref %arg0, {%tile_0_0} : memref<8x16xi32> -> !amdaie.logicalobjectfifo<memref<8x16xi32>>
+        %0 = amdaie.npu.dma_cpy_nd async_source %connection([] [] [], %from_memref_1[0, 0, 0] [1, 8, 16] [128, 16, 1]) : source_type = !amdaie.logicalobjectfifo<memref<8x16xi32>>
+        %1 = amdaie.npu.dma_cpy_nd async_source %connection([] [] [], %from_memref_1[0, 0] [8, 16] [16, 1]) : source_type = !amdaie.logicalobjectfifo<memref<8x16xi32>>
+        %2 = amdaie.npu.dma_cpy_nd async_source %connection([] [] [], %from_memref_1[0] [128] [1]) : source_type = !amdaie.logicalobjectfifo<memref<8x16xi32>>
+        amdaie.npu.dma_wait(%0 : !amdaie.async_source_token)
         amdaie.npu.dma_wait(%1 : !amdaie.async_source_token)
         amdaie.npu.dma_wait(%2 : !amdaie.async_source_token)
-        amdaie.npu.dma_wait(%3 : !amdaie.async_source_token)
         amdaie.end
       }
     }
@@ -231,40 +241,37 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb}
 }
 
 // -----
+
+// Expect the DMA copy operations in the innermost loop to have BD IDs as expressions, using two affine maps. #map: 1~15, #map1: 0~15.
+
 // CHECK: #map = affine_map<(d0) -> (d0 mod 15 + 1)>
 // CHECK: #map1 = affine_map<(d0) -> (d0 mod 16)>
 // CHECK-LABEL: @nested_loops_multi_tiles
-// CHECK:   %[[C0:.+]] = arith.constant 0 : index
-// CHECK:   %[[C1:.+]] = arith.constant 1 : index
-// CHECK:   %[[C2:.+]] = arith.constant 2 : index
-// CHECK:   %[[C6:.+]] = arith.constant 6 : index
+// CHECK-DAG:   %[[C0:.+]] = arith.constant 0 : index
+// CHECK-DAG:   %[[C1:.+]] = arith.constant 1 : index
+// CHECK-DAG:   %[[C2:.+]] = arith.constant 2 : index
+// CHECK-DAG:   %[[C6:.+]] = arith.constant 6 : index
 // CHECK:       amdaie.workgroup
 // CHECK-DAG:     %[[TILE_0_0:.+]] = amdaie.tile(%[[C0]], %[[C0]])
 // CHECK-DAG:     %[[TILE_1_0:.+]] = amdaie.tile(%[[C1]], %[[C0]])
 // CHECK-DAG:     %[[TILE_2_0:.+]] = amdaie.tile(%[[C2]], %[[C0]])
-// CHECK-DAG:     %[[FROM_MEMREF_0:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {%[[TILE_0_0]]} : memref<8x16xi32> -> !amdaie.logicalobjectfifo<memref<8x16xi32>>
-// CHECK-DAG:     %[[FROM_MEMREF_1:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {%[[TILE_1_0]]} : memref<8x16xi32> -> !amdaie.logicalobjectfifo<memref<8x16xi32>>
-// CHECK-DAG:     %[[FROM_MEMREF_2:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {%[[TILE_2_0]]} : memref<8x16xi32> -> !amdaie.logicalobjectfifo<memref<8x16xi32>>
-// CHECK:         %[[CIRC_DMA_0:.+]] = amdaie.circular_dma_cpy_nd
-// CHECK:         %[[CIRC_DMA_1:.+]] = amdaie.circular_dma_cpy_nd
-// CHECK:         %[[CIRC_DMA_2:.+]] = amdaie.circular_dma_cpy_nd
 // CHECK:         amdaie.controlcode
 // CHECK:           %[[BD_ID_0_0:.+]] = amdaie.bd_id(%[[TILE_0_0]], %[[C0]])
-// CHECK:           %[[NPU_DMA_0:.+]] = amdaie.npu.dma_cpy_nd async_source %[[CIRC_DMA_0]]([] [] [], %[[FROM_MEMREF_0]][0, 0, 0, 0] [1, 1, 8, 16] [128, 128, 16, 1] bd_id = %[[BD_ID_0_0]])
+// CHECK:           %[[NPU_DMA_0:.+]] = amdaie.npu.dma_cpy_nd async_source %{{.+}}([] [] [], %{{.+}}[0, 0, 0, 0] [1, 1, 8, 16] [128, 128, 16, 1] bd_id = %[[BD_ID_0_0]])
 // CHECK:           scf.forall (%{{.+}}, %{{.+}}) in (2, 2)
 // CHECK:             %[[BD_ID_1_0:.+]] = amdaie.bd_id(%[[TILE_1_0]], %[[C0]])
-// CHECK:             %[[NPU_DMA_1:.+]] = amdaie.npu.dma_cpy_nd async_source %[[CIRC_DMA_1]]([] [] [], %[[FROM_MEMREF_1]][0, 0, 0] [1, 8, 16] [128, 16, 1] bd_id = %[[BD_ID_1_0]])
+// CHECK:             %[[NPU_DMA_1:.+]] = amdaie.npu.dma_cpy_nd async_source %{{.+}}([] [] [], %{{.+}}[0, 0, 0] [1, 8, 16] [128, 16, 1] bd_id = %[[BD_ID_1_0]])
 // CHECK:             scf.for %[[LOOP_VAR_0:.+]] = %[[C0]] to %[[C6]] step %[[C1]]
 // CHECK:               %[[VAR_0:.+]] = affine.apply #map(%[[LOOP_VAR_0]])
 // CHECK:               %[[BD_ID_1_1:.+]] = amdaie.bd_id(%[[TILE_1_0]], %[[VAR_0]])
-// CHECK:               %[[NPU_DMA_2:.+]] = amdaie.npu.dma_cpy_nd async_source %[[CIRC_DMA_1]]([] [] [], %[[FROM_MEMREF_1]][0, 0] [1, 128] [128, 1] bd_id = %[[BD_ID_1_1]])
+// CHECK:               %[[NPU_DMA_2:.+]] = amdaie.npu.dma_cpy_nd async_source %{{.+}}([] [] [], %{{.+}}[0, 0] [1, 128] [128, 1] bd_id = %[[BD_ID_1_1]])
 // CHECK:               %[[BD_ID_0_1:.+]] = amdaie.bd_id(%[[TILE_0_0]], %[[VAR_0]])
-// CHECK:               %[[NPU_DMA_3:.+]] = amdaie.npu.dma_cpy_nd async_source %[[CIRC_DMA_0]]([] [] [], %[[FROM_MEMREF_0]][0] [128] [1] bd_id = %[[BD_ID_0_1]])
+// CHECK:               %[[NPU_DMA_3:.+]] = amdaie.npu.dma_cpy_nd async_source %{{.+}}([] [] [], %{{.+}}[0] [128] [1] bd_id = %[[BD_ID_0_1]])
 // CHECK:               amdaie.npu.dma_wait(%[[NPU_DMA_2]] : !amdaie.async_source_token)
 // CHECK:               amdaie.npu.dma_wait(%[[NPU_DMA_3]] : !amdaie.async_source_token)
 // CHECK:               %[[VAR_1:.+]] = affine.apply #map1(%[[LOOP_VAR_0]])
 // CHECK:               %[[BD_ID_2_0:.+]] = amdaie.bd_id(%[[TILE_2_0]], %[[VAR_1]])
-// CHECK:               %[[NPU_DMA_4:.+]] = amdaie.npu.dma_cpy_nd async_source %[[CIRC_DMA_2]]([] [] [], %[[FROM_MEMREF_2]][] [] [] bd_id = %[[BD_ID_2_0]])
+// CHECK:               %[[NPU_DMA_4:.+]] = amdaie.npu.dma_cpy_nd async_source %{{.+}}([] [] [], %{{.+}}[] [] [] bd_id = %[[BD_ID_2_0]])
 // CHECK:               amdaie.npu.dma_wait(%[[NPU_DMA_4]] : !amdaie.async_source_token)
 // CHECK:             }
 // CHECK:             amdaie.npu.dma_wait(%[[NPU_DMA_1]] : !amdaie.async_source_token)
@@ -282,26 +289,32 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb}
       %tile_1_0 = amdaie.tile(%c1, %c0)
       %tile_2_0 = amdaie.tile(%c2, %c0)
       %tile_0_1 = amdaie.tile(%c0, %c1)
-      %placeholder0 = amdaie.logicalobjectfifo.placeholder{%tile_0_0} : !amdaie.logicalobjectfifo<memref<8x16xi32>>
-      %placeholder1 = amdaie.logicalobjectfifo.placeholder{%tile_1_0} : !amdaie.logicalobjectfifo<memref<8x16xi32>>
-      %placeholder2 = amdaie.logicalobjectfifo.placeholder{%tile_2_0} : !amdaie.logicalobjectfifo<memref<8x16xi32>>
-      %from_memref_0 = amdaie.logicalobjectfifo.from_memref %arg0, {%tile_0_0} : memref<8x16xi32> -> !amdaie.logicalobjectfifo<memref<8x16xi32>>
-      %from_memref_1 = amdaie.logicalobjectfifo.from_memref %arg1, {%tile_1_0} : memref<8x16xi32> -> !amdaie.logicalobjectfifo<memref<8x16xi32>>
-      %from_memref_2 = amdaie.logicalobjectfifo.from_memref %arg2, {%tile_2_0} : memref<8x16xi32> -> !amdaie.logicalobjectfifo<memref<8x16xi32>>
-      %from_memref_3 = amdaie.logicalobjectfifo.from_memref %arg3, {%tile_0_1} : memref<1x1x8x16xi32, 1> -> !amdaie.logicalobjectfifo<memref<1x1x8x16xi32, 1>>
-      %dma0 = amdaie.circular_dma_cpy_nd(%from_memref_3[] [] [], %placeholder0[] [] []) : (!amdaie.logicalobjectfifo<memref<1x1x8x16xi32, 1>>, !amdaie.logicalobjectfifo<memref<8x16xi32>>)
-      %dma1 = amdaie.circular_dma_cpy_nd(%from_memref_3[] [] [], %placeholder1[] [] []) : (!amdaie.logicalobjectfifo<memref<1x1x8x16xi32, 1>>, !amdaie.logicalobjectfifo<memref<8x16xi32>>)
-      %dma2 = amdaie.circular_dma_cpy_nd(%from_memref_3[] [] [], %placeholder2[] [] []) : (!amdaie.logicalobjectfifo<memref<1x1x8x16xi32, 1>>, !amdaie.logicalobjectfifo<memref<8x16xi32>>)
+      %channel_0 = amdaie.channel(%tile_0_0, 0, port_type = DMA, direction = MM2S)
+      %channel_1 = amdaie.channel(%tile_1_0, 0, port_type = DMA, direction = MM2S)
+      %channel_2 = amdaie.channel(%tile_2_0, 0, port_type = DMA, direction = MM2S)
+      %channel_3 = amdaie.channel(%tile_0_1, 0, port_type = DMA, direction = S2MM)
+      %channel_4 = amdaie.channel(%tile_0_1, 1, port_type = DMA, direction = S2MM)
+      %channel_5 = amdaie.channel(%tile_0_1, 2, port_type = DMA, direction = S2MM)
+      %from_memref_0 = amdaie.logicalobjectfifo.from_memref %arg3, {%tile_0_1} : memref<1x1x8x16xi32, 1> -> !amdaie.logicalobjectfifo<memref<128xi32, 1>, 2>
+      %placeholder_0 = amdaie.logicalobjectfifo.placeholder{%tile_0_0} : !amdaie.logicalobjectfifo<memref<8x16xi32>>
+      %placeholder_1 = amdaie.logicalobjectfifo.placeholder{%tile_1_0} : !amdaie.logicalobjectfifo<memref<8x16xi32>>
+      %placeholder_2 = amdaie.logicalobjectfifo.placeholder{%tile_2_0} : !amdaie.logicalobjectfifo<memref<8x16xi32>>
+      %connection_0 = amdaie.connection(%from_memref_0 {%channel_3}, %placeholder_0 {%channel_0}) {connection_type = #amdaie<connection_type Circuit>} : (!amdaie.logicalobjectfifo<memref<128xi32, 1>, 2>, !amdaie.logicalobjectfifo<memref<8x16xi32>>)
+      %connection_1 = amdaie.connection(%from_memref_0 {%channel_4}, %placeholder_1 {%channel_1}) {connection_type = #amdaie<connection_type Circuit>} : (!amdaie.logicalobjectfifo<memref<128xi32, 1>, 2>, !amdaie.logicalobjectfifo<memref<8x16xi32>>)
+      %connection_2 = amdaie.connection(%from_memref_0 {%channel_5}, %placeholder_2 {%channel_2}) {connection_type = #amdaie<connection_type Circuit>} : (!amdaie.logicalobjectfifo<memref<128xi32, 1>, 2>, !amdaie.logicalobjectfifo<memref<8x16xi32>>)
       amdaie.controlcode {
-        %0 = amdaie.npu.dma_cpy_nd async_source %dma0([] [] [], %from_memref_0[0, 0, 0, 0] [1, 1, 8, 16] [128, 128, 16, 1]) : source_type = !amdaie.logicalobjectfifo<memref<8x16xi32>>
+        %from_memref_1 = amdaie.logicalobjectfifo.from_memref %arg0, {%tile_0_0} : memref<8x16xi32> -> !amdaie.logicalobjectfifo<memref<8x16xi32>>
+        %from_memref_2 = amdaie.logicalobjectfifo.from_memref %arg1, {%tile_1_0} : memref<8x16xi32> -> !amdaie.logicalobjectfifo<memref<8x16xi32>>
+        %from_memref_3 = amdaie.logicalobjectfifo.from_memref %arg2, {%tile_2_0} : memref<8x16xi32> -> !amdaie.logicalobjectfifo<memref<8x16xi32>>
+        %0 = amdaie.npu.dma_cpy_nd async_source %connection_0([] [] [], %from_memref_1[0, 0, 0, 0] [1, 1, 8, 16] [128, 128, 16, 1]) : source_type = !amdaie.logicalobjectfifo<memref<8x16xi32>>
         scf.forall (%arg4, %arg5) in (2, 2) {
-          %1 = amdaie.npu.dma_cpy_nd async_source %dma1([] [] [], %from_memref_1[0, 0, 0] [1, 8, 16] [128, 16, 1]) : source_type = !amdaie.logicalobjectfifo<memref<8x16xi32>>
+          %1 = amdaie.npu.dma_cpy_nd async_source %connection_1([] [] [], %from_memref_2[0, 0, 0] [1, 8, 16] [128, 16, 1]) : source_type = !amdaie.logicalobjectfifo<memref<8x16xi32>>
           scf.for %arg6 = %c0 to %c6 step %c1 {
-            %2 = amdaie.npu.dma_cpy_nd async_source %dma1([] [] [], %from_memref_1[0, 0] [1, 128] [128, 1]) : source_type = !amdaie.logicalobjectfifo<memref<8x16xi32>>
-            %3 = amdaie.npu.dma_cpy_nd async_source %dma0([] [] [], %from_memref_0[0] [128] [1]) : source_type = !amdaie.logicalobjectfifo<memref<8x16xi32>>
+            %2 = amdaie.npu.dma_cpy_nd async_source %connection_1([] [] [], %from_memref_2[0, 0] [1, 128] [128, 1]) : source_type = !amdaie.logicalobjectfifo<memref<8x16xi32>>
+            %3 = amdaie.npu.dma_cpy_nd async_source %connection_0([] [] [], %from_memref_1[0] [128] [1]) : source_type = !amdaie.logicalobjectfifo<memref<8x16xi32>>
             amdaie.npu.dma_wait(%2 : !amdaie.async_source_token)
             amdaie.npu.dma_wait(%3 : !amdaie.async_source_token)
-            %4 = amdaie.npu.dma_cpy_nd async_source %dma2([] [] [], %from_memref_2[] [] []) : source_type = !amdaie.logicalobjectfifo<memref<8x16xi32>>
+            %4 = amdaie.npu.dma_cpy_nd async_source %connection_2([] [] [], %from_memref_3[] [] []) : source_type = !amdaie.logicalobjectfifo<memref<8x16xi32>>
             amdaie.npu.dma_wait(%4 : !amdaie.async_source_token)
           }
           amdaie.npu.dma_wait(%1 : !amdaie.async_source_token)
@@ -316,33 +329,32 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb}
 
 // -----
 
+// Expect all three DMA copy operations to have BD IDs as expressions. #map: 0~15, #map1: 0~7, #map2: 8~15.
+// BD IDs used by #map are released before the innermost loop, so they can be reused by #map1 and #map2.
+
 // CHECK: #map = affine_map<(d0) -> (d0 mod 16)>
 // CHECK: #map1 = affine_map<(d0) -> (d0 mod 8)>
 // CHECK: #map2 = affine_map<(d0) -> (d0 mod 8 + 8)>
 // CHECK-LABEL: @nested_loops_wait_before_innerloop
-// CHECK:   %[[C0:.+]] = arith.constant 0 : index
-// CHECK:   %[[C1:.+]] = arith.constant 1 : index
-// CHECK:   %[[C2:.+]] = arith.constant 2 : index
-// CHECK:   %[[C4:.+]] = arith.constant 4 : index
+// CHECK-DAG:   %[[C0:.+]] = arith.constant 0 : index
+// CHECK-DAG:   %[[C1:.+]] = arith.constant 1 : index
+// CHECK-DAG:   %[[C2:.+]] = arith.constant 2 : index
+// CHECK-DAG:   %[[C4:.+]] = arith.constant 4 : index
 // CHECK:       amdaie.workgroup
-// CHECK-DAG:     %[[TILE_0_0:.+]] = amdaie.tile(%[[C0]], %[[C0]])
-// CHECK-DAG:     %[[FROM_MEMREF_0:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {%[[TILE_0_0]]} : memref<8x16xi32> -> !amdaie.logicalobjectfifo<memref<8x16xi32>>
-// CHECK-DAG:     %[[FROM_MEMREF_1:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {%[[TILE_0_0]]} : memref<8x16xi32> -> !amdaie.logicalobjectfifo<memref<8x16xi32>>
-// CHECK-DAG:     %[[FROM_MEMREF_2:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {%[[TILE_0_0]]} : memref<8x16xi32> -> !amdaie.logicalobjectfifo<memref<8x16xi32>>
-// CHECK:         %[[CIRC_DMA_0:.+]] = amdaie.circular_dma_cpy_nd
+// CHECK:        %[[TILE_0_0:.+]] = amdaie.tile(%[[C0]], %[[C0]])
 // CHECK:         amdaie.controlcode
 // CHECK:           scf.for %[[LOOP_VAR_0:.+]] = %[[C0]] to %[[C4]] step %[[C1]]
 // CHECK:             %[[VAR_0:.+]] = affine.apply #map(%[[LOOP_VAR_0]])
 // CHECK:             %[[BD_ID_0:.+]] = amdaie.bd_id(%[[TILE_0_0]], %[[VAR_0]])
-// CHECK:             %[[NPU_DMA_0:.+]] = amdaie.npu.dma_cpy_nd async_source %[[CIRC_DMA_0]]([] [] [], %[[FROM_MEMREF_0]][] [] [] bd_id = %[[BD_ID_0]])
+// CHECK:             %[[NPU_DMA_0:.+]] = amdaie.npu.dma_cpy_nd async_source %{{.+}}([] [] [], %{{.+}}[] [] [] bd_id = %[[BD_ID_0]])
 // CHECK:             amdaie.npu.dma_wait(%[[NPU_DMA_0]] : !amdaie.async_source_token)
 // CHECK:             scf.for %[[LOOP_VAR_1:.+]] = %[[C0]] to %[[C2]] step %[[C1]]
 // CHECK:               %[[VAR_1:.+]] = affine.apply #map1(%[[LOOP_VAR_1]])
 // CHECK:               %[[BD_ID_1:.+]] = amdaie.bd_id(%[[TILE_0_0]], %[[VAR_1]])
-// CHECK:               %[[NPU_DMA_1:.+]] = amdaie.npu.dma_cpy_nd async_target %[[CIRC_DMA_0]](%[[FROM_MEMREF_1]][] [] [] bd_id = %[[BD_ID_1]], [] [] [])
+// CHECK:               %[[NPU_DMA_1:.+]] = amdaie.npu.dma_cpy_nd async_target %{{.+}}(%{{.+}}[] [] [] bd_id = %[[BD_ID_1]], [] [] [])
 // CHECK:               %[[VAR_2:.+]] = affine.apply #map2(%[[LOOP_VAR_1]])
 // CHECK:               %[[BD_ID_2:.+]] = amdaie.bd_id(%[[TILE_0_0]], %[[VAR_2]])
-// CHECK:               %[[NPU_DMA_2:.+]] = amdaie.npu.dma_cpy_nd async_source %[[CIRC_DMA_0]]([] [] [], %[[FROM_MEMREF_2]][] [] [] bd_id = %[[BD_ID_2]])
+// CHECK:               %[[NPU_DMA_2:.+]] = amdaie.npu.dma_cpy_nd async_source %{{.+}}([] [] [], %{{.+}}[] [] [] bd_id = %[[BD_ID_2]])
 // CHECK:               amdaie.npu.dma_wait(%[[NPU_DMA_1]] : !amdaie.async_target_token)
 // CHECK:               amdaie.npu.dma_wait(%[[NPU_DMA_2]] : !amdaie.async_source_token)
 // CHECK:             }
@@ -356,19 +368,28 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb}
     %c4 = arith.constant 4 : index
     amdaie.workgroup {
       %tile_0_0 = amdaie.tile(%c0, %c0)
-      %placeholder0 = amdaie.logicalobjectfifo.placeholder{%tile_0_0} : !amdaie.logicalobjectfifo<memref<8x16xi32>>
-      %from_memref_0 = amdaie.logicalobjectfifo.from_memref %arg0, {%tile_0_0} : memref<8x16xi32> -> !amdaie.logicalobjectfifo<memref<8x16xi32>>
-      %from_memref_1 = amdaie.logicalobjectfifo.from_memref %arg1, {%tile_0_0} : memref<8x16xi32> -> !amdaie.logicalobjectfifo<memref<8x16xi32>>
-      %from_memref_2 = amdaie.logicalobjectfifo.from_memref %arg2, {%tile_0_0} : memref<8x16xi32> -> !amdaie.logicalobjectfifo<memref<8x16xi32>>
-      %from_memref_3 = amdaie.logicalobjectfifo.from_memref %arg3, {%tile_0_0} : memref<1x1x8x16xi32, 1> -> !amdaie.logicalobjectfifo<memref<1x1x8x16xi32, 1>>
-      %dma0 = amdaie.circular_dma_cpy_nd(%from_memref_3[] [] [], %placeholder0[] [] []) : (!amdaie.logicalobjectfifo<memref<1x1x8x16xi32, 1>>, !amdaie.logicalobjectfifo<memref<8x16xi32>>)
+      %tile_0_1 = amdaie.tile(%c0, %c1)
+      %channel_0 = amdaie.channel(%tile_0_0, 0, port_type = DMA, direction = MM2S)
+      %channel_1 = amdaie.channel(%tile_0_0, 0, port_type = DMA, direction = S2MM)
+      %channel_2 = amdaie.channel(%tile_0_0, 1, port_type = DMA, direction = MM2S)
+      %channel_3 = amdaie.channel(%tile_0_1, 0, port_type = DMA, direction = S2MM)
+      %channel_4 = amdaie.channel(%tile_0_1, 0, port_type = DMA, direction = MM2S)
+      %channel_5 = amdaie.channel(%tile_0_1, 1, port_type = DMA, direction = S2MM)
+      %from_memref_0 = amdaie.logicalobjectfifo.from_memref %arg3, {%tile_0_1} : memref<1x1x8x16xi32, 1> -> !amdaie.logicalobjectfifo<memref<128xi32, 1>, 2>
+      %placeholder = amdaie.logicalobjectfifo.placeholder{%tile_0_0} : !amdaie.logicalobjectfifo<memref<8x16xi32>>
+      %connection_0 = amdaie.connection(%from_memref_0 {%channel_3}, %placeholder {%channel_0}) {connection_type = #amdaie<connection_type Circuit>} : (!amdaie.logicalobjectfifo<memref<128xi32, 1>, 2>, !amdaie.logicalobjectfifo<memref<8x16xi32>>)
+      %connection_1 = amdaie.connection(%placeholder {%channel_1}, %from_memref_0 {%channel_4}) {connection_type = #amdaie<connection_type Circuit>} : (!amdaie.logicalobjectfifo<memref<8x16xi32>>, !amdaie.logicalobjectfifo<memref<128xi32, 1>, 2>)
+      %connection_2 = amdaie.connection(%from_memref_0 {%channel_5}, %placeholder {%channel_2}) {connection_type = #amdaie<connection_type Circuit>} : (!amdaie.logicalobjectfifo<memref<128xi32, 1>, 2>, !amdaie.logicalobjectfifo<memref<8x16xi32>>)
       amdaie.controlcode {
+        %from_memref_1 = amdaie.logicalobjectfifo.from_memref %arg0, {%tile_0_0} : memref<8x16xi32> -> !amdaie.logicalobjectfifo<memref<8x16xi32>>
+        %from_memref_2 = amdaie.logicalobjectfifo.from_memref %arg1, {%tile_0_0} : memref<8x16xi32> -> !amdaie.logicalobjectfifo<memref<8x16xi32>>
+        %from_memref_3 = amdaie.logicalobjectfifo.from_memref %arg2, {%tile_0_0} : memref<8x16xi32> -> !amdaie.logicalobjectfifo<memref<8x16xi32>>
         scf.for %arg4 = %c0 to %c4 step %c1 {
-          %0 = amdaie.npu.dma_cpy_nd async_source %dma0([] [] [], %from_memref_0[] [] []) : source_type = !amdaie.logicalobjectfifo<memref<8x16xi32>>
+          %0 = amdaie.npu.dma_cpy_nd async_source %connection_0([] [] [], %from_memref_1[] [] []) : source_type = !amdaie.logicalobjectfifo<memref<8x16xi32>>
           amdaie.npu.dma_wait(%0 : !amdaie.async_source_token)
           scf.for %arg5 = %c0 to %c2 step %c1 {
-            %1 = amdaie.npu.dma_cpy_nd async_target %dma0(%from_memref_1[] [] [], [] [] []) : target_type = !amdaie.logicalobjectfifo<memref<8x16xi32>>
-            %2 = amdaie.npu.dma_cpy_nd async_source %dma0([] [] [], %from_memref_2[] [] []) : source_type = !amdaie.logicalobjectfifo<memref<8x16xi32>>            
+            %1 = amdaie.npu.dma_cpy_nd async_target %connection_1(%from_memref_2[] [] [], [] [] []) : target_type = !amdaie.logicalobjectfifo<memref<8x16xi32>>
+            %2 = amdaie.npu.dma_cpy_nd async_source %connection_2([] [] [], %from_memref_3[] [] []) : source_type = !amdaie.logicalobjectfifo<memref<8x16xi32>>            
             amdaie.npu.dma_wait(%1 : !amdaie.async_target_token)
             amdaie.npu.dma_wait(%2 : !amdaie.async_source_token)
           }
@@ -382,32 +403,31 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb}
 
 // -----
 
+// Expect all three DMA copy operations to have BD IDs as expressions. #map: 0~1, #map1: 2~8, #map2: 9~15.
+// BD IDs used by #map are released after the innermost loop, so they cannot be reused by #map1 and #map2.
+
 // CHECK: #map = affine_map<(d0) -> (d0 mod 2)>
 // CHECK: #map1 = affine_map<(d0) -> (d0 mod 7 + 2)>
 // CHECK: #map2 = affine_map<(d0) -> (d0 mod 7 + 9)>
 // CHECK-LABEL: @nested_loops_wait_after_innerloop
-// CHECK:   %[[C0:.+]] = arith.constant 0 : index
-// CHECK:   %[[C1:.+]] = arith.constant 1 : index
-// CHECK:   %[[C2:.+]] = arith.constant 2 : index
-// CHECK:   %[[C4:.+]] = arith.constant 4 : index
+// CHECK-DAG:   %[[C0:.+]] = arith.constant 0 : index
+// CHECK-DAG:   %[[C1:.+]] = arith.constant 1 : index
+// CHECK-DAG:   %[[C2:.+]] = arith.constant 2 : index
+// CHECK-DAG:   %[[C4:.+]] = arith.constant 4 : index
 // CHECK:       amdaie.workgroup
-// CHECK-DAG:     %[[TILE_0_0:.+]] = amdaie.tile(%[[C0]], %[[C0]])
-// CHECK-DAG:     %[[FROM_MEMREF_0:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {%[[TILE_0_0]]} : memref<8x16xi32> -> !amdaie.logicalobjectfifo<memref<8x16xi32>>
-// CHECK-DAG:     %[[FROM_MEMREF_1:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {%[[TILE_0_0]]} : memref<8x16xi32> -> !amdaie.logicalobjectfifo<memref<8x16xi32>>
-// CHECK-DAG:     %[[FROM_MEMREF_2:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {%[[TILE_0_0]]} : memref<8x16xi32> -> !amdaie.logicalobjectfifo<memref<8x16xi32>>
-// CHECK:         %[[CIRC_DMA_0:.+]] = amdaie.circular_dma_cpy_nd
+// CHECK:         %[[TILE_0_0:.+]] = amdaie.tile(%[[C0]], %[[C0]])
 // CHECK:         amdaie.controlcode
 // CHECK:           scf.for %[[LOOP_VAR_0:.+]] = %[[C0]] to %[[C4]] step %[[C1]]
 // CHECK:             %[[VAR_0:.+]] = affine.apply #map(%[[LOOP_VAR_0]])
 // CHECK:             %[[BD_ID_0:.+]] = amdaie.bd_id(%[[TILE_0_0]], %[[VAR_0]])
-// CHECK:             %[[NPU_DMA_0:.+]] = amdaie.npu.dma_cpy_nd async_source %[[CIRC_DMA_0]]([] [] [], %[[FROM_MEMREF_0]][] [] [] bd_id = %[[BD_ID_0]])
+// CHECK:             %[[NPU_DMA_0:.+]] = amdaie.npu.dma_cpy_nd async_source %{{.+}}([] [] [], %{{.+}}[] [] [] bd_id = %[[BD_ID_0]])
 // CHECK:             scf.for %[[LOOP_VAR_1:.+]] = %[[C0]] to %[[C2]] step %[[C1]]
 // CHECK:               %[[VAR_1:.+]] = affine.apply #map1(%[[LOOP_VAR_1]])
 // CHECK:               %[[BD_ID_1:.+]] = amdaie.bd_id(%[[TILE_0_0]], %[[VAR_1]])
-// CHECK:               %[[NPU_DMA_1:.+]] = amdaie.npu.dma_cpy_nd async_target %[[CIRC_DMA_0]](%[[FROM_MEMREF_1]][] [] [] bd_id = %[[BD_ID_1]], [] [] [])
+// CHECK:               %[[NPU_DMA_1:.+]] = amdaie.npu.dma_cpy_nd async_target %{{.+}}(%{{.+}}[] [] [] bd_id = %[[BD_ID_1]], [] [] [])
 // CHECK:               %[[VAR_2:.+]] = affine.apply #map2(%[[LOOP_VAR_1]])
 // CHECK:               %[[BD_ID_2:.+]] = amdaie.bd_id(%[[TILE_0_0]], %[[VAR_2]])
-// CHECK:               %[[NPU_DMA_2:.+]] = amdaie.npu.dma_cpy_nd async_source %[[CIRC_DMA_0]]([] [] [], %[[FROM_MEMREF_2]][] [] [] bd_id = %[[BD_ID_2]])
+// CHECK:               %[[NPU_DMA_2:.+]] = amdaie.npu.dma_cpy_nd async_source %{{.+}}([] [] [], %{{.+}}[] [] [] bd_id = %[[BD_ID_2]])
 // CHECK:               amdaie.npu.dma_wait(%[[NPU_DMA_1]] : !amdaie.async_target_token)
 // CHECK:               amdaie.npu.dma_wait(%[[NPU_DMA_2]] : !amdaie.async_source_token)
 // CHECK:             }
@@ -422,18 +442,27 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb}
     %c4 = arith.constant 4 : index
     amdaie.workgroup {
       %tile_0_0 = amdaie.tile(%c0, %c0)
-      %placeholder0 = amdaie.logicalobjectfifo.placeholder{%tile_0_0} : !amdaie.logicalobjectfifo<memref<8x16xi32>>
-      %from_memref_0 = amdaie.logicalobjectfifo.from_memref %arg0, {%tile_0_0} : memref<8x16xi32> -> !amdaie.logicalobjectfifo<memref<8x16xi32>>
-      %from_memref_1 = amdaie.logicalobjectfifo.from_memref %arg1, {%tile_0_0} : memref<8x16xi32> -> !amdaie.logicalobjectfifo<memref<8x16xi32>>
-      %from_memref_2 = amdaie.logicalobjectfifo.from_memref %arg2, {%tile_0_0} : memref<8x16xi32> -> !amdaie.logicalobjectfifo<memref<8x16xi32>>
-      %from_memref_3 = amdaie.logicalobjectfifo.from_memref %arg3, {%tile_0_0} : memref<1x1x8x16xi32, 1> -> !amdaie.logicalobjectfifo<memref<1x1x8x16xi32, 1>>
-      %dma0 = amdaie.circular_dma_cpy_nd(%from_memref_3[] [] [], %placeholder0[] [] []) : (!amdaie.logicalobjectfifo<memref<1x1x8x16xi32, 1>>, !amdaie.logicalobjectfifo<memref<8x16xi32>>)
+      %tile_0_1 = amdaie.tile(%c0, %c1)
+      %channel_0 = amdaie.channel(%tile_0_0, 0, port_type = DMA, direction = MM2S)
+      %channel_1 = amdaie.channel(%tile_0_0, 0, port_type = DMA, direction = S2MM)
+      %channel_2 = amdaie.channel(%tile_0_0, 1, port_type = DMA, direction = MM2S)
+      %channel_3 = amdaie.channel(%tile_0_1, 0, port_type = DMA, direction = S2MM)
+      %channel_4 = amdaie.channel(%tile_0_1, 0, port_type = DMA, direction = MM2S)
+      %channel_5 = amdaie.channel(%tile_0_1, 1, port_type = DMA, direction = S2MM)
+      %from_memref_0 = amdaie.logicalobjectfifo.from_memref %arg3, {%tile_0_1} : memref<1x1x8x16xi32, 1> -> !amdaie.logicalobjectfifo<memref<128xi32, 1>, 2>
+      %placeholder = amdaie.logicalobjectfifo.placeholder{%tile_0_0} : !amdaie.logicalobjectfifo<memref<8x16xi32>>
+      %connection_0 = amdaie.connection(%from_memref_0 {%channel_3}, %placeholder {%channel_0}) {connection_type = #amdaie<connection_type Circuit>} : (!amdaie.logicalobjectfifo<memref<128xi32, 1>, 2>, !amdaie.logicalobjectfifo<memref<8x16xi32>>)
+      %connection_1 = amdaie.connection(%placeholder {%channel_1}, %from_memref_0 {%channel_4}) {connection_type = #amdaie<connection_type Circuit>} : (!amdaie.logicalobjectfifo<memref<8x16xi32>>, !amdaie.logicalobjectfifo<memref<128xi32, 1>, 2>)
+      %connection_2 = amdaie.connection(%from_memref_0 {%channel_5}, %placeholder {%channel_2}) {connection_type = #amdaie<connection_type Circuit>} : (!amdaie.logicalobjectfifo<memref<128xi32, 1>, 2>, !amdaie.logicalobjectfifo<memref<8x16xi32>>)
       amdaie.controlcode {
+        %from_memref_1 = amdaie.logicalobjectfifo.from_memref %arg0, {%tile_0_0} : memref<8x16xi32> -> !amdaie.logicalobjectfifo<memref<8x16xi32>>
+        %from_memref_2 = amdaie.logicalobjectfifo.from_memref %arg1, {%tile_0_0} : memref<8x16xi32> -> !amdaie.logicalobjectfifo<memref<8x16xi32>>
+        %from_memref_3 = amdaie.logicalobjectfifo.from_memref %arg2, {%tile_0_0} : memref<8x16xi32> -> !amdaie.logicalobjectfifo<memref<8x16xi32>>
         scf.for %arg4 = %c0 to %c4 step %c1 {
-          %0 = amdaie.npu.dma_cpy_nd async_source %dma0([] [] [], %from_memref_0[] [] []) : source_type = !amdaie.logicalobjectfifo<memref<8x16xi32>>
+          %0 = amdaie.npu.dma_cpy_nd async_source %connection_0([] [] [], %from_memref_1[] [] []) : source_type = !amdaie.logicalobjectfifo<memref<8x16xi32>>
           scf.for %arg5 = %c0 to %c2 step %c1 {
-            %1 = amdaie.npu.dma_cpy_nd async_target %dma0(%from_memref_1[] [] [], [] [] []) : target_type = !amdaie.logicalobjectfifo<memref<8x16xi32>>
-            %2 = amdaie.npu.dma_cpy_nd async_source %dma0([] [] [], %from_memref_2[] [] []) : source_type = !amdaie.logicalobjectfifo<memref<8x16xi32>>            
+            %1 = amdaie.npu.dma_cpy_nd async_target %connection_1(%from_memref_2[] [] [], [] [] []) : target_type = !amdaie.logicalobjectfifo<memref<8x16xi32>>
+            %2 = amdaie.npu.dma_cpy_nd async_source %connection_2([] [] [], %from_memref_3[] [] []) : source_type = !amdaie.logicalobjectfifo<memref<8x16xi32>>            
             amdaie.npu.dma_wait(%1 : !amdaie.async_target_token)
             amdaie.npu.dma_wait(%2 : !amdaie.async_source_token)
           }