From 2243dd86d316568247e678c227fa41a249af03cb Mon Sep 17 00:00:00 2001
From: Zhewen Yu
Date: Mon, 9 Dec 2024 22:53:25 +0000
Subject: [PATCH] Add a pass to fold DMA waits (#962)

Each DMA channel has a task queue with a depth of 4. DMA wait is only
required for every 4 pushes, reducing unnecessary synchronization.

Example:
https://gist.github.com/Yu-Zhewen/5f569b56c7b1f1a8715a7c4c3bf9e609

Results compared to 7c4b98571d86d7ec25daa20f1d0ac9e2be8a8c05:

| Test (MxKxN)  | Instruction Size Before (Words) | Instruction Size After (Words) |
|---------------|---------------------------------|--------------------------------|
| 512x4096x512  | 1228                            | 1132                           |
| 512x512x4096  | 820                             | 772                            |
| 4096x512x512  | 4628                            | 4244                           |

This optimization is orthogonal to DMA chaining #931.

---------

Co-authored-by: James Newling
---
 .../AMDAIEControlCodeToTransaction.cpp        |   6 +-
 .../Transforms/AMDAIEFoldDmaWaits.cpp         | 198 ++++++++++++++++
 .../iree-amd-aie/Transforms/CMakeLists.txt    |   1 +
 .../iree-amd-aie/Transforms/PassDetail.h      |   1 +
 .../iree-amd-aie/Transforms/Passes.cpp        |   1 +
 .../AMD-AIE/iree-amd-aie/Transforms/Passes.h  |   3 +
 .../AMD-AIE/iree-amd-aie/Transforms/Passes.td |   6 +
 .../Transforms/test/CMakeLists.txt            |   1 +
 .../test/controlcode_to_transaction.mlir      |   4 +-
 .../Transforms/test/fold_dma_waits.mlir       | 222 ++++++++++++++++++
 .../aie_runtime/iree_aie_runtime.cc           |   8 +
 .../aie_runtime/iree_aie_runtime.h            |   2 +
 12 files changed, 448 insertions(+), 5 deletions(-)
 create mode 100644 compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEFoldDmaWaits.cpp
 create mode 100644 compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/fold_dma_waits.mlir

diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEControlCodeToTransaction.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEControlCodeToTransaction.cpp
index db8477976..665ea08a8 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEControlCodeToTransaction.cpp
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEControlCodeToTransaction.cpp
@@ -220,9 +220,9 @@ LogicalResult convertOp(AMDAIE::NpuDmaWaitOp op, TransactionBuilder &builder) {
 LogicalResult convertOp(AMDAIE::NpuPushToQueueOp op,
                         TransactionBuilder &builder) {
   uint32_t repeatCount = op.getRepeatCount() - 1;
-  if (failed(builder.appendPushToQueueOp(op.getCol(), op.getRow(),
-                                         op.getDirection(), op.getChannel(),
-                                         op.getBdId(), repeatCount, true))) {
+  if (failed(builder.appendPushToQueueOp(
+          op.getCol(), op.getRow(), op.getDirection(), op.getChannel(),
+          op.getBdId(), repeatCount, static_cast<bool>(op.getAsyncToken())))) {
     return failure();
   }
   return success();
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEFoldDmaWaits.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEFoldDmaWaits.cpp
new file mode 100644
index 000000000..670edeab4
--- /dev/null
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEFoldDmaWaits.cpp
@@ -0,0 +1,198 @@
+// Copyright 2024 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree-amd-aie/IR/AMDAIEOps.h"
+#include "iree-amd-aie/Transforms/Passes.h"
+#include "iree-amd-aie/Transforms/Utils/AMDAIEDmaUtils.h"
+#include "iree-amd-aie/Transforms/Utils/AMDAIEUtils.h"
+#include "iree-amd-aie/aie_runtime/iree_aie_runtime.h"
+#include "mlir/IR/Iterators.h"
+
+#define DEBUG_TYPE "iree-amdaie-fold-dma-waits"
+
+namespace mlir::iree_compiler::AMDAIE {
+
+namespace {
+
+/// Utility function to determine whether a DMA wait op can be folded based on
+/// its half DMA copy operation.
+FailureOr<bool> canFoldBasedOnHalfDmaCpy(
+    const AMDAIE::AMDAIEDeviceModel &deviceModel,
+    AMDAIE::NpuHalfDmaCpyNdOp &npuHalfDmaCpyNdOp,
+    DenseMap<std::pair<AMDAIE::TileOp, AMDAIE::ConnectionOp>,
+             SmallVector<uint32_t>> &tileConnectToBdIdQueue) {
+  // Retrieve the connection op.
+  std::optional<AMDAIE::ConnectionOp> maybeConnectionOp =
+      npuHalfDmaCpyNdOp.getConnectionOp();
+  if (!maybeConnectionOp) {
+    return npuHalfDmaCpyNdOp.emitOpError()
+           << "expected to operate on an `amdaie.connection`";
+  }
+  AMDAIE::ConnectionOp connectionOp = maybeConnectionOp.value();
+
+  // Retrieve the flow op.
+  std::optional<AMDAIE::FlowOp> maybeFlowOp = connectionOp.getFlowOp();
+  if (!maybeFlowOp) {
+    return connectionOp->emitOpError()
+           << "expected to operate on an `amdaie.flow`";
+  }
+  AMDAIE::FlowOp flowOp = maybeFlowOp.value();
+  bool isPacketFlow = flowOp.getIsPacketFlow();
+
+  // Retrieve the BD ID op.
+  std::optional<AMDAIE::BdIdOp> maybeBdIdOp = npuHalfDmaCpyNdOp.getBdIdOp();
+  if (!maybeBdIdOp) {
+    return npuHalfDmaCpyNdOp.emitOpError()
+           << "must have a BD ID op to lower to "
+              "`amdaie.npu.write_bd`";
+  }
+  AMDAIE::BdIdOp bdIdOp = maybeBdIdOp.value();
+
+  // Retrieve the tile op.
+  AMDAIE::TileOp tileOp =
+      dyn_cast_if_present<AMDAIE::TileOp>(bdIdOp.getTile().getDefiningOp());
+  if (!tileOp) {
+    return bdIdOp.emitOpError() << "must operate on an `amdaie.tile`";
+  }
+
+  // Get the maximum queue size.
+  uint32_t col = getConstantIndexOrAssert(tileOp.getCol());
+  uint32_t row = getConstantIndexOrAssert(tileOp.getRow());
+  uint32_t maxQueueSize = deviceModel.getDmaMaxQueueSize(col, row);
+
+  // Keep the wait op if any of the following holds: the queue has reached
+  // its maximum size, the BD ID is a duplicate within the same tile, the
+  // flow is a packet flow, or the queue is empty.
+  uint32_t bdId = getConstantIndexOrAssert(bdIdOp.getValue());
+  bool isDuplicateBdId =
+      llvm::any_of(tileConnectToBdIdQueue, [&](const auto &entry) {
+        return entry.first.first == tileOp &&
+               llvm::is_contained(entry.second, bdId);
+      });
+  SmallVector<uint32_t> &bdIdQueue =
+      tileConnectToBdIdQueue[{tileOp, connectionOp}];
+  bool canFold = true;
+  if (isDuplicateBdId || isPacketFlow || bdIdQueue.size() >= maxQueueSize ||
+      bdIdQueue.empty()) {
+    bdIdQueue.clear();
+    canFold = false;
+  }
+  bdIdQueue.push_back(bdId);
+  return canFold;
+}
+
+/// Traverses the control code in reverse, ensuring that for each connection,
+/// only one DMA wait op is retained for every maximum queue size.
+///
+/// Example Output: assuming a maximum queue size of 4.
+///   dma_cpy_nd
+///   %0 = dma_cpy_nd
+///   dma_wait(%0)
+///   dma_cpy_nd
+///   dma_cpy_nd
+///   dma_cpy_nd
+///   %1 = dma_cpy_nd
+///   dma_wait(%1)
+/// From the bottom up, for every four DMA copy operations, only one DMA wait
+/// operation is retained.
+///
+/// Reverse traversal simplifies handling duplicate BD IDs, preventing
+/// the need to revisit and modify earlier operations after processing later
+/// ones.
+LogicalResult foldDmaWaits(const AMDAIE::AMDAIEDeviceModel &deviceModel,
+                           AMDAIE::ControlCodeOp controlCodeOp) {
+  IRRewriter rewriter(controlCodeOp->getContext());
+  std::vector<AMDAIE::NpuDmaWaitOp> waitOpsToErase;
+  DenseMap<std::pair<AMDAIE::TileOp, AMDAIE::ConnectionOp>,
+           SmallVector<uint32_t>>
+      tileConnectToBdIdQueue;
+  // Traverse the control code in reverse.
+  WalkResult res = controlCodeOp->walk<WalkOrder::PostOrder, ReverseIterator>(
+      [&](AMDAIE::NpuDmaWaitOp waitOp) {
+        bool toErase = true;
+        for (Value token : waitOp.getAsyncTokens()) {
+          if (auto npuHalfDmaCpyNdOp =
+                  dyn_cast_if_present<AMDAIE::NpuHalfDmaCpyNdOp>(
+                      token.getDefiningOp())) {
+            FailureOr<bool> result = canFoldBasedOnHalfDmaCpy(
+                deviceModel, npuHalfDmaCpyNdOp, tileConnectToBdIdQueue);
+            if (failed(result)) return WalkResult::interrupt();
+            toErase &= *result;
+          }
+        }
+        // Erase later to avoid invalidating the iterator.
+        if (toErase) waitOpsToErase.push_back(waitOp);
+        return WalkResult::advance();
+      });
+  if (res.wasInterrupted()) return failure();
+
+  for (AMDAIE::NpuDmaWaitOp waitOp : waitOpsToErase) {
+    SmallVector<Value> asyncTokens(waitOp.getAsyncTokens());
+    // Erase the wait op.
+    rewriter.eraseOp(waitOp);
+    for (Value token : asyncTokens) {
+      if (auto op = dyn_cast_if_present<AMDAIE::NpuHalfDmaCpyNdOp>(
+              token.getDefiningOp())) {
+        if (op.use_empty()) {
+          rewriter.setInsertionPoint(op);
+          TypeRange resultTypeRange = TypeRange{};
+          // Nullify the result to avoid issuing a token.
+          rewriter.create<AMDAIE::NpuHalfDmaCpyNdOp>(
+              op.getLoc(), resultTypeRange, op.getConnection(), op.getInput(),
+              op.getMixedOffsets(), op.getMixedSizes(), op.getMixedStrides(),
+              op.getBdId(), op.getChannel());
+          rewriter.eraseOp(op);
+        }
+      }
+    }
+  }
+
+  return success();
+}
+
+class AMDAIEFoldDmaWaitsPass
+    : public impl::AMDAIEFoldDmaWaitsBase<AMDAIEFoldDmaWaitsPass> {
+ public:
+  void getDependentDialects(DialectRegistry &registry) const override {
+    registry.insert<AMDAIEDialect>();
+  }
+
+  AMDAIEFoldDmaWaitsPass() = default;
+  AMDAIEFoldDmaWaitsPass(const AMDAIEFoldDmaWaitsPass &pass){};
+  void runOnOperation() override;
+};
+
+void AMDAIEFoldDmaWaitsPass::runOnOperation() {
+  Operation *parentOp = getOperation();
+
+  auto targetAttr = IREE::HAL::ExecutableTargetAttr::lookup(parentOp);
+  std::optional<AMDAIEDevice> maybeDevice = getConfigAMDAIEDevice(targetAttr);
+  if (!maybeDevice) {
+    parentOp->emitOpError()
+        << "has no AMDAIEDevice in the target attribute configuration. This "
+           "device-specific information is required to fold DMA wait "
+           "ops.";
+    return signalPassFailure();
+  }
+  AMDAIE::AMDAIEDeviceModel deviceModel =
+      AMDAIE::getDeviceModel(maybeDevice.value());
+
+  WalkResult res = parentOp->walk([&](AMDAIE::WorkgroupOp workgroupOp) {
+    AMDAIE::ControlCodeOp controlCodeOp = workgroupOp.getControlCode();
+    if (failed(foldDmaWaits(deviceModel, controlCodeOp))) {
+      return WalkResult::interrupt();
+    }
+    return WalkResult::advance();
+  });
+  if (res.wasInterrupted()) return signalPassFailure();
+}
+
+}  // namespace
+
+std::unique_ptr<Pass> createAMDAIEFoldDmaWaitsPass() {
+  return std::make_unique<AMDAIEFoldDmaWaitsPass>();
+}
+
+}  // namespace mlir::iree_compiler::AMDAIE
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/CMakeLists.txt b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/CMakeLists.txt
index 0e031e6c1..81f020f00 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/CMakeLists.txt
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/CMakeLists.txt
@@ -71,6 +71,7 @@ iree_cc_library(
     "AMDAIEDmaToCircularDma.cpp"
     "AMDAIEFlattenLogicalObjectFifo.cpp"
     "AMDAIELinalgFunctionOutlining.cpp"
+    "AMDAIEFoldDmaWaits.cpp"
    "AMDAIEFuseConsumerIntoLoop.cpp"
    "AMDAIEFuseFillIntoForall.cpp"
    "AMDAIEFusePackIntoLoop.cpp"
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h
index f921f6d47..6cdf14d1b 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h
@@ -52,6 +52,7 @@ namespace mlir::iree_compiler::AMDAIE {
 #define GEN_PASS_DEF_AMDAIEDMATOCIRCULARDMA
 #define GEN_PASS_DEF_AMDAIEFLATTENLOGICALOBJECTFIFO
 #define GEN_PASS_DEF_AMDAIELINALGFUNCTIONOUTLINING
+#define GEN_PASS_DEF_AMDAIEFOLDDMAWAITS
 #define GEN_PASS_DEF_AMDAIEFUSECONSUMERINTOLOOP
 #define GEN_PASS_DEF_AMDAIEFUSEFILLINTOFORALL
 #define GEN_PASS_DEF_AMDAIEFUSEPACKINTOLOOP
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp
index 040fa2b59..9ece915fe 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp
@@ -660,6 +660,7 @@ void addAMDAIEObjectFifoLoweringPasses(
   passManager.addPass(createAMDAIEAssignPacketIdsPass());
 
   passManager.addPass(createAMDAIENpuDmaToHalfDmaCpyNdPass());
+  passManager.addPass(createAMDAIEFoldDmaWaitsPass());
   passManager.addPass(createAMDAIEControlCodeLoweringPass());
   passManager.addPass(createAMDAIEControlCodeToTransactionPass());
 
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h
index cc8a794bc..5fefdf02f 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h
@@ -204,6 +204,9 @@ std::unique_ptr<Pass> createAMDAIEHoistLogicalObjFifoPass();
 std::unique_ptr<Pass> createAMDAIEInsertLoopsForVectorizationPass(
     AMDAIEInsertLoopsForVectorizationOptions options = {});
 
+/// Create a pass to remove redundant DMA wait operations.
+std::unique_ptr<Pass> createAMDAIEFoldDmaWaitsPass();
+
 /// Create a pass to fuse the pack operations into the for loops.
std::unique_ptr createAMDAIEFusePackIntoLoopPass( AMDAIEFusePackIntoLoopOptions options = {}); diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td index f2680ae5e..9d01a5bf5 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td @@ -287,6 +287,12 @@ def AMDAIELinalgFunctionOutlining : let constructor = "mlir::iree_compiler::AMDAIE::createAMDAIELinalgFunctionOutliningPass()"; } +def AMDAIEFoldDmaWaits : + Pass<"iree-amdaie-fold-dma-waits", ""> { + let summary = "Remove redundant dma wait operations in controlcode."; + let constructor = "mlir::iree_compiler::AMDAIE::createAMDAIEFoldDmaWaitsPass()"; +} + def AMDAIEFuseConsumerIntoLoop : InterfacePass<"iree-amdaie-fuse-consumer-into-loop", "mlir::FunctionOpInterface"> { let summary = "Fuse the consumer operation into the innermost last scf loop."; diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt index 7b604e160..5191f1c50 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt @@ -41,6 +41,7 @@ iree_lit_test_suite( "dma_loop_subsumption_circular.mlir" "dma_loop_subsumption.mlir" "dma_to_circular_dma.mlir" + "fold_dma_waits.mlir" "flatten_logical_objectfifo.mlir" "linalg_function_outlining.mlir" "fuse_consumer_into_loop.mlir" diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/controlcode_to_transaction.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/controlcode_to_transaction.mlir index f7704d4db..fa83b2028 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/controlcode_to_transaction.mlir +++ 
b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/controlcode_to_transaction.mlir @@ -75,7 +75,7 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} // CHECK: 0x00000000 // CHECK: 0x0001D214 // CHECK: 0x00000000 -// CHECK: 0x80000000 +// CHECK: 0x00000000 // CHECK: 0x00000018 // CHECK-LABEL: @push_to_queue_default_values // CHECK: npu_instructions = dense_resource : tensor<10xui32> @@ -102,7 +102,7 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} // CHECK: 0x00000000 // CHECK: 0x0601D21C // CHECK: 0x00000000 -// CHECK: 0x803F0002 +// CHECK: 0x003F0002 // CHECK: 0x00000018 // CHECK-LABEL: @push_to_queue // CHECK: npu_instructions = dense_resource : tensor<10xui32> diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/fold_dma_waits.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/fold_dma_waits.mlir new file mode 100644 index 000000000..4032221cc --- /dev/null +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/fold_dma_waits.mlir @@ -0,0 +1,222 @@ +// RUN: iree-opt --pass-pipeline="builtin.module(iree-amdaie-fold-dma-waits)" --split-input-file --verify-diagnostics %s | FileCheck %s + +// expected-error @+1 {{op has no AMDAIEDevice in the target attribute configuration}} +module { + func.func @no_amdaie_device() { + amdaie.workgroup { + amdaie.controlcode { + amdaie.end + } + } + return + } +} + +// ----- + +// CHECK-LABEL: @no_ops +#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> +module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { + func.func @no_ops() { + amdaie.workgroup { + amdaie.controlcode { + amdaie.end + } + } + return + } +} + +// ----- + +// Expect no DMA waits to be folded, since the same BD ID is used. 
+// CHECK-LABEL: @fold_dma_waits_same_bd_id +// CHECK-COUNT-2: amdaie.npu.dma_wait +// CHECK-NOT: amdaie.npu.dma_wait +#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> +#pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect> +module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { + func.func @fold_dma_waits_same_bd_id() { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + amdaie.workgroup { + %tile = amdaie.tile(%c0, %c1) + %tile_0 = amdaie.tile(%c0, %c0) + %buffer = amdaie.buffer(%tile) : memref<2048xi32, 1 : i32> + %buffer_1 = amdaie.buffer(%tile) : memref<2048xi32, 1 : i32> + %lock = amdaie.lock(%tile(4), 4) + %lock_2 = amdaie.lock(%tile(5), 0) + %0 = amdaie.logicalobjectfifo.from_buffers({%buffer, %buffer_1}, {%lock}, {%lock_2}) : memref<2048xi32, 1 : i32>, memref<2048xi32, 1 : i32> -> !amdaie.logicalobjectfifo, 2> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<64x32xi32> + %2 = amdaie.logicalobjectfifo.placeholder{%tile_0} : !amdaie.logicalobjectfifo> + %channel = amdaie.channel(%tile_0, 0, port_type = DMA, direction = MM2S) + %channel_3 = amdaie.channel(%tile, 0, port_type = DMA, direction = S2MM) + %3 = amdaie.flow({%channel} -> {%channel_3}) {is_packet_flow = false} + %4 = amdaie.connection(%0 {%channel_3}, %2 {%channel}, flow = %3) {connection_type = #amdaie} : (!amdaie.logicalobjectfifo, 2>, !amdaie.logicalobjectfifo>) + amdaie.controlcode { + %5 = amdaie.logicalobjectfifo.from_memref %1, {%tile_0} : memref<64x32xi32> -> !amdaie.logicalobjectfifo> + memref.assume_alignment %1, 64 : memref<64x32xi32> + %bd_id = amdaie.bd_id(%tile_0, %c0) + %6 = amdaie.npu.half_dma_cpy_nd async %4(%5 [] [] [] bd_id = %bd_id channel = %channel) : !amdaie.logicalobjectfifo> + amdaie.npu.dma_wait(%6 : 
!amdaie.async_token) + %7 = amdaie.npu.half_dma_cpy_nd async %4(%5 [] [] [] bd_id = %bd_id channel = %channel) : !amdaie.logicalobjectfifo> + amdaie.npu.dma_wait(%7 : !amdaie.async_token) + amdaie.end + } + } + return + } +} + +// ----- + +// DMA queue has a maximum size of 4. To optimize, starting from +// the end of the control code, retain every 4th DMA wait operation +// while folding the others. +// CHECK-LABEL: @fold_dma_waits_max_queue_size +// CHECK: %[[OBJECT_FIFO_0:.+]] = amdaie.logicalobjectfifo.from_buffers +// CHECK: %[[CHANNEL_0:.+]] = amdaie.channel +// CHECK: %[[CHANNEL_1:.+]] = amdaie.channel +// CHECK: %[[CONNECTION:.+]] = amdaie.connection +// CHECK: %[[OBJECT_FIFO_1:.+]] = amdaie.logicalobjectfifo.from_memref +// CHECK: %[[BD_ID_0:.+]] = amdaie.bd_id +// CHECK: %[[TOKEN_0:.+]] = amdaie.npu.half_dma_cpy_nd async %[[CONNECTION]](%[[OBJECT_FIFO_1]] [] [] [] bd_id = %[[BD_ID_0]] channel = %[[CHANNEL_0]]) : !amdaie.logicalobjectfifo> +// CHECK: amdaie.npu.dma_wait(%[[TOKEN_0]] : !amdaie.async_token) +// CHECK: %[[BD_ID_1:.+]] = amdaie.bd_id +// CHECK: amdaie.npu.half_dma_cpy_nd %[[CONNECTION]](%[[OBJECT_FIFO_1]] [] [] [] bd_id = %[[BD_ID_1]] channel = %[[CHANNEL_0]]) : !amdaie.logicalobjectfifo> +// CHECK: %[[BD_ID_2:.+]] = amdaie.bd_id +// CHECK: amdaie.npu.half_dma_cpy_nd %[[CONNECTION]](%[[OBJECT_FIFO_1]] [] [] [] bd_id = %[[BD_ID_2]] channel = %[[CHANNEL_0]]) : !amdaie.logicalobjectfifo> +// CHECK: %[[BD_ID_3:.+]] = amdaie.bd_id +// CHECK: amdaie.npu.half_dma_cpy_nd %[[CONNECTION]](%[[OBJECT_FIFO_1]] [] [] [] bd_id = %[[BD_ID_3]] channel = %[[CHANNEL_0]]) : !amdaie.logicalobjectfifo> +// CHECK: %[[BD_ID_4:.+]] = amdaie.bd_id +// CHECK: %[[TOKEN_1:.+]] = amdaie.npu.half_dma_cpy_nd async %[[CONNECTION]](%[[OBJECT_FIFO_1]] [] [] [] bd_id = %[[BD_ID_4]] channel = %[[CHANNEL_0]]) : !amdaie.logicalobjectfifo> +// CHECK: amdaie.npu.dma_wait(%[[TOKEN_1]] : !amdaie.async_token) +#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", 
"amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> +#pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect> +module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { + func.func @fold_dma_waits_max_queue_size() { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c2 = arith.constant 2 : index + %c3 = arith.constant 3 : index + %c4 = arith.constant 4 : index + amdaie.workgroup { + %tile = amdaie.tile(%c0, %c1) + %tile_0 = amdaie.tile(%c0, %c0) + %buffer = amdaie.buffer(%tile) : memref<2048xi32, 1 : i32> + %buffer_1 = amdaie.buffer(%tile) : memref<2048xi32, 1 : i32> + %lock = amdaie.lock(%tile(4), 4) + %lock_2 = amdaie.lock(%tile(5), 0) + %0 = amdaie.logicalobjectfifo.from_buffers({%buffer, %buffer_1}, {%lock}, {%lock_2}) : memref<2048xi32, 1 : i32>, memref<2048xi32, 1 : i32> -> !amdaie.logicalobjectfifo, 2> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<64x32xi32> + %2 = amdaie.logicalobjectfifo.placeholder{%tile_0} : !amdaie.logicalobjectfifo> + %channel = amdaie.channel(%tile_0, 0, port_type = DMA, direction = MM2S) + %channel_3 = amdaie.channel(%tile, 0, port_type = DMA, direction = S2MM) + %3 = amdaie.flow({%channel} -> {%channel_3}) {is_packet_flow = false} + %4 = amdaie.connection(%0 {%channel_3}, %2 {%channel}, flow = %3) {connection_type = #amdaie} : (!amdaie.logicalobjectfifo, 2>, !amdaie.logicalobjectfifo>) + amdaie.controlcode { + %5 = amdaie.logicalobjectfifo.from_memref %1, {%tile_0} : memref<64x32xi32> -> !amdaie.logicalobjectfifo> + memref.assume_alignment %1, 64 : memref<64x32xi32> + %bd_id = amdaie.bd_id(%tile_0, %c0) + %6 = amdaie.npu.half_dma_cpy_nd async %4(%5 [] [] [] bd_id = %bd_id channel = %channel) : !amdaie.logicalobjectfifo> + amdaie.npu.dma_wait(%6 : !amdaie.async_token) + %bd_id_1 = amdaie.bd_id(%tile_0, %c1) + %7 = 
amdaie.npu.half_dma_cpy_nd async %4(%5 [] [] [] bd_id = %bd_id_1 channel = %channel) : !amdaie.logicalobjectfifo> + amdaie.npu.dma_wait(%7 : !amdaie.async_token) + %bd_id_2 = amdaie.bd_id(%tile_0, %c2) + %8 = amdaie.npu.half_dma_cpy_nd async %4(%5 [] [] [] bd_id = %bd_id_2 channel = %channel) : !amdaie.logicalobjectfifo> + amdaie.npu.dma_wait(%8 : !amdaie.async_token) + %bd_id_3 = amdaie.bd_id(%tile_0, %c3) + %9 = amdaie.npu.half_dma_cpy_nd async %4(%5 [] [] [] bd_id = %bd_id_3 channel = %channel) : !amdaie.logicalobjectfifo> + amdaie.npu.dma_wait(%9 : !amdaie.async_token) + %bd_id_4 = amdaie.bd_id(%tile_0, %c4) + %10 = amdaie.npu.half_dma_cpy_nd async %4(%5 [] [] [] bd_id = %bd_id_4 channel = %channel) : !amdaie.logicalobjectfifo> + amdaie.npu.dma_wait(%10 : !amdaie.async_token) + amdaie.end + } + } + return + } +} + +// ----- + +// Two circuit connections are used, corresponding to two separate channels. +// Each channel operates with its own independent queue. +// CHECK-LABEL: @fold_dma_waits_two_connections +// CHECK: %[[OBJECT_FIFO_0:.+]] = amdaie.logicalobjectfifo.from_buffers +// CHECK: %[[OBJECT_FIFO_1:.+]] = amdaie.logicalobjectfifo.from_buffers +// CHECK: %[[CHANNEL_0:.+]] = amdaie.channel +// CHECK: %[[CHANNEL_1:.+]] = amdaie.channel +// CHECK: %[[CHANNEL_2:.+]] = amdaie.channel +// CHECK: %[[CHANNEL_3:.+]] = amdaie.channel +// CHECK: %[[CONNECTION_0:.+]] = amdaie.connection +// CHECK: %[[CONNECTION_1:.+]] = amdaie.connection +// CHECK: %[[OBJECT_FIFO_2:.+]] = amdaie.logicalobjectfifo.from_memref +// CHECK: %[[OBJECT_FIFO_3:.+]] = amdaie.logicalobjectfifo.from_memref +// CHECK: %[[BD_ID_0:.+]] = amdaie.bd_id +// CHECK: amdaie.npu.half_dma_cpy_nd %[[CONNECTION_0]](%[[OBJECT_FIFO_2]] [] [] [] bd_id = %[[BD_ID_0]] channel = %[[CHANNEL_0]]) : !amdaie.logicalobjectfifo> +// CHECK: %[[BD_ID_1:.+]] = amdaie.bd_id +// CHECK: amdaie.npu.half_dma_cpy_nd %[[CONNECTION_1]](%[[OBJECT_FIFO_3]] [] [] [] bd_id = %[[BD_ID_1]] channel = %[[CHANNEL_2]]) : 
!amdaie.logicalobjectfifo> +// CHECK: %[[BD_ID_2:.+]] = amdaie.bd_id +// CHECK: %[[TOKEN_0:.+]] = amdaie.npu.half_dma_cpy_nd async %[[CONNECTION_0]](%[[OBJECT_FIFO_2]] [] [] [] bd_id = %[[BD_ID_2]] channel = %[[CHANNEL_0]]) : !amdaie.logicalobjectfifo> +// CHECK: amdaie.npu.dma_wait(%[[TOKEN_0]] : !amdaie.async_token) +// CHECK: %[[BD_ID_3:.+]] = amdaie.bd_id +// CHECK: %[[TOKEN_1:.+]] = amdaie.npu.half_dma_cpy_nd async %[[CONNECTION_1]](%[[OBJECT_FIFO_3]] [] [] [] bd_id = %[[BD_ID_3]] channel = %[[CHANNEL_2]]) : !amdaie.logicalobjectfifo> +// CHECK: amdaie.npu.dma_wait(%[[TOKEN_1]] : !amdaie.async_token) +#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> +#pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect> +module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { + func.func @fold_dma_waits_two_connections() { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c2 = arith.constant 2 : index + %c3 = arith.constant 3 : index + amdaie.workgroup { + %tile = amdaie.tile(%c0, %c1) + %tile_0 = amdaie.tile(%c0, %c0) + %buffer = amdaie.buffer(%tile) : memref<2048xi32, 1 : i32> + %buffer_1 = amdaie.buffer(%tile) : memref<2048xi32, 1 : i32> + %buffer_2 = amdaie.buffer(%tile) : memref<2048xi32, 1 : i32> + %buffer_3 = amdaie.buffer(%tile) : memref<2048xi32, 1 : i32> + %lock = amdaie.lock(%tile(4), 4) + %lock_4 = amdaie.lock(%tile(5), 0) + %lock_5 = amdaie.lock(%tile(6), 4) + %lock_6 = amdaie.lock(%tile(7), 0) + %0 = amdaie.logicalobjectfifo.from_buffers({%buffer, %buffer_1}, {%lock}, {%lock_4}) : memref<2048xi32, 1 : i32>, memref<2048xi32, 1 : i32> -> !amdaie.logicalobjectfifo, 2> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<64x32xi32> + %2 = amdaie.logicalobjectfifo.placeholder{%tile_0} : 
!amdaie.logicalobjectfifo> + %3 = amdaie.logicalobjectfifo.from_buffers({%buffer_2, %buffer_3}, {%lock_5}, {%lock_6}) : memref<2048xi32, 1 : i32>, memref<2048xi32, 1 : i32> -> !amdaie.logicalobjectfifo, 2> + %4 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<64x32xi32> + %5 = amdaie.logicalobjectfifo.placeholder{%tile_0} : !amdaie.logicalobjectfifo> + %channel = amdaie.channel(%tile_0, 0, port_type = DMA, direction = MM2S) + %channel_7 = amdaie.channel(%tile_0, 1, port_type = DMA, direction = MM2S) + %channel_8 = amdaie.channel(%tile, 0, port_type = DMA, direction = S2MM) + %channel_9 = amdaie.channel(%tile, 1, port_type = DMA, direction = S2MM) + %6 = amdaie.flow({%channel} -> {%channel_7}) {is_packet_flow = false} + %7 = amdaie.flow({%channel_8} -> {%channel_9}) {is_packet_flow = false} + %8 = amdaie.connection(%0 {%channel_7}, %2 {%channel}, flow = %6) {connection_type = #amdaie} : (!amdaie.logicalobjectfifo, 2>, !amdaie.logicalobjectfifo>) + %9 = amdaie.connection(%3 {%channel_9}, %5 {%channel_8}, flow = %7) {connection_type = #amdaie} : (!amdaie.logicalobjectfifo, 2>, !amdaie.logicalobjectfifo>) + amdaie.controlcode { + %10 = amdaie.logicalobjectfifo.from_memref %1, {%tile_0} : memref<64x32xi32> -> !amdaie.logicalobjectfifo> + memref.assume_alignment %1, 64 : memref<64x32xi32> + %11 = amdaie.logicalobjectfifo.from_memref %4, {%tile_0} : memref<64x32xi32> -> !amdaie.logicalobjectfifo> + memref.assume_alignment %4, 64 : memref<64x32xi32> + %bd_id = amdaie.bd_id(%tile_0, %c0) + %12 = amdaie.npu.half_dma_cpy_nd async %8(%10 [] [] [] bd_id = %bd_id channel = %channel) : !amdaie.logicalobjectfifo> + amdaie.npu.dma_wait(%12 : !amdaie.async_token) + %bd_id_1 = amdaie.bd_id(%tile_0, %c1) + %13 = amdaie.npu.half_dma_cpy_nd async %9(%11 [] [] [] bd_id = %bd_id_1 channel = %channel_8) : !amdaie.logicalobjectfifo> + amdaie.npu.dma_wait(%13 : !amdaie.async_token) + %bd_id_2 = 
amdaie.bd_id(%tile_0, %c2)
+        %14 = amdaie.npu.half_dma_cpy_nd async %8(%10 [] [] [] bd_id = %bd_id_2 channel = %channel) : !amdaie.logicalobjectfifo<memref<64x32xi32>>
+        amdaie.npu.dma_wait(%14 : !amdaie.async_token)
+        %bd_id_3 = amdaie.bd_id(%tile_0, %c3)
+        %15 = amdaie.npu.half_dma_cpy_nd async %9(%11 [] [] [] bd_id = %bd_id_3 channel = %channel_8) : !amdaie.logicalobjectfifo<memref<64x32xi32>>
+        amdaie.npu.dma_wait(%15 : !amdaie.async_token)
+        amdaie.end
+      }
+    }
+    return
+  }
+}
diff --git a/runtime/src/iree-amd-aie/aie_runtime/iree_aie_runtime.cc b/runtime/src/iree-amd-aie/aie_runtime/iree_aie_runtime.cc
index 58d068743..bc5bca39d 100644
--- a/runtime/src/iree-amd-aie/aie_runtime/iree_aie_runtime.cc
+++ b/runtime/src/iree-amd-aie/aie_runtime/iree_aie_runtime.cc
@@ -225,6 +225,14 @@ bool AMDAIEDeviceModel::isShimTile(uint8_t col, uint8_t row) const {
   return row == configPtr.ShimRowNum;
 }
 
+uint8_t AMDAIEDeviceModel::getDmaMaxQueueSize(uint8_t col, uint8_t row) const {
+  uint8_t maxQueueSize = 0;
+  TRY_XAIE_API_FATAL_ERROR(XAie_DmaGetMaxQueueSize,
+                           const_cast<XAie_DevInst *>(&devInst),
+                           XAie_TileLoc(col, row), &maxQueueSize);
+  return maxQueueSize;
+}
+
 // TODO(max): these should be optionals instead of returning 0.
 uint32_t AMDAIEDeviceModel::getNumLocks(uint8_t col, uint8_t row) const {
   AMDAIETileType tileType = getTileType(col, row);
diff --git a/runtime/src/iree-amd-aie/aie_runtime/iree_aie_runtime.h b/runtime/src/iree-amd-aie/aie_runtime/iree_aie_runtime.h
index 278a208bc..742eb8a59 100644
--- a/runtime/src/iree-amd-aie/aie_runtime/iree_aie_runtime.h
+++ b/runtime/src/iree-amd-aie/aie_runtime/iree_aie_runtime.h
@@ -315,6 +315,8 @@ struct AMDAIEDeviceModel {
     return *((const T *)(dmaBdMod + static_cast<uint8_t>(dmaBdProp)));
   }
 
+  uint8_t getDmaMaxQueueSize(uint8_t col, uint8_t row) const;
+
   uint32_t getNumLocks(uint8_t col, uint8_t row) const;
 
   std::optional<TileLoc> getMemWest(TileLoc src) const;