diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.td b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.td
index 371945da7..17d2d604d 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.td
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.td
@@ -1022,6 +1022,31 @@ def AMDAIE_NpuWriteBdOp: AMDAIE_Op<"npu.write_bd"> {
   let assemblyFormat = [{ attr-dict }];
 }
 
+def AMDAIE_NpuTctSyncOp: AMDAIE_Op<"npu.tct_sync"> {
+  let summary = "Wait for the TCTs to be emitted.";
+  let description = [{
+    This NPU controller operation waits for the Task Completion Tokens (TCTs)
+    emitted on the specified `channel` and `direction`. The ranges of tiles to
+    synchronize are defined by [col, col+col_num) and [row, row+row_num).
+
+    Example:
+
+    ```mlir
+    amdaie.npu.tct_sync {col = 0 : ui32, row = 0 : ui32, channel = 0 : ui32,
+      direction = 1 : i32, col_num = 1 : ui32, row_num = 1 : ui32}
+    ```
+  }];
+  let arguments = (
+    ins UI32Attr:$col,
+        UI32Attr:$row,
+        DMAChannelDir:$direction,
+        UI32Attr:$channel,
+        UI32Attr:$col_num,
+        UI32Attr:$row_num
+  );
+  let assemblyFormat = [{ attr-dict }];
+}
+
 //===----------------------------------------------------------------------===//
 // IREE AMDAIE LogicalObjectFifo Ops
 //===----------------------------------------------------------------------===//
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/test/roundtrip.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/test/roundtrip.mlir
index c542e2627..174dd890b 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/test/roundtrip.mlir
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/test/roundtrip.mlir
@@ -454,6 +454,15 @@ func.func @npu_write_bd() {
 
 // -----
 
+// CHECK-LABEL: func.func @npu_tct_sync
+// CHECK:       amdaie.npu.tct_sync {channel = 0 : ui32, col = 0 : ui32, col_num = 2 : ui32, direction = 1 : i32, row = 0 : ui32, row_num = 1 : ui32}
+func.func @npu_tct_sync() {
+  amdaie.npu.tct_sync {channel = 0 : ui32, col = 0 : ui32, col_num = 2 : ui32, direction = 1 : i32, row = 0 : ui32, row_num = 1 : ui32}
+  return
+}
+
+// -----
+
 // CHECK-LABEL: func.func @workgroup
 // CHECK: amdaie.workgroup
 // CHECK: amdaie.core
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEControlCodeLowering.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEControlCodeLowering.cpp
index cf40de2b0..0687429a6 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEControlCodeLowering.cpp
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEControlCodeLowering.cpp
@@ -236,6 +236,62 @@ struct HalfDmaCpyNdToNpuConverter final
   uint8_t minStrideBitWidth;
 };
 
+struct DmaWaitToTctSyncConverter final
+    : OpConversionPattern<AMDAIE::NpuDmaWaitOp> {
+  using OpConversionPattern::OpConversionPattern;
+
+  LogicalResult matchAndRewrite(
+      AMDAIE::NpuDmaWaitOp op, OpAdaptor adaptor,
+      ConversionPatternRewriter &rewriter) const override {
+    LLVM_DEBUG(llvm::dbgs() << "matchAndRewrite[AMDAIE::NpuDmaWaitOp]\n");
+    // Collect the `push_to_queue` ops that define the awaited async tokens.
+    SmallVector<AMDAIE::NpuPushToQueueOp> pushToQueueOps;
+    for (Value asyncToken : op.getAsyncTokens()) {
+      auto pushToQueueOp = dyn_cast_if_present<AMDAIE::NpuPushToQueueOp>(
+          asyncToken.getDefiningOp());
+      if (!pushToQueueOp) {
+        return op.emitOpError()
+               << "should operate on an `amdaie.push_to_queue` op async token";
+      }
+      pushToQueueOps.push_back(pushToQueueOp);
+    }
+    // Sort the `push_to_queue` ops by direction, channel, row, and column.
+    std::sort(pushToQueueOps.begin(), pushToQueueOps.end(),
+              [](AMDAIE::NpuPushToQueueOp a, AMDAIE::NpuPushToQueueOp b) {
+                return std::make_tuple(a.getDirection(), a.getChannel(),
+                                       a.getRow(), a.getCol()) <
+                       std::make_tuple(b.getDirection(), b.getChannel(),
+                                       b.getRow(), b.getCol());
+              });
+    // Batch ops with the same row, channel, and direction into a single TCT
+    // sync; the map value counts the consecutive columns covered by a batch.
+    llvm::MapVector<AMDAIE::NpuPushToQueueOp, uint32_t> columnBatches;
+    for (auto pushToQueueOp : pushToQueueOps) {
+      if (!columnBatches.empty()) {
+        auto &[lastPushOp, lastColNum] = columnBatches.back();
+        if (lastPushOp.getRow() == pushToQueueOp.getRow() &&
+            lastPushOp.getCol() + lastColNum == pushToQueueOp.getCol() &&
+            lastPushOp.getDirection() == pushToQueueOp.getDirection() &&
+            lastPushOp.getChannel() == pushToQueueOp.getChannel()) {
+          ++lastColNum;
+          continue;
+        }
+      }
+      columnBatches.insert({pushToQueueOp, 1});
+    }
+    // Emit one TCT sync op per batch; each batch spans exactly one row.
+    for (auto &[pushToQueueOp, colNum] : columnBatches) {
+      uint32_t rowNum = 1;
+      rewriter.create<AMDAIE::NpuTctSyncOp>(
+          op.getLoc(), pushToQueueOp.getCol(), pushToQueueOp.getRow(),
+          pushToQueueOp.getDirection(), pushToQueueOp.getChannel(), colNum,
+          rowNum);
+    }
+    rewriter.eraseOp(op);
+    return success();
+  }
+};
+
 namespace {
 class AMDAIEControlCodeLoweringPass
     : public impl::AMDAIEControlCodeLoweringBase<
@@ -260,17 +316,37 @@ void AMDAIEControlCodeLoweringPass::runOnOperation() {
            "ops.";
     return signalPassFailure();
   }
-  AMDAIE::AMDAIEDeviceModel deviceModel =
-      AMDAIE::getDeviceModel(maybeDevice.value());
 
-  RewritePatternSet patterns(context);
-  ConversionTarget conversionTarget(*context);
-  conversionTarget.addLegalDialect<AMDAIEDialect>();
-  conversionTarget.addIllegalOp<AMDAIE::NpuHalfDmaCpyNdOp>();
-  patterns.insert<HalfDmaCpyNdToNpuConverter>(context, deviceModel);
-  if (failed(applyPartialConversion(parentOp, conversionTarget,
-                                    std::move(patterns)))) {
-    return signalPassFailure();
+  // First conversion: HalfDmaCpyNdOp to WriteBdOp, AddressPatchOp and
+  // PushToQueueOp.
+  {
+    AMDAIE::AMDAIEDeviceModel deviceModel =
+        AMDAIE::getDeviceModel(maybeDevice.value());
+    RewritePatternSet patterns(context);
+    ConversionTarget conversionTarget(*context);
+    conversionTarget.addLegalDialect<AMDAIEDialect>();
+    conversionTarget.addIllegalOp<AMDAIE::NpuHalfDmaCpyNdOp>();
+    patterns.insert<HalfDmaCpyNdToNpuConverter>(context, deviceModel);
+
+    if (failed(applyPartialConversion(parentOp, conversionTarget,
+                                      std::move(patterns)))) {
+      return signalPassFailure();
+    }
+  }
+
+  // Second conversion: DmaWaitOp to TctSyncOp.
+  // The two conversions are separate to simplify the attribute handling, such
+  // as col, row, direction, channel, etc.
+  {
+    RewritePatternSet patterns(context);
+    ConversionTarget conversionTarget(*context);
+    conversionTarget.addLegalDialect<AMDAIEDialect>();
+    conversionTarget.addIllegalOp<AMDAIE::NpuDmaWaitOp>();
+    patterns.insert<DmaWaitToTctSyncConverter>(context);
+    if (failed(applyPartialConversion(parentOp, conversionTarget,
+                                      std::move(patterns)))) {
+      return signalPassFailure();
+    }
   }
 }
 
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEControlCodeToTransaction.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEControlCodeToTransaction.cpp
index 0c1cf7ef9..421900d6a 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEControlCodeToTransaction.cpp
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEControlCodeToTransaction.cpp
@@ -199,50 +199,11 @@ LogicalResult convertOp(AMDAIE::NpuAddressPatchOp op,
   return success();
 }
 
-LogicalResult convertOp(AMDAIE::NpuDmaWaitOp op, TransactionBuilder &builder) {
-  // Collect all half DMA ops from the async tokens.
-  SmallVector<AMDAIE::NpuPushToQueueOp> pushToQueueOps;
-  for (Value asyncToken : op.getAsyncTokens()) {
-    auto pushToQueueOp = dyn_cast_if_present<AMDAIE::NpuPushToQueueOp>(
-        asyncToken.getDefiningOp());
-    if (!pushToQueueOp) {
-      return op.emitOpError()
-             << "should operate on an `amdaie.push_to_queue` op async token";
-    }
-    pushToQueueOps.push_back(pushToQueueOp);
-  }
-  // Sort the half DMA ops by channel, direction, row, and column.
-  std::sort(pushToQueueOps.begin(), pushToQueueOps.end(),
-            [](AMDAIE::NpuPushToQueueOp a, AMDAIE::NpuPushToQueueOp b) {
-              return std::make_tuple(a.getChannel(), a.getDirection(),
-                                     a.getRow(), a.getCol()) <
-                     std::make_tuple(b.getChannel(), b.getDirection(),
-                                     b.getRow(), b.getCol());
-            });
-  // Batch DMA operations with the same row, channel, and direction into a
-  // single TCT sync operation, as long as they have consecutive columns.
-  llvm::MapVector<AMDAIE::NpuPushToQueueOp, uint32_t> columnBatches;
-  for (auto pushToQueueOp : pushToQueueOps) {
-    if (!columnBatches.empty()) {
-      auto &[lastPushOp, lastColNum] = columnBatches.back();
-      if (lastPushOp.getRow() == pushToQueueOp.getRow() &&
-          lastPushOp.getCol() + lastColNum == pushToQueueOp.getCol() &&
-          lastPushOp.getDirection() == pushToQueueOp.getDirection() &&
-          lastPushOp.getChannel() == pushToQueueOp.getChannel()) {
-        ++lastColNum;
-        continue;
-      }
-    }
-    columnBatches.insert({pushToQueueOp, 1});
-  }
-  // Convert to TCT sync ops.
-  for (auto &[pushToQueueOp, colNum] : columnBatches) {
-    if (failed(builder.appendTCTSync(
-            pushToQueueOp.getCol(), pushToQueueOp.getRow(),
-            static_cast<uint32_t>(pushToQueueOp.getDirection()), 1, colNum,
-            pushToQueueOp.getChannel()))) {
-      return failure();
-    }
+LogicalResult convertOp(AMDAIE::NpuTctSyncOp op, TransactionBuilder &builder) {
+  if (failed(builder.appendTCTSync(
+          op.getCol(), op.getRow(), static_cast<uint32_t>(op.getDirection()),
+          op.getRowNum(), op.getColNum(), op.getChannel()))) {
+    return failure();
   }
   return success();
 }
@@ -304,7 +265,7 @@ LogicalResult controlCodeToTransaction(IRRewriter &rewriter,
   WalkResult res = controlCodeOp->walk([&](Operation *op) {
     LogicalResult switchResult =
         TypeSwitch<Operation *, LogicalResult>(op)
-            .Case<AMDAIE::NpuAddressPatchOp, AMDAIE::NpuDmaWaitOp,
+            .Case<AMDAIE::NpuAddressPatchOp, AMDAIE::NpuTctSyncOp,
                   AMDAIE::NpuPushToQueueOp, AMDAIE::NpuWriteBdOp>(
                 [&](auto npuOp) {
                   if (failed(convertOp(npuOp, builder))) return failure();
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/controlcode_lowering.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/controlcode_lowering.mlir
index 26bad8b3b..74150676a 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/controlcode_lowering.mlir
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/controlcode_lowering.mlir
@@ -62,13 +62,13 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb}
 // CHECK: amdaie.npu.write_bd {bd_id = 0 : ui32, buffer_length = 2048 : ui32, buffer_offset = 0 : ui32, col = 0 : ui32, enable_packet = true, iteration_current = 0 : ui32, iteration_size = 0 : ui32, iteration_stride = 0 : ui32, lock_acq_enable = false, lock_acq_id = 0 : ui32, lock_acq_val = 0 : i32, lock_rel_id = 0 : ui32, lock_rel_val = 0 : i32, next_bd = 0 : ui32, out_of_order_id = 0 : ui32, packet_id = 0 : ui32, packet_type = 0 : ui32, paddings_after = array<i32>, paddings_before = array<i32>, row = 0 : ui32, sizes = array<i32: 0, 0, 2048>, strides = array<i32: 0, 0, 1>, use_next_bd = false, valid_bd = true}
 // CHECK: amdaie.npu.address_patch {arg_idx = 0 : ui32, bd_id = 0 : ui32, col = 0 : ui32, offset = 0 : ui32}
 // CHECK: %[[TOKEN_0:.+]] = amdaie.npu.push_to_queue async {bd_id = 0 : ui32, channel = 0 : ui32, col = 0 : ui32, direction = 1 : i32, repeat_count = 1 : ui32, row = 0 : ui32}
-// CHECK: amdaie.npu.dma_wait(%[[TOKEN_0]] : !amdaie.async_token)
+// CHECK: amdaie.npu.tct_sync {channel = 0 : ui32, col = 0 : ui32, col_num = 1 : ui32, direction = 1 : i32, row = 0 : ui32, row_num = 1 : ui32}
         %6 = amdaie.npu.half_dma_cpy_nd async %4(%5[0] [2048] [1] bd_id = %bd_id channel = %channel) : !amdaie.logicalobjectfifo<memref<2048xi32>>
         amdaie.npu.dma_wait(%6 : !amdaie.async_token)
 // CHECK: amdaie.npu.write_bd {bd_id = 0 : ui32, buffer_length = 1024 : ui32, buffer_offset = 0 : ui32, col = 0 : ui32, enable_packet = true, iteration_current = 0 : ui32, iteration_size = 0 : ui32, iteration_stride = 0 : ui32, lock_acq_enable = false, lock_acq_id = 0 : ui32, lock_acq_val = 0 : i32, lock_rel_id = 0 : ui32, lock_rel_val = 0 : i32, next_bd = 0 : ui32, out_of_order_id = 0 : ui32, packet_id = 0 : ui32, packet_type = 0 : ui32, paddings_after = array<i32>, paddings_before = array<i32>, row = 0 : ui32, sizes = array<i32: 4, 16, 16>, strides = array<i32: 64, 8, 1>, use_next_bd = false, valid_bd = true}
 // CHECK: amdaie.npu.address_patch {arg_idx = 0 : ui32, bd_id = 0 : ui32, col = 0 : ui32, offset = 0 : ui32}
 // CHECK: %[[TOKEN_1:.+]] = amdaie.npu.push_to_queue async {bd_id = 0 : ui32, channel = 0 : ui32, col = 0 : ui32, direction = 1 : i32, repeat_count = 2 : ui32, row = 0 : ui32}
-// CHECK: amdaie.npu.dma_wait(%[[TOKEN_1]] : !amdaie.async_token)
+// CHECK: amdaie.npu.tct_sync {channel = 0 : ui32, col = 0 : ui32, col_num = 1 : ui32, direction = 1 : i32, row = 0 : ui32, row_num = 1 : ui32}
         %7 = amdaie.npu.half_dma_cpy_nd async %4(%5[0, 0, 0, 0] [2, 4, 16, 16] [0, 64, 8, 1] bd_id = %bd_id channel = %channel) : !amdaie.logicalobjectfifo<memref<2048xi32>>
         amdaie.npu.dma_wait(%7 : !amdaie.async_token)
         amdaie.end
@@ -113,13 +113,13 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb}
 // CHECK: amdaie.npu.write_bd {bd_id = 0 : ui32, buffer_length = 1024 : ui32, buffer_offset = 0 : ui32, col = 0 : ui32, enable_packet = true, iteration_current = 0 : ui32, iteration_size = 0 : ui32, iteration_stride = 0 : ui32, lock_acq_enable = false, lock_acq_id = 0 : ui32, lock_acq_val = 0 : i32, lock_rel_id = 0 : ui32, lock_rel_val = 0 : i32, next_bd = 0 : ui32, out_of_order_id = 0 : ui32, packet_id = 0 : ui32, packet_type = 0 : ui32, paddings_after = array<i32>, paddings_before = array<i32>, row = 0 : ui32, sizes = array<i32: 0, 0, 1024>, strides = array<i32: 0, 0, 1>, use_next_bd = false, valid_bd = true}
 // CHECK: amdaie.npu.address_patch {arg_idx = 0 : ui32, bd_id = 0 : ui32, col = 0 : ui32, offset = 0 : ui32}
 // CHECK: %[[TOKEN_0:.+]] = amdaie.npu.push_to_queue async {bd_id = 0 : ui32, channel = 0 : ui32, col = 0 : ui32, direction = 1 : i32, repeat_count = 1 : ui32, row = 0 : ui32}
-// CHECK: amdaie.npu.dma_wait(%[[TOKEN_0]] : !amdaie.async_token)
+// CHECK: amdaie.npu.tct_sync {channel = 0 : ui32, col = 0 : ui32, col_num = 1 : ui32, direction = 1 : i32, row = 0 : ui32, row_num = 1 : ui32}
         %6 = amdaie.npu.half_dma_cpy_nd async %4(%5[0] [2048] [1] bd_id = %bd_id channel = %channel) : !amdaie.logicalobjectfifo<memref<2048xbf16>>
         amdaie.npu.dma_wait(%6 : !amdaie.async_token)
 // CHECK: amdaie.npu.write_bd {bd_id = 0 : ui32, buffer_length = 512 : ui32, buffer_offset = 0 : ui32, col = 0 : ui32, enable_packet = true, iteration_current = 0 : ui32, iteration_size = 0 : ui32, iteration_stride = 0 : ui32, lock_acq_enable = false, lock_acq_id = 0 : ui32, lock_acq_val = 0 : i32, lock_rel_id = 0 : ui32, lock_rel_val = 0 : i32, next_bd = 0 : ui32, out_of_order_id = 0 : ui32, packet_id = 0 : ui32, packet_type = 0 : ui32, paddings_after = array<i32>, paddings_before = array<i32>, row = 0 : ui32, sizes = array<i32: 4, 16, 8>, strides = array<i32: 32, 4, 1>, use_next_bd = false, valid_bd = true}
 // CHECK: amdaie.npu.address_patch {arg_idx = 0 : ui32, bd_id = 0 : ui32, col = 0 : ui32, offset = 64 : ui32}
 // CHECK: %[[TOKEN_1:.+]] = amdaie.npu.push_to_queue async {bd_id = 0 : ui32, channel = 0 : ui32, col = 0 : ui32, direction = 1 : i32, repeat_count = 2 : ui32, row = 0 : ui32}
-// CHECK: amdaie.npu.dma_wait(%[[TOKEN_1]] : !amdaie.async_token)
+// CHECK: amdaie.npu.tct_sync {channel = 0 : ui32, col = 0 : ui32, col_num = 1 : ui32, direction = 1 : i32, row = 0 : ui32, row_num = 1 : ui32}
         %7 = amdaie.npu.half_dma_cpy_nd async %4(%5[0, 0, 0, 32] [2, 4, 16, 16] [0, 64, 8, 1] bd_id = %bd_id channel = %channel) : !amdaie.logicalobjectfifo<memref<2048xbf16>>
         amdaie.npu.dma_wait(%7 : !amdaie.async_token)
         amdaie.end
@@ -169,7 +169,7 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb}
 // CHECK: amdaie.npu.write_bd {bd_id = 2 : ui32, buffer_length = 1024 : ui32, buffer_offset = 0 : ui32, col = 0 : ui32, enable_packet = false, iteration_current = 0 : ui32, iteration_size = 0 : ui32, iteration_stride = 0 : ui32, lock_acq_enable = false, lock_acq_id = 0 : ui32, lock_acq_val = 0 : i32, lock_rel_id = 0 : ui32, lock_rel_val = 0 : i32, next_bd = 0 : ui32, out_of_order_id = 0 : ui32, packet_id = 0 : ui32, packet_type = 0 : ui32, paddings_after = array<i32>, paddings_before = array<i32>, row = 0 : ui32, sizes = array<i32: 4, 16, 16>, strides = array<i32: 64, 8, 1>, use_next_bd = false, valid_bd = true}
 // CHECK: amdaie.npu.address_patch {arg_idx = 0 : ui32, bd_id = 2 : ui32, col = 0 : ui32, offset = 0 : ui32}
 // CHECK: %[[TOKEN_0:.+]] = amdaie.npu.push_to_queue async {bd_id = 0 : ui32, channel = 0 : ui32, col = 0 : ui32, direction = 1 : i32, repeat_count = 2 : ui32, row = 0 : ui32}
-// CHECK: amdaie.npu.dma_wait(%[[TOKEN_0]] : !amdaie.async_token)
+// CHECK: amdaie.npu.tct_sync {channel = 0 : ui32, col = 0 : ui32, col_num = 1 : ui32, direction = 1 : i32, row = 0 : ui32, row_num = 1 : ui32}
         %6 = amdaie.npu.half_dma_cpy_nd async %4(%5[0, 0, 0, 0] [2, 4, 16, 16] [0, 64, 8, 1] bd_id = %bd_id_2 channel = %channel start_bd = %bd_id) : !amdaie.logicalobjectfifo<memref<2048xi32>>
         amdaie.npu.dma_wait(%6 : !amdaie.async_token)
         amdaie.end
@@ -178,3 +178,203 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb}
     return
   }
 }
+
+// -----
+
+// Expect four `push_to_queue` operations on the same `row`, `direction`, and `channel` 
+// but with different `col` values. The order of the `col` values is 0, 3, 2, 1.
+// After sorting the `col` values, the batched `dma_wait` operation will be converted to
+// a single `tct_sync` operation, with the `col` set to 0 and `col_num` set to 4.
+// CHECK-LABEL: @batched_dma_wait_with_same_row_channel_direction
+// CHECK:       amdaie.controlcode
+#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}>
+#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
+module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} {
+  func.func @batched_dma_wait_with_same_row_channel_direction() {
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %c2 = arith.constant 2 : index
+    %c3 = arith.constant 3 : index
+    amdaie.workgroup {
+      %tile_0_1 = amdaie.tile(%c0, %c1)
+      %tile_0_0 = amdaie.tile(%c0, %c0)
+      %tile_1_1 = amdaie.tile(%c1, %c1)
+      %tile_1_0 = amdaie.tile(%c1, %c0)
+      %tile_2_1 = amdaie.tile(%c2, %c1)
+      %tile_2_0 = amdaie.tile(%c2, %c0)
+      %tile_3_1 = amdaie.tile(%c3, %c1)
+      %tile_3_0 = amdaie.tile(%c3, %c0)
+      %buffer = amdaie.buffer(%tile_0_1) : memref<2048xi32, 1 : i32>
+      %buffer_0 = amdaie.buffer(%tile_0_1) : memref<2048xi32, 1 : i32>
+      %buffer_1 = amdaie.buffer(%tile_1_1) : memref<2048xi32, 1 : i32>
+      %buffer_2 = amdaie.buffer(%tile_1_1) : memref<2048xi32, 1 : i32>
+      %buffer_3 = amdaie.buffer(%tile_2_1) : memref<2048xi32, 1 : i32>
+      %buffer_4 = amdaie.buffer(%tile_2_1) : memref<2048xi32, 1 : i32>
+      %buffer_5 = amdaie.buffer(%tile_3_1) : memref<2048xi32, 1 : i32>
+      %buffer_6 = amdaie.buffer(%tile_3_1) : memref<2048xi32, 1 : i32>
+      %lock = amdaie.lock(%tile_0_1(4), 4)
+      %lock_7 = amdaie.lock(%tile_0_1(5), 0)
+      %lock_8 = amdaie.lock(%tile_1_1(4), 4)
+      %lock_9 = amdaie.lock(%tile_1_1(5), 0)
+      %lock_10 = amdaie.lock(%tile_2_1(4), 4)
+      %lock_11 = amdaie.lock(%tile_2_1(5), 0)
+      %lock_12 = amdaie.lock(%tile_3_1(4), 4)
+      %lock_13 = amdaie.lock(%tile_3_1(5), 0)
+      %0 = amdaie.logicalobjectfifo.from_buffers({%buffer, %buffer_0}, {%lock}, {%lock_7}) : memref<2048xi32, 1 : i32>, memref<2048xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2048xi32, 1 : i32>, 2>
+      %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<64x32xi32>
+      %2 = amdaie.logicalobjectfifo.placeholder{%tile_0_0} : !amdaie.logicalobjectfifo<memref<64x32xi32>>
+      %3 = amdaie.logicalobjectfifo.from_buffers({%buffer_1, %buffer_2}, {%lock_8}, {%lock_9}) : memref<2048xi32, 1 : i32>, memref<2048xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2048xi32, 1 : i32>, 2>
+      %4 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<64x32xi32>
+      %5 = amdaie.logicalobjectfifo.placeholder{%tile_1_0} : !amdaie.logicalobjectfifo<memref<64x32xi32>>
+      %6 = amdaie.logicalobjectfifo.from_buffers({%buffer_3, %buffer_4}, {%lock_10}, {%lock_11}) : memref<2048xi32, 1 : i32>, memref<2048xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2048xi32, 1 : i32>, 2>
+      %7 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<64x32xi32>
+      %8 = amdaie.logicalobjectfifo.placeholder{%tile_2_0} : !amdaie.logicalobjectfifo<memref<64x32xi32>>
+      %9 = amdaie.logicalobjectfifo.from_buffers({%buffer_5, %buffer_6}, {%lock_12}, {%lock_13}) : memref<2048xi32, 1 : i32>, memref<2048xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2048xi32, 1 : i32>, 2>
+      %10 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<64x32xi32>
+      %11 = amdaie.logicalobjectfifo.placeholder{%tile_3_0} : !amdaie.logicalobjectfifo<memref<64x32xi32>>
+      %channel = amdaie.channel(%tile_0_0, 0, port_type = DMA, direction = MM2S)
+      %channel_14 = amdaie.channel(%tile_0_1, 0, port_type = DMA, direction = S2MM)
+      %channel_15 = amdaie.channel(%tile_1_0, 0, port_type = DMA, direction = MM2S)
+      %channel_16 = amdaie.channel(%tile_1_1, 0, port_type = DMA, direction = S2MM)
+      %channel_17 = amdaie.channel(%tile_2_0, 0, port_type = DMA, direction = MM2S)
+      %channel_18 = amdaie.channel(%tile_2_1, 0, port_type = DMA, direction = S2MM)
+      %channel_19 = amdaie.channel(%tile_3_0, 0, port_type = DMA, direction = MM2S)
+      %channel_20 = amdaie.channel(%tile_3_1, 0, port_type = DMA, direction = S2MM)
+      %12 = amdaie.flow({%channel} -> {%channel_14}) {is_packet_flow = false}
+      %13 = amdaie.connection(%0 {%channel_14}, %2 {%channel}, flow = %12) {connection_type = #amdaie<connection_type Packet>} : (!amdaie.logicalobjectfifo<memref<2048xi32, 1 : i32>, 2>, !amdaie.logicalobjectfifo<memref<64x32xi32>>)
+      %14 = amdaie.flow({%channel_15} -> {%channel_16}) {is_packet_flow = false}
+      %15 = amdaie.connection(%3 {%channel_16}, %5 {%channel_15}, flow = %14) {connection_type = #amdaie<connection_type Packet>} : (!amdaie.logicalobjectfifo<memref<2048xi32, 1 : i32>, 2>, !amdaie.logicalobjectfifo<memref<64x32xi32>>)
+      %16 = amdaie.flow({%channel_17} -> {%channel_18}) {is_packet_flow = false}
+      %17 = amdaie.connection(%6 {%channel_18}, %8 {%channel_17}, flow = %16) {connection_type = #amdaie<connection_type Packet>} : (!amdaie.logicalobjectfifo<memref<2048xi32, 1 : i32>, 2>, !amdaie.logicalobjectfifo<memref<64x32xi32>>)
+      %18 = amdaie.flow({%channel_19} -> {%channel_20}) {is_packet_flow = false}
+      %19 = amdaie.connection(%9 {%channel_20}, %11 {%channel_19}, flow = %18) {connection_type = #amdaie<connection_type Packet>} : (!amdaie.logicalobjectfifo<memref<2048xi32, 1 : i32>, 2>, !amdaie.logicalobjectfifo<memref<64x32xi32>>)
+      amdaie.controlcode {
+        %20 = amdaie.logicalobjectfifo.from_memref %1, {%tile_0_0} : memref<64x32xi32> -> !amdaie.logicalobjectfifo<memref<2048xi32>>
+        memref.assume_alignment %1, 64 : memref<64x32xi32>
+        %21 = amdaie.logicalobjectfifo.from_memref %4, {%tile_1_0} : memref<64x32xi32> -> !amdaie.logicalobjectfifo<memref<2048xi32>>
+        memref.assume_alignment %4, 64 : memref<64x32xi32>
+        %22 = amdaie.logicalobjectfifo.from_memref %7, {%tile_2_0} : memref<64x32xi32> -> !amdaie.logicalobjectfifo<memref<2048xi32>>
+        memref.assume_alignment %7, 64 : memref<64x32xi32>
+        %23 = amdaie.logicalobjectfifo.from_memref %10, {%tile_3_0} : memref<64x32xi32> -> !amdaie.logicalobjectfifo<memref<2048xi32>>
+        memref.assume_alignment %10, 64 : memref<64x32xi32>
+        %bd_id = amdaie.bd_id(%tile_0_0, %c0)
+// CHECK: %[[TOKEN_0:.+]] = amdaie.npu.push_to_queue async {bd_id = 0 : ui32, channel = 0 : ui32, col = 0 : ui32, direction = 1 : i32, repeat_count = 1 : ui32, row = 0 : ui32}
+        %24 = amdaie.npu.half_dma_cpy_nd async %13(%20 [] [] [] bd_id = %bd_id channel = %channel) : !amdaie.logicalobjectfifo<memref<2048xi32>>
+        %bd_id_21 = amdaie.bd_id(%tile_3_0, %c0)
+// CHECK: %[[TOKEN_1:.+]] = amdaie.npu.push_to_queue async {bd_id = 0 : ui32, channel = 0 : ui32, col = 3 : ui32, direction = 1 : i32, repeat_count = 1 : ui32, row = 0 : ui32}
+        %25 = amdaie.npu.half_dma_cpy_nd async %19(%23 [] [] [] bd_id = %bd_id_21 channel = %channel_19) : !amdaie.logicalobjectfifo<memref<2048xi32>>
+        %bd_id_22 = amdaie.bd_id(%tile_2_0, %c0)
+// CHECK: %[[TOKEN_2:.+]] = amdaie.npu.push_to_queue async {bd_id = 0 : ui32, channel = 0 : ui32, col = 2 : ui32, direction = 1 : i32, repeat_count = 1 : ui32, row = 0 : ui32}
+        %26 = amdaie.npu.half_dma_cpy_nd async %17(%22 [] [] [] bd_id = %bd_id_22 channel = %channel_17) : !amdaie.logicalobjectfifo<memref<2048xi32>>
+        %bd_id_23 = amdaie.bd_id(%tile_1_0, %c0)
+// CHECK: %[[TOKEN_3:.+]] = amdaie.npu.push_to_queue async {bd_id = 0 : ui32, channel = 0 : ui32, col = 1 : ui32, direction = 1 : i32, repeat_count = 1 : ui32, row = 0 : ui32}
+        %27 = amdaie.npu.half_dma_cpy_nd async %15(%21 [] [] [] bd_id = %bd_id_23 channel = %channel_15) : !amdaie.logicalobjectfifo<memref<2048xi32>>
+// CHECK: amdaie.npu.tct_sync {channel = 0 : ui32, col = 0 : ui32, col_num = 4 : ui32, direction = 1 : i32, row = 0 : ui32, row_num = 1 : ui32}
+        amdaie.npu.dma_wait(%24, %25, %26, %27 : !amdaie.async_token, !amdaie.async_token, !amdaie.async_token, !amdaie.async_token)
+        amdaie.end
+      }
+    }
+    return
+  }
+}
+
+// -----
+
+
+// The batched `dma_wait` operation will be converted to four `tct_sync` operations,
+// which operate on different `direction` and `channel` values.
+// CHECK-LABEL: @batched_dma_wait_with_diff_row_channel_direction
+// CHECK:       amdaie.controlcode
+#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}>
+#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
+module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} {
+  func.func @batched_dma_wait_with_diff_row_channel_direction() {
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %c2 = arith.constant 2 : index
+    %c3 = arith.constant 3 : index
+    amdaie.workgroup {
+      %tile_0_1 = amdaie.tile(%c0, %c1)
+      %tile_0_0 = amdaie.tile(%c0, %c0)
+      %tile_1_1 = amdaie.tile(%c1, %c1)
+      %tile_1_0 = amdaie.tile(%c1, %c0)
+      %tile_2_1 = amdaie.tile(%c2, %c1)
+      %tile_2_0 = amdaie.tile(%c2, %c0)
+      %tile_3_1 = amdaie.tile(%c3, %c1)
+      %tile_3_0 = amdaie.tile(%c3, %c0)
+      %buffer = amdaie.buffer(%tile_0_1) : memref<2048xi32, 1 : i32>
+      %buffer_0 = amdaie.buffer(%tile_0_1) : memref<2048xi32, 1 : i32>
+      %buffer_1 = amdaie.buffer(%tile_1_1) : memref<2048xi32, 1 : i32>
+      %buffer_2 = amdaie.buffer(%tile_1_1) : memref<2048xi32, 1 : i32>
+      %buffer_3 = amdaie.buffer(%tile_2_1) : memref<2048xi32, 1 : i32>
+      %buffer_4 = amdaie.buffer(%tile_2_1) : memref<2048xi32, 1 : i32>
+      %buffer_5 = amdaie.buffer(%tile_3_1) : memref<2048xi32, 1 : i32>
+      %buffer_6 = amdaie.buffer(%tile_3_1) : memref<2048xi32, 1 : i32>
+      %lock = amdaie.lock(%tile_0_1(4), 4)
+      %lock_7 = amdaie.lock(%tile_0_1(5), 0)
+      %lock_8 = amdaie.lock(%tile_1_1(4), 4)
+      %lock_9 = amdaie.lock(%tile_1_1(5), 0)
+      %lock_10 = amdaie.lock(%tile_2_1(4), 4)
+      %lock_11 = amdaie.lock(%tile_2_1(5), 0)
+      %lock_12 = amdaie.lock(%tile_3_1(4), 4)
+      %lock_13 = amdaie.lock(%tile_3_1(5), 0)
+      %0 = amdaie.logicalobjectfifo.from_buffers({%buffer, %buffer_0}, {%lock}, {%lock_7}) : memref<2048xi32, 1 : i32>, memref<2048xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2048xi32, 1 : i32>, 2>
+      %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags(Indirect) : memref<64x32xi32>
+      %2 = amdaie.logicalobjectfifo.placeholder{%tile_0_0} : !amdaie.logicalobjectfifo<memref<64x32xi32>>
+      %3 = amdaie.logicalobjectfifo.from_buffers({%buffer_1, %buffer_2}, {%lock_8}, {%lock_9}) : memref<2048xi32, 1 : i32>, memref<2048xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2048xi32, 1 : i32>, 2>
+      %4 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags(Indirect) : memref<64x32xi32>
+      %5 = amdaie.logicalobjectfifo.placeholder{%tile_1_0} : !amdaie.logicalobjectfifo<memref<64x32xi32>>
+      %6 = amdaie.logicalobjectfifo.from_buffers({%buffer_3, %buffer_4}, {%lock_10}, {%lock_11}) : memref<2048xi32, 1 : i32>, memref<2048xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2048xi32, 1 : i32>, 2>
+      %7 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<64x32xi32>
+      %8 = amdaie.logicalobjectfifo.placeholder{%tile_2_0} : !amdaie.logicalobjectfifo<memref<64x32xi32>>
+      %9 = amdaie.logicalobjectfifo.from_buffers({%buffer_5, %buffer_6}, {%lock_12}, {%lock_13}) : memref<2048xi32, 1 : i32>, memref<2048xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2048xi32, 1 : i32>, 2>
+      %10 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<64x32xi32>
+      %11 = amdaie.logicalobjectfifo.placeholder{%tile_3_0} : !amdaie.logicalobjectfifo<memref<64x32xi32>>
+      %channel = amdaie.channel(%tile_0_0, 0, port_type = DMA, direction = S2MM)
+      %channel_14 = amdaie.channel(%tile_0_1, 0, port_type = DMA, direction = MM2S)
+      %channel_15 = amdaie.channel(%tile_1_0, 1, port_type = DMA, direction = S2MM)
+      %channel_16 = amdaie.channel(%tile_1_1, 1, port_type = DMA, direction = MM2S)
+      %channel_17 = amdaie.channel(%tile_2_0, 0, port_type = DMA, direction = MM2S)
+      %channel_18 = amdaie.channel(%tile_2_1, 0, port_type = DMA, direction = S2MM)
+      %channel_19 = amdaie.channel(%tile_3_0, 1, port_type = DMA, direction = MM2S)
+      %channel_20 = amdaie.channel(%tile_3_1, 1, port_type = DMA, direction = S2MM)
+      %12 = amdaie.flow({%channel_14} -> {%channel}) {is_packet_flow = false}
+      %13 = amdaie.connection(%2 {%channel}, %0 {%channel_14}, flow = %12) {connection_type = #amdaie<connection_type Packet>} : (!amdaie.logicalobjectfifo<memref<64x32xi32>>, !amdaie.logicalobjectfifo<memref<2048xi32, 1 : i32>, 2>)
+      %14 = amdaie.flow({%channel_16} -> {%channel_15}) {is_packet_flow = false}
+      %15 = amdaie.connection(%5 {%channel_15}, %3 {%channel_16}, flow = %14) {connection_type = #amdaie<connection_type Packet>} : (!amdaie.logicalobjectfifo<memref<64x32xi32>>, !amdaie.logicalobjectfifo<memref<2048xi32, 1 : i32>, 2>)
+      %16 = amdaie.flow({%channel_17} -> {%channel_18}) {is_packet_flow = false}
+      %17 = amdaie.connection(%6 {%channel_18}, %8 {%channel_17}, flow = %16) {connection_type = #amdaie<connection_type Packet>} : (!amdaie.logicalobjectfifo<memref<2048xi32, 1 : i32>, 2>, !amdaie.logicalobjectfifo<memref<64x32xi32>>)
+      %18 = amdaie.flow({%channel_19} -> {%channel_20}) {is_packet_flow = false}
+      %19 = amdaie.connection(%9 {%channel_20}, %11 {%channel_19}, flow = %18) {connection_type = #amdaie<connection_type Packet>} : (!amdaie.logicalobjectfifo<memref<2048xi32, 1 : i32>, 2>, !amdaie.logicalobjectfifo<memref<64x32xi32>>)
+      amdaie.controlcode {
+        %20 = amdaie.logicalobjectfifo.from_memref %1, {%tile_0_0} : memref<64x32xi32> -> !amdaie.logicalobjectfifo<memref<2048xi32>>
+        memref.assume_alignment %1, 64 : memref<64x32xi32>
+        %21 = amdaie.logicalobjectfifo.from_memref %4, {%tile_1_0} : memref<64x32xi32> -> !amdaie.logicalobjectfifo<memref<2048xi32>>
+        memref.assume_alignment %4, 64 : memref<64x32xi32>
+        %22 = amdaie.logicalobjectfifo.from_memref %7, {%tile_2_0} : memref<64x32xi32> -> !amdaie.logicalobjectfifo<memref<2048xi32>>
+        memref.assume_alignment %7, 64 : memref<64x32xi32>
+        %23 = amdaie.logicalobjectfifo.from_memref %10, {%tile_3_0} : memref<64x32xi32> -> !amdaie.logicalobjectfifo<memref<2048xi32>>
+        memref.assume_alignment %10, 64 : memref<64x32xi32>
+        %bd_id = amdaie.bd_id(%tile_0_0, %c0)
+// CHECK: %[[TOKEN_0:.+]] = amdaie.npu.push_to_queue async {bd_id = 0 : ui32, channel = 0 : ui32, col = 0 : ui32, direction = 0 : i32, repeat_count = 1 : ui32, row = 0 : ui32}
+        %24 = amdaie.npu.half_dma_cpy_nd async %13(%20 [] [] [] bd_id = %bd_id channel = %channel) : !amdaie.logicalobjectfifo<memref<2048xi32>>
+        %bd_id_21 = amdaie.bd_id(%tile_1_0, %c0)
+// CHECK: %[[TOKEN_1:.+]] = amdaie.npu.push_to_queue async {bd_id = 0 : ui32, channel = 1 : ui32, col = 1 : ui32, direction = 0 : i32, repeat_count = 1 : ui32, row = 0 : ui32}
+        %25 = amdaie.npu.half_dma_cpy_nd async %15(%21 [] [] [] bd_id = %bd_id_21 channel = %channel_15) : !amdaie.logicalobjectfifo<memref<2048xi32>>
+        %bd_id_22 = amdaie.bd_id(%tile_2_0, %c0)
+// CHECK: %[[TOKEN_2:.+]] = amdaie.npu.push_to_queue async {bd_id = 0 : ui32, channel = 0 : ui32, col = 2 : ui32, direction = 1 : i32, repeat_count = 1 : ui32, row = 0 : ui32}
+        %26 = amdaie.npu.half_dma_cpy_nd async %17(%22 [] [] [] bd_id = %bd_id_22 channel = %channel_17) : !amdaie.logicalobjectfifo<memref<2048xi32>>
+        %bd_id_23 = amdaie.bd_id(%tile_3_0, %c0)
+// CHECK: %[[TOKEN_3:.+]] = amdaie.npu.push_to_queue async {bd_id = 0 : ui32, channel = 1 : ui32, col = 3 : ui32, direction = 1 : i32, repeat_count = 1 : ui32, row = 0 : ui32}
+        %27 = amdaie.npu.half_dma_cpy_nd async %19(%23 [] [] [] bd_id = %bd_id_23 channel = %channel_19) : !amdaie.logicalobjectfifo<memref<2048xi32>>
+// CHECK: amdaie.npu.tct_sync {channel = 0 : ui32, col = 0 : ui32, col_num = 1 : ui32, direction = 0 : i32, row = 0 : ui32, row_num = 1 : ui32}
+// CHECK: amdaie.npu.tct_sync {channel = 1 : ui32, col = 1 : ui32, col_num = 1 : ui32, direction = 0 : i32, row = 0 : ui32, row_num = 1 : ui32}
+// CHECK: amdaie.npu.tct_sync {channel = 0 : ui32, col = 2 : ui32, col_num = 1 : ui32, direction = 1 : i32, row = 0 : ui32, row_num = 1 : ui32}
+// CHECK: amdaie.npu.tct_sync {channel = 1 : ui32, col = 3 : ui32, col_num = 1 : ui32, direction = 1 : i32, row = 0 : ui32, row_num = 1 : ui32}
+        amdaie.npu.dma_wait(%24, %25, %26, %27 : !amdaie.async_token, !amdaie.async_token, !amdaie.async_token, !amdaie.async_token)
+        amdaie.end
+      }
+    }
+    return
+  }
+}
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/controlcode_to_transaction.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/controlcode_to_transaction.mlir
index f36ad7fa2..057ffebd5 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/controlcode_to_transaction.mlir
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/controlcode_to_transaction.mlir
@@ -121,6 +121,7 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb}
 
 // -----
 
+// `tct_sync` on a single column.
 // CHECK:       0x06030100
 // CHECK:       0x00000105
 // CHECK:       0x00000002
@@ -135,15 +136,15 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb}
 // CHECK:       0x00000010
 // CHECK:       0x00020001
 // CHECK:       0x00010100
-// CHECK-LABEL: @async_push_to_queue_and_wait
+// CHECK-LABEL: @tct_sync_single_column
 // CHECK:       npu_instructions = dense_resource<npu_instructions> : tensor<14xui32>
 #executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}>
 module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} {
-  func.func @async_push_to_queue_and_wait() {
+  func.func @tct_sync_single_column() {
     amdaie.workgroup {
       amdaie.controlcode {
         %0 = amdaie.npu.push_to_queue async {bd_id = 15 : ui32, channel = 0 : ui32, col = 2 : ui32, direction = 1 : i32, repeat_count = 256 : ui32, row = 0 : ui32}
-        amdaie.npu.dma_wait(%0 : !amdaie.async_token)
+        amdaie.npu.tct_sync {channel = 0 : ui32, col = 2 : ui32, col_num = 1 : ui32, direction = 1 : i32, row = 0 : ui32, row_num = 1 : ui32}
         amdaie.end
       }
     }
@@ -153,8 +154,7 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb}
 
 // -----
 
-// Same channel, direction, and row, but different col.
-// Expect one TCT sync operation (0x00000080), with col_num = 4.
+// Expect one `tct_sync` to cover four columns that share the same channel, direction, and row.
 // CHECK:       0x06030100
 // CHECK:       0x00000105
 // CHECK:       0x00000005
@@ -187,85 +187,18 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb}
 // CHECK:       0x00000010
 // CHECK:       0x00000001
 // CHECK:       0x00040100
-// CHECK-LABEL: @async_push_to_queue_and_wait_col_num
+// CHECK-LABEL: @tct_sync_multiple_columns
 // CHECK:       npu_instructions = dense_resource<npu_instructions> : tensor<32xui32>
 #executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}>
 module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} {
-  func.func @async_push_to_queue_and_wait_col_num() {
+  func.func @tct_sync_multiple_columns() {
     amdaie.workgroup {
       amdaie.controlcode {
         %0 = amdaie.npu.push_to_queue async {bd_id = 0 : ui32, channel = 0 : ui32, col = 0 : ui32, direction = 1 : i32, repeat_count = 1 : ui32, row = 0 : ui32}
         %1 = amdaie.npu.push_to_queue async {bd_id = 0 : ui32, channel = 0 : ui32, col = 3 : ui32, direction = 1 : i32, repeat_count = 1 : ui32, row = 0 : ui32}
         %2 = amdaie.npu.push_to_queue async {bd_id = 0 : ui32, channel = 0 : ui32, col = 2 : ui32, direction = 1 : i32, repeat_count = 1 : ui32, row = 0 : ui32}
         %3 = amdaie.npu.push_to_queue async {bd_id = 0 : ui32, channel = 0 : ui32, col = 1 : ui32, direction = 1 : i32, repeat_count = 1 : ui32, row = 0 : ui32}
-        amdaie.npu.dma_wait(%0, %1, %2, %3 : !amdaie.async_token, !amdaie.async_token, !amdaie.async_token, !amdaie.async_token)
-        amdaie.end
-      }
-    }
-    return
-  }
-}
-
-// -----
-
-// Completely different channels, directions, rows, and cols.
-// Expect four TCT sync operations (0x00000080).
-// CHECK:       0x06030100
-// CHECK:       0x00000105
-// CHECK:       0x00000008
-// CHECK:       0x000000B0
-// CHECK:       0x00000000
-// CHECK:       0x00000000
-// CHECK:       0x0001D214
-// CHECK:       0x00000000
-// CHECK:       0x80000000
-// CHECK:       0x00000018
-// CHECK:       0x00000000
-// CHECK:       0x00000000
-// CHECK:       0x0201D21C
-// CHECK:       0x00000000
-// CHECK:       0x80000000
-// CHECK:       0x00000018
-// CHECK:       0x00000000
-// CHECK:       0x00000000
-// CHECK:       0x0401D204
-// CHECK:       0x00000000
-// CHECK:       0x80000000
-// CHECK:       0x00000018
-// CHECK:       0x00000000
-// CHECK:       0x00000000
-// CHECK:       0x0601D20C
-// CHECK:       0x00000000
-// CHECK:       0x80000000
-// CHECK:       0x00000018
-// CHECK:       0x00000080
-// CHECK:       0x00000010
-// CHECK:       0x00020000
-// CHECK:       0x00010100
-// CHECK:       0x00000080
-// CHECK:       0x00000010
-// CHECK:       0x00000001
-// CHECK:       0x00010100
-// CHECK:       0x00000080
-// CHECK:       0x00000010
-// CHECK:       0x00030000
-// CHECK:       0x01010100
-// CHECK:       0x00000080
-// CHECK:       0x00000010
-// CHECK:       0x00010001
-// CHECK:       0x01010100
-// CHECK-LABEL: @wait_different_row_col_channel_direction
-// CHECK:       npu_instructions = dense_resource<npu_instructions> : tensor<44xui32>
-#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}>
-module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} {
-  func.func @wait_different_row_col_channel_direction() {
-    amdaie.workgroup {
-      amdaie.controlcode {
-        %0 = amdaie.npu.push_to_queue async {bd_id = 0 : ui32, channel = 0 : ui32, col = 0 : ui32, direction = 1 : i32, repeat_count = 1 : ui32, row = 0 : ui32}
-        %1 = amdaie.npu.push_to_queue async {bd_id = 0 : ui32, channel = 1 : ui32, col = 1 : ui32, direction = 1 : i32, repeat_count = 1 : ui32, row = 0 : ui32}
-        %2 = amdaie.npu.push_to_queue async {bd_id = 0 : ui32, channel = 0 : ui32, col = 2 : ui32, direction = 0 : i32, repeat_count = 1 : ui32, row = 0 : ui32}
-        %3 = amdaie.npu.push_to_queue async {bd_id = 0 : ui32, channel = 1 : ui32, col = 3 : ui32, direction = 0 : i32, repeat_count = 1 : ui32, row = 0 : ui32}
-        amdaie.npu.dma_wait(%0, %1, %2, %3 : !amdaie.async_token, !amdaie.async_token, !amdaie.async_token, !amdaie.async_token)
+        amdaie.npu.tct_sync {channel = 0 : ui32, col = 0 : ui32, col_num = 4 : ui32, direction = 1 : i32, row = 0 : ui32, row_num = 1 : ui32}
         amdaie.end
       }
     }