diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.td b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.td index 371945da7..17d2d604d 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.td +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.td @@ -1022,6 +1022,31 @@ def AMDAIE_NpuWriteBdOp: AMDAIE_Op<"npu.write_bd"> { let assemblyFormat = [{ attr-dict }]; } +def AMDAIE_NpuTctSyncOp: AMDAIE_Op<"npu.tct_sync"> { + let summary = "Wait for the TCTs to be emitted."; + let description = [{ + This NPU controller operation synchronizes the Task Completion Tokens (TCTs) + on the specified `channel` and `direction`. The ranges of tiles to synchronize + are defined by [col, col+col_num) and [row, row+row_num). + + Example: + + ```mlir + amdaie.npu.tct_sync {col = 0 : ui32, row = 0 : ui32, channel = 0 : ui32, + direction = 1 : i32, col_num = 1 : ui32, row_num = 1 : ui32} + ``` + }]; + let arguments = ( + ins UI32Attr:$col, + UI32Attr:$row, + DMAChannelDir:$direction, + UI32Attr:$channel, + UI32Attr:$col_num, + UI32Attr:$row_num + ); + let assemblyFormat = [{ attr-dict }]; +} + //===----------------------------------------------------------------------===// // IREE AMDAIE LogicalObjectFifo Ops //===----------------------------------------------------------------------===// diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/test/roundtrip.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/test/roundtrip.mlir index c542e2627..174dd890b 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/test/roundtrip.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/test/roundtrip.mlir @@ -454,6 +454,15 @@ func.func @npu_write_bd() { // ----- +// CHECK-LABEL: func.func @npu_tct_sync +// CHECK: amdaie.npu.tct_sync {channel = 0 : ui32, col = 0 : ui32, col_num = 2 : ui32, direction = 1 : i32, row = 0 : ui32, row_num = 1 : ui32} +func.func @npu_tct_sync() { + amdaie.npu.tct_sync {channel = 0 : ui32, col = 
0 : ui32, col_num = 2 : ui32, direction = 1 : i32, row = 0 : ui32, row_num = 1 : ui32} + return +} + +// ----- + // CHECK-LABEL: func.func @workgroup // CHECK: amdaie.workgroup // CHECK: amdaie.core diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEControlCodeLowering.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEControlCodeLowering.cpp index cf40de2b0..0687429a6 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEControlCodeLowering.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEControlCodeLowering.cpp @@ -236,6 +236,62 @@ struct HalfDmaCpyNdToNpuConverter final uint8_t minStrideBitWidth; }; +struct DmaWaitToTctSyncConverter final + : OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + + LogicalResult matchAndRewrite( + AMDAIE::NpuDmaWaitOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + LLVM_DEBUG(llvm::dbgs() << "matchAndRewrite[AMDAIE::NpuDmaWaitOp]\n"); + // Collect all half DMA ops from the async tokens. + SmallVector pushToQueueOps; + for (Value asyncToken : op.getAsyncTokens()) { + auto pushToQueueOp = dyn_cast_if_present( + asyncToken.getDefiningOp()); + if (!pushToQueueOp) { + return op.emitOpError() + << "should operate on an `amdaie.push_to_queue` op async token"; + } + pushToQueueOps.push_back(pushToQueueOp); + } + // Sort the half DMA ops by direction, channel, row, and column. + std::sort(pushToQueueOps.begin(), pushToQueueOps.end(), + [](AMDAIE::NpuPushToQueueOp a, AMDAIE::NpuPushToQueueOp b) { + return std::make_tuple(a.getDirection(), a.getChannel(), + a.getRow(), a.getCol()) < + std::make_tuple(b.getDirection(), b.getChannel(), + b.getRow(), b.getCol()); + }); + // Batch DMA operations with the same row, channel, and direction into a + // single TCT sync operation, as long as they have consecutive columns. 
+ llvm::MapVector columnBatches; + for (auto pushToQueueOp : pushToQueueOps) { + if (!columnBatches.empty()) { + auto &[lastPushOp, lastColNum] = columnBatches.back(); + if (lastPushOp.getRow() == pushToQueueOp.getRow() && + lastPushOp.getCol() + lastColNum == pushToQueueOp.getCol() && + lastPushOp.getDirection() == pushToQueueOp.getDirection() && + lastPushOp.getChannel() == pushToQueueOp.getChannel()) { + ++lastColNum; + continue; + } + } + columnBatches.insert({pushToQueueOp, 1}); + } + // Convert to TCT sync ops. + for (auto &[pushToQueueOp, colNum] : columnBatches) { + uint32_t rowNum = 1; + rewriter.create( + op.getLoc(), pushToQueueOp.getCol(), pushToQueueOp.getRow(), + pushToQueueOp.getDirection(), pushToQueueOp.getChannel(), colNum, + rowNum); + } + rewriter.eraseOp(op); + return success(); + } +}; + namespace { class AMDAIEControlCodeLoweringPass : public impl::AMDAIEControlCodeLoweringBase< @@ -260,17 +316,37 @@ void AMDAIEControlCodeLoweringPass::runOnOperation() { "ops."; return signalPassFailure(); } - AMDAIE::AMDAIEDeviceModel deviceModel = - AMDAIE::getDeviceModel(maybeDevice.value()); - RewritePatternSet patterns(context); - ConversionTarget conversionTarget(*context); - conversionTarget.addLegalDialect(); - conversionTarget.addIllegalOp(); - patterns.insert(context, deviceModel); - if (failed(applyPartialConversion(parentOp, conversionTarget, - std::move(patterns)))) { - return signalPassFailure(); + // First conversion: HalfDmaCpyNdOp to WriteBdOp, AddressPatchOp and + // PushToQueueOp. 
+ { + AMDAIE::AMDAIEDeviceModel deviceModel = + AMDAIE::getDeviceModel(maybeDevice.value()); + RewritePatternSet patterns(context); + ConversionTarget conversionTarget(*context); + conversionTarget.addLegalDialect(); + conversionTarget.addIllegalOp(); + patterns.insert(context, deviceModel); + + if (failed(applyPartialConversion(parentOp, conversionTarget, + std::move(patterns)))) { + return signalPassFailure(); + } + } + + // Second conversion: DmaWaitOp to TctSyncOp. + // The two conversions are separate to simplify the attribute handling, such + // as col, row, direction, channel, etc. + { + RewritePatternSet patterns(context); + ConversionTarget conversionTarget(*context); + conversionTarget.addLegalDialect(); + conversionTarget.addIllegalOp(); + patterns.insert(context); + if (failed(applyPartialConversion(parentOp, conversionTarget, + std::move(patterns)))) { + return signalPassFailure(); + } } } diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEControlCodeToTransaction.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEControlCodeToTransaction.cpp index 0c1cf7ef9..421900d6a 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEControlCodeToTransaction.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEControlCodeToTransaction.cpp @@ -199,50 +199,11 @@ LogicalResult convertOp(AMDAIE::NpuAddressPatchOp op, return success(); } -LogicalResult convertOp(AMDAIE::NpuDmaWaitOp op, TransactionBuilder &builder) { - // Collect all half DMA ops from the async tokens. - SmallVector pushToQueueOps; - for (Value asyncToken : op.getAsyncTokens()) { - auto pushToQueueOp = dyn_cast_if_present( - asyncToken.getDefiningOp()); - if (!pushToQueueOp) { - return op.emitOpError() - << "should operate on an `amdaie.push_to_queue` op async token"; - } - pushToQueueOps.push_back(pushToQueueOp); - } - // Sort the half DMA ops by channel, direction, row, and column. 
- std::sort(pushToQueueOps.begin(), pushToQueueOps.end(), - [](AMDAIE::NpuPushToQueueOp a, AMDAIE::NpuPushToQueueOp b) { - return std::make_tuple(a.getChannel(), a.getDirection(), - a.getRow(), a.getCol()) < - std::make_tuple(b.getChannel(), b.getDirection(), - b.getRow(), b.getCol()); - }); - // Batch DMA operations with the same row, channel, and direction into a - // single TCT sync operation, as long as they have consecutive columns. - llvm::MapVector columnBatches; - for (auto pushToQueueOp : pushToQueueOps) { - if (!columnBatches.empty()) { - auto &[lastPushOp, lastColNum] = columnBatches.back(); - if (lastPushOp.getRow() == pushToQueueOp.getRow() && - lastPushOp.getCol() + lastColNum == pushToQueueOp.getCol() && - lastPushOp.getDirection() == pushToQueueOp.getDirection() && - lastPushOp.getChannel() == pushToQueueOp.getChannel()) { - ++lastColNum; - continue; - } - } - columnBatches.insert({pushToQueueOp, 1}); - } - // Convert to TCT sync ops. - for (auto &[pushToQueueOp, colNum] : columnBatches) { - if (failed(builder.appendTCTSync( - pushToQueueOp.getCol(), pushToQueueOp.getRow(), - static_cast(pushToQueueOp.getDirection()), 1, colNum, - pushToQueueOp.getChannel()))) { - return failure(); - } +LogicalResult convertOp(AMDAIE::NpuTctSyncOp op, TransactionBuilder &builder) { + if (failed(builder.appendTCTSync( + op.getCol(), op.getRow(), static_cast(op.getDirection()), + op.getRowNum(), op.getColNum(), op.getChannel()))) { + return failure(); } return success(); } @@ -304,7 +265,7 @@ LogicalResult controlCodeToTransaction(IRRewriter &rewriter, WalkResult res = controlCodeOp->walk([&](Operation *op) { LogicalResult switchResult = TypeSwitch(op) - .Case( [&](auto npuOp) { if (failed(convertOp(npuOp, builder))) return failure(); diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/controlcode_lowering.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/controlcode_lowering.mlir index 26bad8b3b..74150676a 100644 --- 
a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/controlcode_lowering.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/controlcode_lowering.mlir @@ -62,13 +62,13 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} // CHECK: amdaie.npu.write_bd {bd_id = 0 : ui32, buffer_length = 2048 : ui32, buffer_offset = 0 : ui32, col = 0 : ui32, enable_packet = true, iteration_current = 0 : ui32, iteration_size = 0 : ui32, iteration_stride = 0 : ui32, lock_acq_enable = false, lock_acq_id = 0 : ui32, lock_acq_val = 0 : i32, lock_rel_id = 0 : ui32, lock_rel_val = 0 : i32, next_bd = 0 : ui32, out_of_order_id = 0 : ui32, packet_id = 0 : ui32, packet_type = 0 : ui32, paddings_after = array, paddings_before = array, row = 0 : ui32, sizes = array, strides = array, use_next_bd = false, valid_bd = true} // CHECK: amdaie.npu.address_patch {arg_idx = 0 : ui32, bd_id = 0 : ui32, col = 0 : ui32, offset = 0 : ui32} // CHECK: %[[TOKEN_0:.+]] = amdaie.npu.push_to_queue async {bd_id = 0 : ui32, channel = 0 : ui32, col = 0 : ui32, direction = 1 : i32, repeat_count = 1 : ui32, row = 0 : ui32} -// CHECK: amdaie.npu.dma_wait(%[[TOKEN_0]] : !amdaie.async_token) +// CHECK: amdaie.npu.tct_sync {channel = 0 : ui32, col = 0 : ui32, col_num = 1 : ui32, direction = 1 : i32, row = 0 : ui32, row_num = 1 : ui32} %6 = amdaie.npu.half_dma_cpy_nd async %4(%5[0] [2048] [1] bd_id = %bd_id channel = %channel) : !amdaie.logicalobjectfifo> amdaie.npu.dma_wait(%6 : !amdaie.async_token) // CHECK: amdaie.npu.write_bd {bd_id = 0 : ui32, buffer_length = 1024 : ui32, buffer_offset = 0 : ui32, col = 0 : ui32, enable_packet = true, iteration_current = 0 : ui32, iteration_size = 0 : ui32, iteration_stride = 0 : ui32, lock_acq_enable = false, lock_acq_id = 0 : ui32, lock_acq_val = 0 : i32, lock_rel_id = 0 : ui32, lock_rel_val = 0 : i32, next_bd = 0 : ui32, out_of_order_id = 0 : ui32, packet_id = 0 : ui32, packet_type = 0 : ui32, paddings_after = array, 
paddings_before = array, row = 0 : ui32, sizes = array, strides = array, use_next_bd = false, valid_bd = true} // CHECK: amdaie.npu.address_patch {arg_idx = 0 : ui32, bd_id = 0 : ui32, col = 0 : ui32, offset = 0 : ui32} // CHECK: %[[TOKEN_1:.+]] = amdaie.npu.push_to_queue async {bd_id = 0 : ui32, channel = 0 : ui32, col = 0 : ui32, direction = 1 : i32, repeat_count = 2 : ui32, row = 0 : ui32} -// CHECK: amdaie.npu.dma_wait(%[[TOKEN_1]] : !amdaie.async_token) +// CHECK: amdaie.npu.tct_sync {channel = 0 : ui32, col = 0 : ui32, col_num = 1 : ui32, direction = 1 : i32, row = 0 : ui32, row_num = 1 : ui32} %7 = amdaie.npu.half_dma_cpy_nd async %4(%5[0, 0, 0, 0] [2, 4, 16, 16] [0, 64, 8, 1] bd_id = %bd_id channel = %channel) : !amdaie.logicalobjectfifo> amdaie.npu.dma_wait(%7 : !amdaie.async_token) amdaie.end @@ -113,13 +113,13 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} // CHECK: amdaie.npu.write_bd {bd_id = 0 : ui32, buffer_length = 1024 : ui32, buffer_offset = 0 : ui32, col = 0 : ui32, enable_packet = true, iteration_current = 0 : ui32, iteration_size = 0 : ui32, iteration_stride = 0 : ui32, lock_acq_enable = false, lock_acq_id = 0 : ui32, lock_acq_val = 0 : i32, lock_rel_id = 0 : ui32, lock_rel_val = 0 : i32, next_bd = 0 : ui32, out_of_order_id = 0 : ui32, packet_id = 0 : ui32, packet_type = 0 : ui32, paddings_after = array, paddings_before = array, row = 0 : ui32, sizes = array, strides = array, use_next_bd = false, valid_bd = true} // CHECK: amdaie.npu.address_patch {arg_idx = 0 : ui32, bd_id = 0 : ui32, col = 0 : ui32, offset = 0 : ui32} // CHECK: %[[TOKEN_0:.+]] = amdaie.npu.push_to_queue async {bd_id = 0 : ui32, channel = 0 : ui32, col = 0 : ui32, direction = 1 : i32, repeat_count = 1 : ui32, row = 0 : ui32} -// CHECK: amdaie.npu.dma_wait(%[[TOKEN_0]] : !amdaie.async_token) +// CHECK: amdaie.npu.tct_sync {channel = 0 : ui32, col = 0 : ui32, col_num = 1 : ui32, direction = 1 : i32, row = 0 : ui32, row_num = 1 : ui32} %6 = 
amdaie.npu.half_dma_cpy_nd async %4(%5[0] [2048] [1] bd_id = %bd_id channel = %channel) : !amdaie.logicalobjectfifo> amdaie.npu.dma_wait(%6 : !amdaie.async_token) // CHECK: amdaie.npu.write_bd {bd_id = 0 : ui32, buffer_length = 512 : ui32, buffer_offset = 0 : ui32, col = 0 : ui32, enable_packet = true, iteration_current = 0 : ui32, iteration_size = 0 : ui32, iteration_stride = 0 : ui32, lock_acq_enable = false, lock_acq_id = 0 : ui32, lock_acq_val = 0 : i32, lock_rel_id = 0 : ui32, lock_rel_val = 0 : i32, next_bd = 0 : ui32, out_of_order_id = 0 : ui32, packet_id = 0 : ui32, packet_type = 0 : ui32, paddings_after = array, paddings_before = array, row = 0 : ui32, sizes = array, strides = array, use_next_bd = false, valid_bd = true} // CHECK: amdaie.npu.address_patch {arg_idx = 0 : ui32, bd_id = 0 : ui32, col = 0 : ui32, offset = 64 : ui32} // CHECK: %[[TOKEN_1:.+]] = amdaie.npu.push_to_queue async {bd_id = 0 : ui32, channel = 0 : ui32, col = 0 : ui32, direction = 1 : i32, repeat_count = 2 : ui32, row = 0 : ui32} -// CHECK: amdaie.npu.dma_wait(%[[TOKEN_1]] : !amdaie.async_token) +// CHECK: amdaie.npu.tct_sync {channel = 0 : ui32, col = 0 : ui32, col_num = 1 : ui32, direction = 1 : i32, row = 0 : ui32, row_num = 1 : ui32} %7 = amdaie.npu.half_dma_cpy_nd async %4(%5[0, 0, 0, 32] [2, 4, 16, 16] [0, 64, 8, 1] bd_id = %bd_id channel = %channel) : !amdaie.logicalobjectfifo> amdaie.npu.dma_wait(%7 : !amdaie.async_token) amdaie.end @@ -169,7 +169,7 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} // CHECK: amdaie.npu.write_bd {bd_id = 2 : ui32, buffer_length = 1024 : ui32, buffer_offset = 0 : ui32, col = 0 : ui32, enable_packet = false, iteration_current = 0 : ui32, iteration_size = 0 : ui32, iteration_stride = 0 : ui32, lock_acq_enable = false, lock_acq_id = 0 : ui32, lock_acq_val = 0 : i32, lock_rel_id = 0 : ui32, lock_rel_val = 0 : i32, next_bd = 0 : ui32, out_of_order_id = 0 : ui32, packet_id = 0 : ui32, packet_type = 0 : ui32, 
paddings_after = array, paddings_before = array, row = 0 : ui32, sizes = array, strides = array, use_next_bd = false, valid_bd = true} // CHECK: amdaie.npu.address_patch {arg_idx = 0 : ui32, bd_id = 2 : ui32, col = 0 : ui32, offset = 0 : ui32} // CHECK: %[[TOKEN_0:.+]] = amdaie.npu.push_to_queue async {bd_id = 0 : ui32, channel = 0 : ui32, col = 0 : ui32, direction = 1 : i32, repeat_count = 2 : ui32, row = 0 : ui32} -// CHECK: amdaie.npu.dma_wait(%[[TOKEN_0]] : !amdaie.async_token) +// CHECK: amdaie.npu.tct_sync {channel = 0 : ui32, col = 0 : ui32, col_num = 1 : ui32, direction = 1 : i32, row = 0 : ui32, row_num = 1 : ui32} %6 = amdaie.npu.half_dma_cpy_nd async %4(%5[0, 0, 0, 0] [2, 4, 16, 16] [0, 64, 8, 1] bd_id = %bd_id_2 channel = %channel start_bd = %bd_id) : !amdaie.logicalobjectfifo> amdaie.npu.dma_wait(%6 : !amdaie.async_token) amdaie.end @@ -178,3 +178,203 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} return } } + +// ----- + +// Expect four `push_to_queue` operations on the same `row`, `direction`, and `channel` +// but with different `col` values. The order of the `col` values is 0, 3, 2, 1. +// After sorting the `col` values, the batched `dma_wait` operation will be converted to +// a single `tct_sync` operation, with the `col` set to 0 and `col_num` set to 4. 
+// CHECK-LABEL: @batched_dma_wait_with_same_row_channel_direction +// CHECK: amdaie.controlcode +#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> +#pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect> +module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { + func.func @batched_dma_wait_with_same_row_channel_direction() { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c2 = arith.constant 2 : index + %c3 = arith.constant 3 : index + amdaie.workgroup { + %tile_0_1 = amdaie.tile(%c0, %c1) + %tile_0_0 = amdaie.tile(%c0, %c0) + %tile_1_1 = amdaie.tile(%c1, %c1) + %tile_1_0 = amdaie.tile(%c1, %c0) + %tile_2_1 = amdaie.tile(%c2, %c1) + %tile_2_0 = amdaie.tile(%c2, %c0) + %tile_3_1 = amdaie.tile(%c3, %c1) + %tile_3_0 = amdaie.tile(%c3, %c0) + %buffer = amdaie.buffer(%tile_0_1) : memref<2048xi32, 1 : i32> + %buffer_0 = amdaie.buffer(%tile_0_1) : memref<2048xi32, 1 : i32> + %buffer_1 = amdaie.buffer(%tile_1_1) : memref<2048xi32, 1 : i32> + %buffer_2 = amdaie.buffer(%tile_1_1) : memref<2048xi32, 1 : i32> + %buffer_3 = amdaie.buffer(%tile_2_1) : memref<2048xi32, 1 : i32> + %buffer_4 = amdaie.buffer(%tile_2_1) : memref<2048xi32, 1 : i32> + %buffer_5 = amdaie.buffer(%tile_3_1) : memref<2048xi32, 1 : i32> + %buffer_6 = amdaie.buffer(%tile_3_1) : memref<2048xi32, 1 : i32> + %lock = amdaie.lock(%tile_0_1(4), 4) + %lock_7 = amdaie.lock(%tile_0_1(5), 0) + %lock_8 = amdaie.lock(%tile_1_1(4), 4) + %lock_9 = amdaie.lock(%tile_1_1(5), 0) + %lock_10 = amdaie.lock(%tile_2_1(4), 4) + %lock_11 = amdaie.lock(%tile_2_1(5), 0) + %lock_12 = amdaie.lock(%tile_3_1(4), 4) + %lock_13 = amdaie.lock(%tile_3_1(5), 0) + %0 = amdaie.logicalobjectfifo.from_buffers({%buffer, %buffer_0}, {%lock}, {%lock_7}) : memref<2048xi32, 1 : i32>, memref<2048xi32, 1 : i32> -> !amdaie.logicalobjectfifo, 2> + %1 = 
hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<64x32xi32> + %2 = amdaie.logicalobjectfifo.placeholder{%tile_0_0} : !amdaie.logicalobjectfifo> + %3 = amdaie.logicalobjectfifo.from_buffers({%buffer_1, %buffer_2}, {%lock_8}, {%lock_9}) : memref<2048xi32, 1 : i32>, memref<2048xi32, 1 : i32> -> !amdaie.logicalobjectfifo, 2> + %4 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<64x32xi32> + %5 = amdaie.logicalobjectfifo.placeholder{%tile_1_0} : !amdaie.logicalobjectfifo> + %6 = amdaie.logicalobjectfifo.from_buffers({%buffer_3, %buffer_4}, {%lock_10}, {%lock_11}) : memref<2048xi32, 1 : i32>, memref<2048xi32, 1 : i32> -> !amdaie.logicalobjectfifo, 2> + %7 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<64x32xi32> + %8 = amdaie.logicalobjectfifo.placeholder{%tile_2_0} : !amdaie.logicalobjectfifo> + %9 = amdaie.logicalobjectfifo.from_buffers({%buffer_5, %buffer_6}, {%lock_12}, {%lock_13}) : memref<2048xi32, 1 : i32>, memref<2048xi32, 1 : i32> -> !amdaie.logicalobjectfifo, 2> + %10 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<64x32xi32> + %11 = amdaie.logicalobjectfifo.placeholder{%tile_3_0} : !amdaie.logicalobjectfifo> + %channel = amdaie.channel(%tile_0_0, 0, port_type = DMA, direction = MM2S) + %channel_14 = amdaie.channel(%tile_0_1, 0, port_type = DMA, direction = S2MM) + %channel_15 = amdaie.channel(%tile_1_0, 0, port_type = DMA, direction = MM2S) + %channel_16 = amdaie.channel(%tile_1_1, 0, port_type = DMA, direction = S2MM) + %channel_17 = amdaie.channel(%tile_2_0, 0, port_type = DMA, direction = MM2S) + %channel_18 = amdaie.channel(%tile_2_1, 0, port_type = DMA, direction = S2MM) + %channel_19 = amdaie.channel(%tile_3_0, 0, port_type = DMA, 
direction = MM2S) + %channel_20 = amdaie.channel(%tile_3_1, 0, port_type = DMA, direction = S2MM) + %12 = amdaie.flow({%channel} -> {%channel_14}) {is_packet_flow = false} + %13 = amdaie.connection(%0 {%channel_14}, %2 {%channel}, flow = %12) {connection_type = #amdaie} : (!amdaie.logicalobjectfifo, 2>, !amdaie.logicalobjectfifo>) + %14 = amdaie.flow({%channel_15} -> {%channel_16}) {is_packet_flow = false} + %15 = amdaie.connection(%3 {%channel_16}, %5 {%channel_15}, flow = %14) {connection_type = #amdaie} : (!amdaie.logicalobjectfifo, 2>, !amdaie.logicalobjectfifo>) + %16 = amdaie.flow({%channel_17} -> {%channel_18}) {is_packet_flow = false} + %17 = amdaie.connection(%6 {%channel_18}, %8 {%channel_17}, flow = %16) {connection_type = #amdaie} : (!amdaie.logicalobjectfifo, 2>, !amdaie.logicalobjectfifo>) + %18 = amdaie.flow({%channel_19} -> {%channel_20}) {is_packet_flow = false} + %19 = amdaie.connection(%9 {%channel_20}, %11 {%channel_19}, flow = %18) {connection_type = #amdaie} : (!amdaie.logicalobjectfifo, 2>, !amdaie.logicalobjectfifo>) + amdaie.controlcode { + %20 = amdaie.logicalobjectfifo.from_memref %1, {%tile_0_0} : memref<64x32xi32> -> !amdaie.logicalobjectfifo> + memref.assume_alignment %1, 64 : memref<64x32xi32> + %21 = amdaie.logicalobjectfifo.from_memref %4, {%tile_1_0} : memref<64x32xi32> -> !amdaie.logicalobjectfifo> + memref.assume_alignment %4, 64 : memref<64x32xi32> + %22 = amdaie.logicalobjectfifo.from_memref %7, {%tile_2_0} : memref<64x32xi32> -> !amdaie.logicalobjectfifo> + memref.assume_alignment %7, 64 : memref<64x32xi32> + %23 = amdaie.logicalobjectfifo.from_memref %10, {%tile_3_0} : memref<64x32xi32> -> !amdaie.logicalobjectfifo> + memref.assume_alignment %10, 64 : memref<64x32xi32> + %bd_id = amdaie.bd_id(%tile_0_0, %c0) +// CHECK: %[[TOKEN_0:.+]] = amdaie.npu.push_to_queue async {bd_id = 0 : ui32, channel = 0 : ui32, col = 0 : ui32, direction = 1 : i32, repeat_count = 1 : ui32, row = 0 : ui32} + %24 = amdaie.npu.half_dma_cpy_nd async 
%13(%20 [] [] [] bd_id = %bd_id channel = %channel) : !amdaie.logicalobjectfifo> + %bd_id_21 = amdaie.bd_id(%tile_3_0, %c0) +// CHECK: %[[TOKEN_1:.+]] = amdaie.npu.push_to_queue async {bd_id = 0 : ui32, channel = 0 : ui32, col = 3 : ui32, direction = 1 : i32, repeat_count = 1 : ui32, row = 0 : ui32} + %25 = amdaie.npu.half_dma_cpy_nd async %19(%23 [] [] [] bd_id = %bd_id_21 channel = %channel_19) : !amdaie.logicalobjectfifo> + %bd_id_22 = amdaie.bd_id(%tile_2_0, %c0) +// CHECK: %[[TOKEN_2:.+]] = amdaie.npu.push_to_queue async {bd_id = 0 : ui32, channel = 0 : ui32, col = 2 : ui32, direction = 1 : i32, repeat_count = 1 : ui32, row = 0 : ui32} + %26 = amdaie.npu.half_dma_cpy_nd async %17(%22 [] [] [] bd_id = %bd_id_22 channel = %channel_17) : !amdaie.logicalobjectfifo> + %bd_id_23 = amdaie.bd_id(%tile_1_0, %c0) +// CHECK: %[[TOKEN_3:.+]] = amdaie.npu.push_to_queue async {bd_id = 0 : ui32, channel = 0 : ui32, col = 1 : ui32, direction = 1 : i32, repeat_count = 1 : ui32, row = 0 : ui32} + %27 = amdaie.npu.half_dma_cpy_nd async %15(%21 [] [] [] bd_id = %bd_id_23 channel = %channel_15) : !amdaie.logicalobjectfifo> +// CHECK: amdaie.npu.tct_sync {channel = 0 : ui32, col = 0 : ui32, col_num = 4 : ui32, direction = 1 : i32, row = 0 : ui32, row_num = 1 : ui32} + amdaie.npu.dma_wait(%24, %25, %26, %27 : !amdaie.async_token, !amdaie.async_token, !amdaie.async_token, !amdaie.async_token) + amdaie.end + } + } + return + } +} + +// ----- + + +// The batched `dma_wait` operation will be converted to four `tct_sync` operations, +// which operate on different `direction` and `channel` values. 
+// CHECK-LABEL: @batched_dma_wait_with_diff_row_channel_direction +// CHECK: amdaie.controlcode +#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> +#pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect> +module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { + func.func @batched_dma_wait_with_diff_row_channel_direction() { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c2 = arith.constant 2 : index + %c3 = arith.constant 3 : index + amdaie.workgroup { + %tile_0_1 = amdaie.tile(%c0, %c1) + %tile_0_0 = amdaie.tile(%c0, %c0) + %tile_1_1 = amdaie.tile(%c1, %c1) + %tile_1_0 = amdaie.tile(%c1, %c0) + %tile_2_1 = amdaie.tile(%c2, %c1) + %tile_2_0 = amdaie.tile(%c2, %c0) + %tile_3_1 = amdaie.tile(%c3, %c1) + %tile_3_0 = amdaie.tile(%c3, %c0) + %buffer = amdaie.buffer(%tile_0_1) : memref<2048xi32, 1 : i32> + %buffer_0 = amdaie.buffer(%tile_0_1) : memref<2048xi32, 1 : i32> + %buffer_1 = amdaie.buffer(%tile_1_1) : memref<2048xi32, 1 : i32> + %buffer_2 = amdaie.buffer(%tile_1_1) : memref<2048xi32, 1 : i32> + %buffer_3 = amdaie.buffer(%tile_2_1) : memref<2048xi32, 1 : i32> + %buffer_4 = amdaie.buffer(%tile_2_1) : memref<2048xi32, 1 : i32> + %buffer_5 = amdaie.buffer(%tile_3_1) : memref<2048xi32, 1 : i32> + %buffer_6 = amdaie.buffer(%tile_3_1) : memref<2048xi32, 1 : i32> + %lock = amdaie.lock(%tile_0_1(4), 4) + %lock_7 = amdaie.lock(%tile_0_1(5), 0) + %lock_8 = amdaie.lock(%tile_1_1(4), 4) + %lock_9 = amdaie.lock(%tile_1_1(5), 0) + %lock_10 = amdaie.lock(%tile_2_1(4), 4) + %lock_11 = amdaie.lock(%tile_2_1(5), 0) + %lock_12 = amdaie.lock(%tile_3_1(4), 4) + %lock_13 = amdaie.lock(%tile_3_1(5), 0) + %0 = amdaie.logicalobjectfifo.from_buffers({%buffer, %buffer_0}, {%lock}, {%lock_7}) : memref<2048xi32, 1 : i32>, memref<2048xi32, 1 : i32> -> !amdaie.logicalobjectfifo, 2> + %1 = 
hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags(Indirect) : memref<64x32xi32> + %2 = amdaie.logicalobjectfifo.placeholder{%tile_0_0} : !amdaie.logicalobjectfifo> + %3 = amdaie.logicalobjectfifo.from_buffers({%buffer_1, %buffer_2}, {%lock_8}, {%lock_9}) : memref<2048xi32, 1 : i32>, memref<2048xi32, 1 : i32> -> !amdaie.logicalobjectfifo, 2> + %4 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags(Indirect) : memref<64x32xi32> + %5 = amdaie.logicalobjectfifo.placeholder{%tile_1_0} : !amdaie.logicalobjectfifo> + %6 = amdaie.logicalobjectfifo.from_buffers({%buffer_3, %buffer_4}, {%lock_10}, {%lock_11}) : memref<2048xi32, 1 : i32>, memref<2048xi32, 1 : i32> -> !amdaie.logicalobjectfifo, 2> + %7 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<64x32xi32> + %8 = amdaie.logicalobjectfifo.placeholder{%tile_2_0} : !amdaie.logicalobjectfifo> + %9 = amdaie.logicalobjectfifo.from_buffers({%buffer_5, %buffer_6}, {%lock_12}, {%lock_13}) : memref<2048xi32, 1 : i32>, memref<2048xi32, 1 : i32> -> !amdaie.logicalobjectfifo, 2> + %10 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<64x32xi32> + %11 = amdaie.logicalobjectfifo.placeholder{%tile_3_0} : !amdaie.logicalobjectfifo> + %channel = amdaie.channel(%tile_0_0, 0, port_type = DMA, direction = S2MM) + %channel_14 = amdaie.channel(%tile_0_1, 0, port_type = DMA, direction = MM2S) + %channel_15 = amdaie.channel(%tile_1_0, 1, port_type = DMA, direction = S2MM) + %channel_16 = amdaie.channel(%tile_1_1, 1, port_type = DMA, direction = MM2S) + %channel_17 = amdaie.channel(%tile_2_0, 0, port_type = DMA, direction = MM2S) + %channel_18 = amdaie.channel(%tile_2_1, 0, port_type = DMA, direction = S2MM) + %channel_19 = amdaie.channel(%tile_3_0, 1, port_type = DMA, direction = MM2S) + 
%channel_20 = amdaie.channel(%tile_3_1, 1, port_type = DMA, direction = S2MM) + %12 = amdaie.flow({%channel_14} -> {%channel}) {is_packet_flow = false} + %13 = amdaie.connection(%2 {%channel}, %0 {%channel_14}, flow = %12) {connection_type = #amdaie} : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo, 2>) + %14 = amdaie.flow({%channel_16} -> {%channel_15}) {is_packet_flow = false} + %15 = amdaie.connection(%5 {%channel_15}, %3 {%channel_16}, flow = %14) {connection_type = #amdaie} : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo, 2>) + %16 = amdaie.flow({%channel_17} -> {%channel_18}) {is_packet_flow = false} + %17 = amdaie.connection(%6 {%channel_18}, %8 {%channel_17}, flow = %16) {connection_type = #amdaie} : (!amdaie.logicalobjectfifo, 2>, !amdaie.logicalobjectfifo>) + %18 = amdaie.flow({%channel_19} -> {%channel_20}) {is_packet_flow = false} + %19 = amdaie.connection(%9 {%channel_20}, %11 {%channel_19}, flow = %18) {connection_type = #amdaie} : (!amdaie.logicalobjectfifo, 2>, !amdaie.logicalobjectfifo>) + amdaie.controlcode { + %20 = amdaie.logicalobjectfifo.from_memref %1, {%tile_0_0} : memref<64x32xi32> -> !amdaie.logicalobjectfifo> + memref.assume_alignment %1, 64 : memref<64x32xi32> + %21 = amdaie.logicalobjectfifo.from_memref %4, {%tile_1_0} : memref<64x32xi32> -> !amdaie.logicalobjectfifo> + memref.assume_alignment %4, 64 : memref<64x32xi32> + %22 = amdaie.logicalobjectfifo.from_memref %7, {%tile_2_0} : memref<64x32xi32> -> !amdaie.logicalobjectfifo> + memref.assume_alignment %7, 64 : memref<64x32xi32> + %23 = amdaie.logicalobjectfifo.from_memref %10, {%tile_3_0} : memref<64x32xi32> -> !amdaie.logicalobjectfifo> + memref.assume_alignment %10, 64 : memref<64x32xi32> + %bd_id = amdaie.bd_id(%tile_0_0, %c0) +// CHECK: %[[TOKEN_0:.+]] = amdaie.npu.push_to_queue async {bd_id = 0 : ui32, channel = 0 : ui32, col = 0 : ui32, direction = 0 : i32, repeat_count = 1 : ui32, row = 0 : ui32} + %24 = amdaie.npu.half_dma_cpy_nd async %13(%20 [] [] [] bd_id 
= %bd_id channel = %channel) : !amdaie.logicalobjectfifo> + %bd_id_21 = amdaie.bd_id(%tile_1_0, %c0) +// CHECK: %[[TOKEN_1:.+]] = amdaie.npu.push_to_queue async {bd_id = 0 : ui32, channel = 1 : ui32, col = 1 : ui32, direction = 0 : i32, repeat_count = 1 : ui32, row = 0 : ui32} + %25 = amdaie.npu.half_dma_cpy_nd async %15(%21 [] [] [] bd_id = %bd_id_21 channel = %channel_15) : !amdaie.logicalobjectfifo> + %bd_id_22 = amdaie.bd_id(%tile_2_0, %c0) +// CHECK: %[[TOKEN_2:.+]] = amdaie.npu.push_to_queue async {bd_id = 0 : ui32, channel = 0 : ui32, col = 2 : ui32, direction = 1 : i32, repeat_count = 1 : ui32, row = 0 : ui32} + %26 = amdaie.npu.half_dma_cpy_nd async %17(%22 [] [] [] bd_id = %bd_id_22 channel = %channel_17) : !amdaie.logicalobjectfifo> + %bd_id_23 = amdaie.bd_id(%tile_3_0, %c0) +// CHECK: %[[TOKEN_3:.+]] = amdaie.npu.push_to_queue async {bd_id = 0 : ui32, channel = 1 : ui32, col = 3 : ui32, direction = 1 : i32, repeat_count = 1 : ui32, row = 0 : ui32} + %27 = amdaie.npu.half_dma_cpy_nd async %19(%23 [] [] [] bd_id = %bd_id_23 channel = %channel_19) : !amdaie.logicalobjectfifo> +// CHECK: amdaie.npu.tct_sync {channel = 0 : ui32, col = 0 : ui32, col_num = 1 : ui32, direction = 0 : i32, row = 0 : ui32, row_num = 1 : ui32} +// CHECK: amdaie.npu.tct_sync {channel = 1 : ui32, col = 1 : ui32, col_num = 1 : ui32, direction = 0 : i32, row = 0 : ui32, row_num = 1 : ui32} +// CHECK: amdaie.npu.tct_sync {channel = 0 : ui32, col = 2 : ui32, col_num = 1 : ui32, direction = 1 : i32, row = 0 : ui32, row_num = 1 : ui32} +// CHECK: amdaie.npu.tct_sync {channel = 1 : ui32, col = 3 : ui32, col_num = 1 : ui32, direction = 1 : i32, row = 0 : ui32, row_num = 1 : ui32} + amdaie.npu.dma_wait(%24, %25, %26, %27 : !amdaie.async_token, !amdaie.async_token, !amdaie.async_token, !amdaie.async_token) + amdaie.end + } + } + return + } +} diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/controlcode_to_transaction.mlir 
b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/controlcode_to_transaction.mlir index f36ad7fa2..057ffebd5 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/controlcode_to_transaction.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/controlcode_to_transaction.mlir @@ -121,6 +121,7 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} // ----- +// `tct_sync` on a single column. // CHECK: 0x06030100 // CHECK: 0x00000105 // CHECK: 0x00000002 @@ -135,15 +136,15 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} // CHECK: 0x00000010 // CHECK: 0x00020001 // CHECK: 0x00010100 -// CHECK-LABEL: @async_push_to_queue_and_wait +// CHECK-LABEL: @tct_sync_single_column // CHECK: npu_instructions = dense_resource : tensor<14xui32> #executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { - func.func @async_push_to_queue_and_wait() { + func.func @tct_sync_single_column() { amdaie.workgroup { amdaie.controlcode { %0 = amdaie.npu.push_to_queue async {bd_id = 15 : ui32, channel = 0 : ui32, col = 2 : ui32, direction = 1 : i32, repeat_count = 256 : ui32, row = 0 : ui32} - amdaie.npu.dma_wait(%0 : !amdaie.async_token) + amdaie.npu.tct_sync {channel = 0 : ui32, col = 2 : ui32, col_num = 1 : ui32, direction = 1 : i32, row = 0 : ui32, row_num = 1 : ui32} amdaie.end } } @@ -153,8 +154,7 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} // ----- -// Same channel, direction, and row, but different col. -// Expect one TCT sync operation (0x00000080), with col_num = 4. +// Expect one `tct_sync` to cover four columns, with the same channel, direction, and row. 
// CHECK: 0x06030100 // CHECK: 0x00000105 // CHECK: 0x00000005 @@ -187,85 +187,18 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} // CHECK: 0x00000010 // CHECK: 0x00000001 // CHECK: 0x00040100 -// CHECK-LABEL: @async_push_to_queue_and_wait_col_num +// CHECK-LABEL: @tct_sync_multiple_columns // CHECK: npu_instructions = dense_resource : tensor<32xui32> #executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { - func.func @async_push_to_queue_and_wait_col_num() { + func.func @tct_sync_multiple_columns() { amdaie.workgroup { amdaie.controlcode { %0 = amdaie.npu.push_to_queue async {bd_id = 0 : ui32, channel = 0 : ui32, col = 0 : ui32, direction = 1 : i32, repeat_count = 1 : ui32, row = 0 : ui32} %1 = amdaie.npu.push_to_queue async {bd_id = 0 : ui32, channel = 0 : ui32, col = 3 : ui32, direction = 1 : i32, repeat_count = 1 : ui32, row = 0 : ui32} %2 = amdaie.npu.push_to_queue async {bd_id = 0 : ui32, channel = 0 : ui32, col = 2 : ui32, direction = 1 : i32, repeat_count = 1 : ui32, row = 0 : ui32} %3 = amdaie.npu.push_to_queue async {bd_id = 0 : ui32, channel = 0 : ui32, col = 1 : ui32, direction = 1 : i32, repeat_count = 1 : ui32, row = 0 : ui32} - amdaie.npu.dma_wait(%0, %1, %2, %3 : !amdaie.async_token, !amdaie.async_token, !amdaie.async_token, !amdaie.async_token) - amdaie.end - } - } - return - } -} - -// ----- - -// Completely different channels, directions, rows, and cols. -// Expect four TCT sync operations (0x00000080).
-// CHECK: 0x06030100 -// CHECK: 0x00000105 -// CHECK: 0x00000008 -// CHECK: 0x000000B0 -// CHECK: 0x00000000 -// CHECK: 0x00000000 -// CHECK: 0x0001D214 -// CHECK: 0x00000000 -// CHECK: 0x80000000 -// CHECK: 0x00000018 -// CHECK: 0x00000000 -// CHECK: 0x00000000 -// CHECK: 0x0201D21C -// CHECK: 0x00000000 -// CHECK: 0x80000000 -// CHECK: 0x00000018 -// CHECK: 0x00000000 -// CHECK: 0x00000000 -// CHECK: 0x0401D204 -// CHECK: 0x00000000 -// CHECK: 0x80000000 -// CHECK: 0x00000018 -// CHECK: 0x00000000 -// CHECK: 0x00000000 -// CHECK: 0x0601D20C -// CHECK: 0x00000000 -// CHECK: 0x80000000 -// CHECK: 0x00000018 -// CHECK: 0x00000080 -// CHECK: 0x00000010 -// CHECK: 0x00020000 -// CHECK: 0x00010100 -// CHECK: 0x00000080 -// CHECK: 0x00000010 -// CHECK: 0x00000001 -// CHECK: 0x00010100 -// CHECK: 0x00000080 -// CHECK: 0x00000010 -// CHECK: 0x00030000 -// CHECK: 0x01010100 -// CHECK: 0x00000080 -// CHECK: 0x00000010 -// CHECK: 0x00010001 -// CHECK: 0x01010100 -// CHECK-LABEL: @wait_different_row_col_channel_direction -// CHECK: npu_instructions = dense_resource : tensor<44xui32> -#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> -module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { - func.func @wait_different_row_col_channel_direction() { - amdaie.workgroup { - amdaie.controlcode { - %0 = amdaie.npu.push_to_queue async {bd_id = 0 : ui32, channel = 0 : ui32, col = 0 : ui32, direction = 1 : i32, repeat_count = 1 : ui32, row = 0 : ui32} - %1 = amdaie.npu.push_to_queue async {bd_id = 0 : ui32, channel = 1 : ui32, col = 1 : ui32, direction = 1 : i32, repeat_count = 1 : ui32, row = 0 : ui32} - %2 = amdaie.npu.push_to_queue async {bd_id = 0 : ui32, channel = 0 : ui32, col = 2 : ui32, direction = 0 : i32, repeat_count = 1 : ui32, row = 0 : ui32} - %3 = amdaie.npu.push_to_queue async {bd_id = 0 : ui32, channel = 1 : ui32, col = 3 : ui32, direction = 0 : i32, 
repeat_count = 1 : ui32, row = 0 : ui32} - amdaie.npu.dma_wait(%0, %1, %2, %3 : !amdaie.async_token, !amdaie.async_token, !amdaie.async_token, !amdaie.async_token) + amdaie.npu.tct_sync {channel = 0 : ui32, col = 0 : ui32, col_num = 4 : ui32, direction = 1 : i32, row = 0 : ui32, row_num = 1 : ui32} amdaie.end } }