From 3922f33c4798582f1884671ab7e628069c3d55ec Mon Sep 17 00:00:00 2001 From: Yu-Zhewen Date: Mon, 16 Dec 2024 23:27:26 +0000 Subject: [PATCH] first commit --- .../AMD-AIE/iree-amd-aie/IR/AMDAIEOps.td | 25 +++++++ .../Transforms/AMDAIEControlCodeLowering.cpp | 65 ++++++++++++++++--- .../AMDAIEControlCodeToTransaction.cpp | 21 ++---- .../Transforms/test/controlcode_lowering.mlir | 10 +-- .../test/controlcode_to_transaction.mlir | 2 +- 5 files changed, 92 insertions(+), 31 deletions(-) diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.td b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.td index 371945da7..17d2d604d 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.td +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.td @@ -1022,6 +1022,31 @@ def AMDAIE_NpuWriteBdOp: AMDAIE_Op<"npu.write_bd"> { let assemblyFormat = [{ attr-dict }]; } +def AMDAIE_NpuTctSyncOp: AMDAIE_Op<"npu.tct_sync"> { + let summary = "Wait for the TCTs to be emitted."; + let description = [{ + This NPU controller operation is used to synchronize the Task Completion Tokens (TCTs) + on the specified `channel` and `direction`. The ranges of tiles to synchronize + are defined by [col, col+col_num) and [row, row+row_num). 
+ + Example: + + ```mlir + amdaie.npu.tct_sync {col = 0 : ui32, row = 0 : ui32, channel = 0 : ui32, + direction = 1 : i32, col_num = 1 : ui32, row_num = 1 : ui32} + ``` + }]; + let arguments = ( + ins UI32Attr:$col, + UI32Attr:$row, + DMAChannelDir:$direction, + UI32Attr:$channel, + UI32Attr:$col_num, + UI32Attr:$row_num + ); + let assemblyFormat = [{ attr-dict }]; +} + //===----------------------------------------------------------------------===// // IREE AMDAIE LogicalObjectFifo Ops //===----------------------------------------------------------------------===// diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEControlCodeLowering.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEControlCodeLowering.cpp index cf40de2b0..0c4d72300 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEControlCodeLowering.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEControlCodeLowering.cpp @@ -236,6 +236,31 @@ struct HalfDmaCpyNdToNpuConverter final uint8_t minStrideBitWidth; }; +struct DmaWaitToTctSyncConverter final + : OpConversionPattern<AMDAIE::NpuDmaWaitOp> { + using OpConversionPattern::OpConversionPattern; + + LogicalResult matchAndRewrite( + AMDAIE::NpuDmaWaitOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + LLVM_DEBUG(llvm::dbgs() << "matchAndRewrite[AMDAIE::NpuDmaWaitOp]\n"); + + for (Value token : op.getAsyncTokens()) { + auto pushToQueueOp = + dyn_cast_if_present<AMDAIE::NpuPushToQueueOp>(token.getDefiningOp()); + if (!pushToQueueOp) { + return op.emitOpError() + << "should operate on an `amdaie.push_to_queue` op"; + } + rewriter.create<AMDAIE::NpuTctSyncOp>( + op.getLoc(), pushToQueueOp.getCol(), pushToQueueOp.getRow(), + pushToQueueOp.getDirection(), pushToQueueOp.getChannel(), 1, 1); + } + rewriter.eraseOp(op); + return success(); + } +}; + namespace { class AMDAIEControlCodeLoweringPass : public impl::AMDAIEControlCodeLoweringBase< @@ -260,17 +285,37 @@ void AMDAIEControlCodeLoweringPass::runOnOperation() { 
"ops."; return signalPassFailure(); } - AMDAIE::AMDAIEDeviceModel deviceModel = - AMDAIE::getDeviceModel(maybeDevice.value()); - RewritePatternSet patterns(context); - ConversionTarget conversionTarget(*context); - conversionTarget.addLegalDialect(); - conversionTarget.addIllegalOp(); - patterns.insert(context, deviceModel); - if (failed(applyPartialConversion(parentOp, conversionTarget, - std::move(patterns)))) { - return signalPassFailure(); + // First conversion: HalfDmaCpyNdOp to WriteBdOp, AddressPatchOp and + // PushToQueueOp. + { + AMDAIE::AMDAIEDeviceModel deviceModel = + AMDAIE::getDeviceModel(maybeDevice.value()); + RewritePatternSet patterns(context); + ConversionTarget conversionTarget(*context); + conversionTarget.addLegalDialect(); + conversionTarget.addIllegalOp(); + patterns.insert(context, deviceModel); + + if (failed(applyPartialConversion(parentOp, conversionTarget, + std::move(patterns)))) { + return signalPassFailure(); + } + } + + // Second conversion: DmaWaitOp to TctSyncOp. + // The two conversions are separate to simplify the attribute handling, such + // as col, row, direction, channel, etc. 
+ { + RewritePatternSet patterns(context); + ConversionTarget conversionTarget(*context); + conversionTarget.addLegalDialect(); + conversionTarget.addIllegalOp(); + patterns.insert(context); + if (failed(applyPartialConversion(parentOp, conversionTarget, + std::move(patterns)))) { + return signalPassFailure(); + } } } diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEControlCodeToTransaction.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEControlCodeToTransaction.cpp index 665ea08a8..421900d6a 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEControlCodeToTransaction.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEControlCodeToTransaction.cpp @@ -199,20 +199,11 @@ LogicalResult convertOp(AMDAIE::NpuAddressPatchOp op, return success(); } -LogicalResult convertOp(AMDAIE::NpuDmaWaitOp op, TransactionBuilder &builder) { - for (Value token : op.getAsyncTokens()) { - auto pushToQueueOp = - dyn_cast_if_present(token.getDefiningOp()); - if (!pushToQueueOp) { - return op.emitOpError() - << "should operate on an `amdaie.push_to_queue` op"; - } - if (failed(builder.appendTCTSync( - pushToQueueOp.getCol(), pushToQueueOp.getRow(), - static_cast(pushToQueueOp.getDirection()), 1, 1, - pushToQueueOp.getChannel()))) { - return failure(); - } +LogicalResult convertOp(AMDAIE::NpuTctSyncOp op, TransactionBuilder &builder) { + if (failed(builder.appendTCTSync( + op.getCol(), op.getRow(), static_cast(op.getDirection()), + op.getRowNum(), op.getColNum(), op.getChannel()))) { + return failure(); } return success(); } @@ -274,7 +265,7 @@ LogicalResult controlCodeToTransaction(IRRewriter &rewriter, WalkResult res = controlCodeOp->walk([&](Operation *op) { LogicalResult switchResult = TypeSwitch(op) - .Case( [&](auto npuOp) { if (failed(convertOp(npuOp, builder))) return failure(); diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/controlcode_lowering.mlir 
b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/controlcode_lowering.mlir index 26bad8b3b..300dcade9 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/controlcode_lowering.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/controlcode_lowering.mlir @@ -62,13 +62,13 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} // CHECK: amdaie.npu.write_bd {bd_id = 0 : ui32, buffer_length = 2048 : ui32, buffer_offset = 0 : ui32, col = 0 : ui32, enable_packet = true, iteration_current = 0 : ui32, iteration_size = 0 : ui32, iteration_stride = 0 : ui32, lock_acq_enable = false, lock_acq_id = 0 : ui32, lock_acq_val = 0 : i32, lock_rel_id = 0 : ui32, lock_rel_val = 0 : i32, next_bd = 0 : ui32, out_of_order_id = 0 : ui32, packet_id = 0 : ui32, packet_type = 0 : ui32, paddings_after = array, paddings_before = array, row = 0 : ui32, sizes = array, strides = array, use_next_bd = false, valid_bd = true} // CHECK: amdaie.npu.address_patch {arg_idx = 0 : ui32, bd_id = 0 : ui32, col = 0 : ui32, offset = 0 : ui32} // CHECK: %[[TOKEN_0:.+]] = amdaie.npu.push_to_queue async {bd_id = 0 : ui32, channel = 0 : ui32, col = 0 : ui32, direction = 1 : i32, repeat_count = 1 : ui32, row = 0 : ui32} -// CHECK: amdaie.npu.dma_wait(%[[TOKEN_0]] : !amdaie.async_token) +// CHECK: amdaie.npu.tct_sync {channel = 0 : ui32, col = 0 : ui32, col_num = 1 : ui32, direction = 1 : i32, row = 0 : ui32, row_num = 1 : ui32} %6 = amdaie.npu.half_dma_cpy_nd async %4(%5[0] [2048] [1] bd_id = %bd_id channel = %channel) : !amdaie.logicalobjectfifo> amdaie.npu.dma_wait(%6 : !amdaie.async_token) // CHECK: amdaie.npu.write_bd {bd_id = 0 : ui32, buffer_length = 1024 : ui32, buffer_offset = 0 : ui32, col = 0 : ui32, enable_packet = true, iteration_current = 0 : ui32, iteration_size = 0 : ui32, iteration_stride = 0 : ui32, lock_acq_enable = false, lock_acq_id = 0 : ui32, lock_acq_val = 0 : i32, lock_rel_id = 0 : ui32, lock_rel_val = 0 : 
i32, next_bd = 0 : ui32, out_of_order_id = 0 : ui32, packet_id = 0 : ui32, packet_type = 0 : ui32, paddings_after = array, paddings_before = array, row = 0 : ui32, sizes = array, strides = array, use_next_bd = false, valid_bd = true} // CHECK: amdaie.npu.address_patch {arg_idx = 0 : ui32, bd_id = 0 : ui32, col = 0 : ui32, offset = 0 : ui32} // CHECK: %[[TOKEN_1:.+]] = amdaie.npu.push_to_queue async {bd_id = 0 : ui32, channel = 0 : ui32, col = 0 : ui32, direction = 1 : i32, repeat_count = 2 : ui32, row = 0 : ui32} -// CHECK: amdaie.npu.dma_wait(%[[TOKEN_1]] : !amdaie.async_token) +// CHECK: amdaie.npu.tct_sync {channel = 0 : ui32, col = 0 : ui32, col_num = 1 : ui32, direction = 1 : i32, row = 0 : ui32, row_num = 1 : ui32} %7 = amdaie.npu.half_dma_cpy_nd async %4(%5[0, 0, 0, 0] [2, 4, 16, 16] [0, 64, 8, 1] bd_id = %bd_id channel = %channel) : !amdaie.logicalobjectfifo> amdaie.npu.dma_wait(%7 : !amdaie.async_token) amdaie.end @@ -113,13 +113,13 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} // CHECK: amdaie.npu.write_bd {bd_id = 0 : ui32, buffer_length = 1024 : ui32, buffer_offset = 0 : ui32, col = 0 : ui32, enable_packet = true, iteration_current = 0 : ui32, iteration_size = 0 : ui32, iteration_stride = 0 : ui32, lock_acq_enable = false, lock_acq_id = 0 : ui32, lock_acq_val = 0 : i32, lock_rel_id = 0 : ui32, lock_rel_val = 0 : i32, next_bd = 0 : ui32, out_of_order_id = 0 : ui32, packet_id = 0 : ui32, packet_type = 0 : ui32, paddings_after = array, paddings_before = array, row = 0 : ui32, sizes = array, strides = array, use_next_bd = false, valid_bd = true} // CHECK: amdaie.npu.address_patch {arg_idx = 0 : ui32, bd_id = 0 : ui32, col = 0 : ui32, offset = 0 : ui32} // CHECK: %[[TOKEN_0:.+]] = amdaie.npu.push_to_queue async {bd_id = 0 : ui32, channel = 0 : ui32, col = 0 : ui32, direction = 1 : i32, repeat_count = 1 : ui32, row = 0 : ui32} -// CHECK: amdaie.npu.dma_wait(%[[TOKEN_0]] : !amdaie.async_token) +// CHECK: amdaie.npu.tct_sync 
{channel = 0 : ui32, col = 0 : ui32, col_num = 1 : ui32, direction = 1 : i32, row = 0 : ui32, row_num = 1 : ui32} %6 = amdaie.npu.half_dma_cpy_nd async %4(%5[0] [2048] [1] bd_id = %bd_id channel = %channel) : !amdaie.logicalobjectfifo> amdaie.npu.dma_wait(%6 : !amdaie.async_token) // CHECK: amdaie.npu.write_bd {bd_id = 0 : ui32, buffer_length = 512 : ui32, buffer_offset = 0 : ui32, col = 0 : ui32, enable_packet = true, iteration_current = 0 : ui32, iteration_size = 0 : ui32, iteration_stride = 0 : ui32, lock_acq_enable = false, lock_acq_id = 0 : ui32, lock_acq_val = 0 : i32, lock_rel_id = 0 : ui32, lock_rel_val = 0 : i32, next_bd = 0 : ui32, out_of_order_id = 0 : ui32, packet_id = 0 : ui32, packet_type = 0 : ui32, paddings_after = array, paddings_before = array, row = 0 : ui32, sizes = array, strides = array, use_next_bd = false, valid_bd = true} // CHECK: amdaie.npu.address_patch {arg_idx = 0 : ui32, bd_id = 0 : ui32, col = 0 : ui32, offset = 64 : ui32} // CHECK: %[[TOKEN_1:.+]] = amdaie.npu.push_to_queue async {bd_id = 0 : ui32, channel = 0 : ui32, col = 0 : ui32, direction = 1 : i32, repeat_count = 2 : ui32, row = 0 : ui32} -// CHECK: amdaie.npu.dma_wait(%[[TOKEN_1]] : !amdaie.async_token) +// CHECK: amdaie.npu.tct_sync {channel = 0 : ui32, col = 0 : ui32, col_num = 1 : ui32, direction = 1 : i32, row = 0 : ui32, row_num = 1 : ui32} %7 = amdaie.npu.half_dma_cpy_nd async %4(%5[0, 0, 0, 32] [2, 4, 16, 16] [0, 64, 8, 1] bd_id = %bd_id channel = %channel) : !amdaie.logicalobjectfifo> amdaie.npu.dma_wait(%7 : !amdaie.async_token) amdaie.end @@ -169,7 +169,7 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} // CHECK: amdaie.npu.write_bd {bd_id = 2 : ui32, buffer_length = 1024 : ui32, buffer_offset = 0 : ui32, col = 0 : ui32, enable_packet = false, iteration_current = 0 : ui32, iteration_size = 0 : ui32, iteration_stride = 0 : ui32, lock_acq_enable = false, lock_acq_id = 0 : ui32, lock_acq_val = 0 : i32, lock_rel_id = 0 : ui32, 
lock_rel_val = 0 : i32, next_bd = 0 : ui32, out_of_order_id = 0 : ui32, packet_id = 0 : ui32, packet_type = 0 : ui32, paddings_after = array, paddings_before = array, row = 0 : ui32, sizes = array, strides = array, use_next_bd = false, valid_bd = true} // CHECK: amdaie.npu.address_patch {arg_idx = 0 : ui32, bd_id = 2 : ui32, col = 0 : ui32, offset = 0 : ui32} // CHECK: %[[TOKEN_0:.+]] = amdaie.npu.push_to_queue async {bd_id = 0 : ui32, channel = 0 : ui32, col = 0 : ui32, direction = 1 : i32, repeat_count = 2 : ui32, row = 0 : ui32} -// CHECK: amdaie.npu.dma_wait(%[[TOKEN_0]] : !amdaie.async_token) +// CHECK: amdaie.npu.tct_sync {channel = 0 : ui32, col = 0 : ui32, col_num = 1 : ui32, direction = 1 : i32, row = 0 : ui32, row_num = 1 : ui32} %6 = amdaie.npu.half_dma_cpy_nd async %4(%5[0, 0, 0, 0] [2, 4, 16, 16] [0, 64, 8, 1] bd_id = %bd_id_2 channel = %channel start_bd = %bd_id) : !amdaie.logicalobjectfifo> amdaie.npu.dma_wait(%6 : !amdaie.async_token) amdaie.end diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/controlcode_to_transaction.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/controlcode_to_transaction.mlir index fa83b2028..d43736be6 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/controlcode_to_transaction.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/controlcode_to_transaction.mlir @@ -143,7 +143,7 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} amdaie.workgroup { amdaie.controlcode { %0 = amdaie.npu.push_to_queue async {bd_id = 15 : ui32, channel = 0 : ui32, col = 2 : ui32, direction = 1 : i32, repeat_count = 256 : ui32, row = 0 : ui32} - amdaie.npu.dma_wait(%0 : !amdaie.async_token) + amdaie.npu.tct_sync {channel = 0 : ui32, col = 2 : ui32, col_num = 1 : ui32, direction = 1 : i32, row = 0 : ui32, row_num = 1 : ui32} amdaie.end } }