From ce810db11acaaf50247b22ffb068b690fe87dcb6 Mon Sep 17 00:00:00 2001 From: Yu-Zhewen Date: Thu, 5 Dec 2024 13:40:14 +0000 Subject: [PATCH 1/5] init commit --- .../AMDAIEControlCodeToTransaction.cpp | 6 +- .../Transforms/AMDAIESimplifyDmaWaits.cpp | 158 ++++++++++++ .../iree-amd-aie/Transforms/CMakeLists.txt | 1 + .../iree-amd-aie/Transforms/PassDetail.h | 1 + .../iree-amd-aie/Transforms/Passes.cpp | 1 + .../AMD-AIE/iree-amd-aie/Transforms/Passes.h | 3 + .../AMD-AIE/iree-amd-aie/Transforms/Passes.td | 6 + .../Transforms/test/CMakeLists.txt | 1 + .../test/controlcode_to_transaction.mlir | 4 +- .../Transforms/test/simplify_dma_waits.mlir | 224 ++++++++++++++++++ .../aie_runtime/iree_aie_runtime.cc | 7 + .../aie_runtime/iree_aie_runtime.h | 2 + 12 files changed, 409 insertions(+), 5 deletions(-) create mode 100644 compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESimplifyDmaWaits.cpp create mode 100644 compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/simplify_dma_waits.mlir diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEControlCodeToTransaction.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEControlCodeToTransaction.cpp index d92f23af9..35013fbcb 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEControlCodeToTransaction.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEControlCodeToTransaction.cpp @@ -220,9 +220,9 @@ LogicalResult convertOp(AMDAIE::NpuDmaWaitOp op, TransactionBuilder &builder) { LogicalResult convertOp(AMDAIE::NpuPushToQueueOp op, TransactionBuilder &builder) { uint32_t repeatCount = op.getRepeatCount() - 1; - if (failed(builder.appendPushToQueueOp(op.getCol(), op.getRow(), - op.getDirection(), op.getChannel(), - op.getBdId(), repeatCount, true))) { + if (failed(builder.appendPushToQueueOp( + op.getCol(), op.getRow(), op.getDirection(), op.getChannel(), + op.getBdId(), repeatCount, static_cast(op.getAsyncToken())))) { return failure(); } return success(); diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESimplifyDmaWaits.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESimplifyDmaWaits.cpp new file mode 100644 index 000000000..462fdc8f9 --- /dev/null +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESimplifyDmaWaits.cpp @@ -0,0 +1,158 @@ +// Copyright 2024 The IREE Authors +// +// Licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "iree-amd-aie/IR/AMDAIEOps.h" +#include "iree-amd-aie/Transforms/AMDAIEDmaUtils.h" +#include "iree-amd-aie/Transforms/AMDAIEUtils.h" +#include "iree-amd-aie/Transforms/Passes.h" +#include "iree-amd-aie/aie_runtime/iree_aie_runtime.h" +#include "mlir/IR/Iterators.h" +#define DEBUG_TYPE "iree-amdaie-simplify-dma-waits" + +namespace mlir::iree_compiler::AMDAIE { + +namespace { + +/// Traverses the control code in reverse, ensuring that for each connection, +/// only one DMA wait op is retained for every maximum queue size. +LogicalResult simplifyDmaWaits(AMDAIE::AMDAIEDeviceModel deviceModel, + AMDAIE::WorkgroupOp workgroupOp) { + IRRewriter rewriter(workgroupOp->getContext()); + std::vector waitOpsToErase; + DenseMap> connectionToBdIdQueues; + AMDAIE::ControlCodeOp controlCodeOp = workgroupOp.getControlCode(); + WalkResult res = controlCodeOp->walk( + [&](AMDAIE::NpuDmaWaitOp waitOp) { + bool toErase = true; + for (Value token : waitOp.getAsyncTokens()) { + if (auto npuHalfDmaCpyNdOp = + dyn_cast_if_present( + token.getDefiningOp())) { + // Retrieve the connection op. + std::optional maybeConnectionOp = + npuHalfDmaCpyNdOp.getConnectionOp(); + if (!maybeConnectionOp) { + npuHalfDmaCpyNdOp.emitOpError() + << "expected to operate on an `amdaie.connection`"; + return WalkResult::interrupt(); + } + AMDAIE::ConnectionOp connectionOp = maybeConnectionOp.value(); + // Retrieve the flow op. + std::optional maybeFlowOp = + maybeConnectionOp->getFlowOp(); + if (!maybeFlowOp) { + maybeConnectionOp->emitOpError() + << "expected to operate on an `amdaie.flow`"; + return WalkResult::interrupt(); + } + if (maybeFlowOp->getIsPacketFlow()) return WalkResult::advance(); + // Retrieve the BD ID op. + std::optional maybeBdIdOp = + npuHalfDmaCpyNdOp.getBdIdOp(); + if (!maybeBdIdOp) { + npuHalfDmaCpyNdOp.emitOpError() + << "must have a BD ID op to lower to " + "`amdaie.npu.write_bd`"; + return WalkResult::interrupt(); + } + AMDAIE::BdIdOp bdIdOp = maybeBdIdOp.value(); + // Retrieve the tile op. + AMDAIE::TileOp tileOp = dyn_cast_if_present( + bdIdOp.getTile().getDefiningOp()); + if (!tileOp) { + bdIdOp.emitOpError() << "must operate on an `amdaie.tile`"; + return WalkResult::interrupt(); + } + // Get the maximum queue size. + uint32_t col = getConstantIndexOrAssert(tileOp.getCol()); + uint32_t row = getConstantIndexOrAssert(tileOp.getRow()); + uint32_t maxQueueSize = deviceModel.getDmaMaxQueueSize(col, row); + // Keep wait op if reaches the maximum queue size or there is a + // duplicate BD ID. + uint32_t bdId = getConstantIndexOrAssert(bdIdOp.getValue()); + auto &bdIdQueue = connectionToBdIdQueues[connectionOp]; + if (bdIdQueue.size() >= maxQueueSize) bdIdQueue.clear(); + if (bdIdQueue.empty() || llvm::is_contained(bdIdQueue, bdId)) { + toErase = false; + bdIdQueue = {bdId}; + } else { + bdIdQueue.push_back(bdId); + } + } + } + // Erase later to avoid invalidating the iterator. + if (toErase) waitOpsToErase.push_back(waitOp); + return WalkResult::advance(); + }); + if (res.wasInterrupted()) return failure(); + + for (AMDAIE::NpuDmaWaitOp waitOp : waitOpsToErase) { + SmallVector asyncTokens(waitOp.getAsyncTokens()); + // Erase the wait op. + rewriter.eraseOp(waitOp); + for (Value token : asyncTokens) { + if (auto op = dyn_cast_if_present( + token.getDefiningOp())) { + if (op.use_empty()) { + rewriter.setInsertionPoint(op); + TypeRange resultTypeRange = TypeRange{}; + // Nullify the result to avoid issuing a token. + rewriter.create( + op.getLoc(), resultTypeRange, op.getConnection(), op.getInput(), + op.getMixedOffsets(), op.getMixedSizes(), op.getMixedStrides(), + op.getBdId(), op.getChannel()); + rewriter.eraseOp(op); + } + } + } + } + + return success(); +} + +class AMDAIESimplifyDmaWaitsPass + : public impl::AMDAIESimplifyDmaWaitsBase { + public: + void getDependentDialects(DialectRegistry ®istry) const override { + registry.insert(); + } + + AMDAIESimplifyDmaWaitsPass() = default; + AMDAIESimplifyDmaWaitsPass(const AMDAIESimplifyDmaWaitsPass &pass){}; + void runOnOperation() override; +}; + +void AMDAIESimplifyDmaWaitsPass::runOnOperation() { + Operation *parentOp = getOperation(); + + auto targetAttr = IREE::HAL::ExecutableTargetAttr::lookup(parentOp); + std::optional maybeDevice = getConfigAMDAIEDevice(targetAttr); + if (!maybeDevice) { + parentOp->emitOpError() + << "has no AMDAIEDevice in the target attribute configuration. This " + "device-specific information is required to simplify DMA wait " + "ops."; + return signalPassFailure(); + } + AMDAIE::AMDAIEDeviceModel deviceModel = + AMDAIE::getDeviceModel(maybeDevice.value()); + + WalkResult res = parentOp->walk([&](AMDAIE::WorkgroupOp workgroupOp) { + if (failed(simplifyDmaWaits(deviceModel, workgroupOp))) { + return WalkResult::interrupt(); + } + return WalkResult::advance(); + }); + if (res.wasInterrupted()) return signalPassFailure(); +} + +} // namespace + +std::unique_ptr createAMDAIESimplifyDmaWaitsPass() { + return std::make_unique(); +} + +} // namespace mlir::iree_compiler::AMDAIE \ No newline at end of file diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/CMakeLists.txt b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/CMakeLists.txt index 39e496baa..f545c33d4 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/CMakeLists.txt +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/CMakeLists.txt @@ -102,6 +102,7 @@ iree_cc_library( "AMDAIEPeelForLoop.cpp" "AMDAIEPropagateDataLayout.cpp" "AMDAIERemoveMemorySpace.cpp" + "AMDAIESimplifyDmaWaits.cpp" "AMDAIESinkIntoCore.cpp" "AMDAIESplitLogicalObjFifos.cpp" "AMDAIESplitLogicalObjFifosForConnectionReuse.cpp" diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h index f921f6d47..172658802 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h @@ -81,6 +81,7 @@ namespace mlir::iree_compiler::AMDAIE { #define GEN_PASS_DEF_AMDAIEPEELFORLOOP #define GEN_PASS_DEF_AMDAIEPROPAGATEDATALAYOUT #define GEN_PASS_DEF_AMDAIEREMOVEMEMORYSPACE +#define GEN_PASS_DEF_AMDAIESIMPLIFYDMAWAITS #define GEN_PASS_DEF_AMDAIESINKINTOCORE #define GEN_PASS_DEF_AMDAIESPLITLOGICALOBJFIFOS #define GEN_PASS_DEF_AMDAIESPLITLOGICALOBJFIFOSFORCONNECTIONREUSE diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp index becd1333b..932833632 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp @@ -665,6 +665,7 @@ void addAMDAIEObjectFifoLoweringPasses( passManager.addPass(createAMDAIEAssignPacketIdsPass()); passManager.addPass(createAMDAIENpuDmaToHalfDmaCpyNdPass()); + passManager.addPass(createAMDAIESimplifyDmaWaitsPass()); passManager.addPass(createAMDAIEControlCodeLoweringPass()); passManager.addPass(createAMDAIEControlCodeToTransactionPass()); diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h index 705cecedd..4b1614f90 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h @@ -279,6 +279,9 @@ std::unique_ptr createAMDAIEPeelForLoopPass( /// Create a pass to remove memory space annotation from all types. std::unique_ptr createAMDAIERemoveMemorySpacePass(); +/// Create a pass to remove redundant DMA wait operations. +std::unique_ptr createAMDAIESimplifyDmaWaitsPass(); + /// Create a pass to sink all dependencies into `amdaie.core` operations. std::unique_ptr createAMDAIESinkIntoCorePass(); diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td index cbf2e2279..ee419500c 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td @@ -663,6 +663,12 @@ def AMDAIERemoveMemorySpace : Pass<"iree-amdaie-remove-memoryspace"> { let constructor = "mlir::iree_compiler::AMDAIE::createAMDAIERemoveMemorySpacePass()"; } +def AMDAIESimplifyDmaWaits : + Pass<"iree-amdaie-simplify-dma-waits", ""> { + let summary = "Remove redundant dma wait operations in controlcode."; + let constructor = "mlir::iree_compiler::AMDAIE::createAMDAIESimplifyDmaWaitsPass()"; +} + def AMDAIESinkIntoCore : Pass<"iree-amdaie-sink-into-core", "ModuleOp"> { let summary = "Clone constants and other ops into amdaie.cores"; diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt index 2f8d8633f..c08a82daf 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt @@ -74,6 +74,7 @@ iree_lit_test_suite( "peel_for_loop.mlir" "propagate_data_layout.mlir" "remove_memory_space.mlir" + "simplify_dma_waits.mlir" "sink_into_core.mlir" "split_logicalobjfifos.mlir" "split_logicalobjfifos_for_connection_reuse.mlir" diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/controlcode_to_transaction.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/controlcode_to_transaction.mlir index f7704d4db..fa83b2028 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/controlcode_to_transaction.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/controlcode_to_transaction.mlir @@ -75,7 +75,7 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} // CHECK: 0x00000000 // CHECK: 0x0001D214 // CHECK: 0x00000000 -// CHECK: 0x80000000 +// CHECK: 0x00000000 // CHECK: 0x00000018 // CHECK-LABEL: @push_to_queue_default_values // CHECK: npu_instructions = dense_resource : tensor<10xui32> @@ -102,7 +102,7 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} // CHECK: 0x00000000 // CHECK: 0x0601D21C // CHECK: 0x00000000 -// CHECK: 0x803F0002 +// CHECK: 0x003F0002 // CHECK: 0x00000018 // CHECK-LABEL: @push_to_queue // CHECK: npu_instructions = dense_resource : tensor<10xui32> diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/simplify_dma_waits.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/simplify_dma_waits.mlir new file mode 100644 index 000000000..80d631653 --- /dev/null +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/simplify_dma_waits.mlir @@ -0,0 +1,224 @@ +// RUN: iree-opt --pass-pipeline="builtin.module(iree-amdaie-simplify-dma-waits)" --split-input-file --verify-diagnostics %s | FileCheck %s + +// expected-error @+1 {{op has no AMDAIEDevice in the target attribute configuration}} +module { + func.func @no_amdaie_device() { + amdaie.workgroup { + amdaie.controlcode { + amdaie.end + } + } + return + } +} + +// ----- + +// CHECK-LABEL: @no_ops +#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> +module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { + func.func @no_ops() { + amdaie.workgroup { + amdaie.controlcode { + amdaie.end + } + } + return + } +} + +// ----- + +// CHECK-LABEL: @simplify_dma_waits_same_bd_id +// CHECK: %[[OBJECT_FIFO_0:.+]] = amdaie.logicalobjectfifo.from_buffers +// CHECK: %[[CHANNEL_0:.+]] = amdaie.channel +// CHECK: %[[CHANNEL_1:.+]] = amdaie.channel +// CHECK: %[[CONNECTION:.+]] = amdaie.connection +// CHECK: %[[OBJECT_FIFO_1:.+]] = amdaie.logicalobjectfifo.from_memref +// CHECK: %[[BD_ID:.+]] = amdaie.bd_id +// CHECK: %[[TOKEN_0:.+]] = amdaie.npu.half_dma_cpy_nd async %[[CONNECTION]](%[[OBJECT_FIFO_1]] [] [] [] bd_id = %[[BD_ID]] channel = %[[CHANNEL_0]]) : !amdaie.logicalobjectfifo> +// CHECK: amdaie.npu.dma_wait(%[[TOKEN_0]] : !amdaie.async_token) +// CHECK: %[[TOKEN_1:.+]] = amdaie.npu.half_dma_cpy_nd async %[[CONNECTION]](%[[OBJECT_FIFO_1]] [] [] [] bd_id = %[[BD_ID]] channel = %[[CHANNEL_0]]) : !amdaie.logicalobjectfifo> +// CHECK: amdaie.npu.dma_wait(%[[TOKEN_1]] : !amdaie.async_token) +#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> +#pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect> +module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { + func.func @simplify_dma_waits_same_bd_id() { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + amdaie.workgroup { + %tile = amdaie.tile(%c0, %c1) + %tile_0 = amdaie.tile(%c0, %c0) + %buffer = amdaie.buffer(%tile) : memref<2048xi32, 1 : i32> + %buffer_1 = amdaie.buffer(%tile) : memref<2048xi32, 1 : i32> + %lock = amdaie.lock(%tile(4), 4) + %lock_2 = amdaie.lock(%tile(5), 0) + %0 = amdaie.logicalobjectfifo.from_buffers({%buffer, %buffer_1}, {%lock}, {%lock_2}) : memref<2048xi32, 1 : i32>, memref<2048xi32, 1 : i32> -> !amdaie.logicalobjectfifo, 2> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<64x32xi32> + %2 = amdaie.logicalobjectfifo.placeholder{%tile_0} : !amdaie.logicalobjectfifo> + %channel = amdaie.channel(%tile_0, 0, port_type = DMA, direction = MM2S) + %channel_3 = amdaie.channel(%tile, 0, port_type = DMA, direction = S2MM) + %3 = amdaie.flow({%channel} -> {%channel_3}) {is_packet_flow = false} + %4 = amdaie.connection(%0 {%channel_3}, %2 {%channel}, flow = %3) {connection_type = #amdaie} : (!amdaie.logicalobjectfifo, 2>, !amdaie.logicalobjectfifo>) + amdaie.controlcode { + %5 = amdaie.logicalobjectfifo.from_memref %1, {%tile_0} : memref<64x32xi32> -> !amdaie.logicalobjectfifo> + memref.assume_alignment %1, 64 : memref<64x32xi32> + %bd_id = amdaie.bd_id(%tile_0, %c0) + %6 = amdaie.npu.half_dma_cpy_nd async %4(%5 [] [] [] bd_id = %bd_id channel = %channel) : !amdaie.logicalobjectfifo> + amdaie.npu.dma_wait(%6 : !amdaie.async_token) + %7 = amdaie.npu.half_dma_cpy_nd async %4(%5 [] [] [] bd_id = %bd_id channel = %channel) : !amdaie.logicalobjectfifo> + amdaie.npu.dma_wait(%7 : !amdaie.async_token) + amdaie.end + } + } + return + } +} + +// ----- + +// CHECK-LABEL: @simplify_dma_waits_max_queue_size +// CHECK: %[[OBJECT_FIFO_0:.+]] = amdaie.logicalobjectfifo.from_buffers +// CHECK: %[[CHANNEL_0:.+]] = amdaie.channel +// CHECK: %[[CHANNEL_1:.+]] = amdaie.channel +// CHECK: %[[CONNECTION:.+]] = amdaie.connection +// CHECK: %[[OBJECT_FIFO_1:.+]] = amdaie.logicalobjectfifo.from_memref +// CHECK: %[[BD_ID_0:.+]] = amdaie.bd_id +// CHECK: %[[TOKEN_0:.+]] = amdaie.npu.half_dma_cpy_nd async %[[CONNECTION]](%[[OBJECT_FIFO_1]] [] [] [] bd_id = %[[BD_ID_0]] channel = %[[CHANNEL_0]]) : !amdaie.logicalobjectfifo> +// CHECK: amdaie.npu.dma_wait(%[[TOKEN_0]] : !amdaie.async_token) +// CHECK: %[[BD_ID_1:.+]] = amdaie.bd_id +// CHECK: amdaie.npu.half_dma_cpy_nd %[[CONNECTION]](%[[OBJECT_FIFO_1]] [] [] [] bd_id = %[[BD_ID_1]] channel = %[[CHANNEL_0]]) : !amdaie.logicalobjectfifo> +// CHECK: %[[BD_ID_2:.+]] = amdaie.bd_id +// CHECK: amdaie.npu.half_dma_cpy_nd %[[CONNECTION]](%[[OBJECT_FIFO_1]] [] [] [] bd_id = %[[BD_ID_2]] channel = %[[CHANNEL_0]]) : !amdaie.logicalobjectfifo> +// CHECK: %[[BD_ID_3:.+]] = amdaie.bd_id +// CHECK: amdaie.npu.half_dma_cpy_nd %[[CONNECTION]](%[[OBJECT_FIFO_1]] [] [] [] bd_id = %[[BD_ID_3]] channel = %[[CHANNEL_0]]) : !amdaie.logicalobjectfifo> +// CHECK: %[[BD_ID_4:.+]] = amdaie.bd_id +// CHECK: %[[TOKEN_1:.+]] = amdaie.npu.half_dma_cpy_nd async %[[CONNECTION]](%[[OBJECT_FIFO_1]] [] [] [] bd_id = %[[BD_ID_4]] channel = %[[CHANNEL_0]]) : !amdaie.logicalobjectfifo> +// CHECK: amdaie.npu.dma_wait(%[[TOKEN_1]] : !amdaie.async_token) +#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> +#pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect> +module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { + func.func @simplify_dma_waits_max_queue_size() { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c2 = arith.constant 2 : index + %c3 = arith.constant 3 : index + %c4 = arith.constant 4 : index + amdaie.workgroup { + %tile = amdaie.tile(%c0, %c1) + %tile_0 = amdaie.tile(%c0, %c0) + %buffer = amdaie.buffer(%tile) : memref<2048xi32, 1 : i32> + %buffer_1 = amdaie.buffer(%tile) : memref<2048xi32, 1 : i32> + %lock = amdaie.lock(%tile(4), 4) + %lock_2 = amdaie.lock(%tile(5), 0) + %0 = amdaie.logicalobjectfifo.from_buffers({%buffer, %buffer_1}, {%lock}, {%lock_2}) : memref<2048xi32, 1 : i32>, memref<2048xi32, 1 : i32> -> !amdaie.logicalobjectfifo, 2> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<64x32xi32> + %2 = amdaie.logicalobjectfifo.placeholder{%tile_0} : !amdaie.logicalobjectfifo> + %channel = amdaie.channel(%tile_0, 0, port_type = DMA, direction = MM2S) + %channel_3 = amdaie.channel(%tile, 0, port_type = DMA, direction = S2MM) + %3 = amdaie.flow({%channel} -> {%channel_3}) {is_packet_flow = false} + %4 = amdaie.connection(%0 {%channel_3}, %2 {%channel}, flow = %3) {connection_type = #amdaie} : (!amdaie.logicalobjectfifo, 2>, !amdaie.logicalobjectfifo>) + amdaie.controlcode { + %5 = amdaie.logicalobjectfifo.from_memref %1, {%tile_0} : memref<64x32xi32> -> !amdaie.logicalobjectfifo> + memref.assume_alignment %1, 64 : memref<64x32xi32> + %bd_id = amdaie.bd_id(%tile_0, %c0) + %6 = amdaie.npu.half_dma_cpy_nd async %4(%5 [] [] [] bd_id = %bd_id channel = %channel) : !amdaie.logicalobjectfifo> + amdaie.npu.dma_wait(%6 : !amdaie.async_token) + %bd_id_1 = amdaie.bd_id(%tile_0, %c1) + %7 = amdaie.npu.half_dma_cpy_nd async %4(%5 [] [] [] bd_id = %bd_id_1 channel = %channel) : !amdaie.logicalobjectfifo> + amdaie.npu.dma_wait(%7 : !amdaie.async_token) + %bd_id_2 = amdaie.bd_id(%tile_0, %c2) + %8 = amdaie.npu.half_dma_cpy_nd async %4(%5 [] [] [] bd_id = %bd_id_2 channel = %channel) : !amdaie.logicalobjectfifo> + amdaie.npu.dma_wait(%8 : !amdaie.async_token) + %bd_id_3 = amdaie.bd_id(%tile_0, %c3) + %9 = amdaie.npu.half_dma_cpy_nd async %4(%5 [] [] [] bd_id = %bd_id_3 channel = %channel) : !amdaie.logicalobjectfifo> + amdaie.npu.dma_wait(%9 : !amdaie.async_token) + %bd_id_4 = amdaie.bd_id(%tile_0, %c4) + %10 = amdaie.npu.half_dma_cpy_nd async %4(%5 [] [] [] bd_id = %bd_id_4 channel = %channel) : !amdaie.logicalobjectfifo> + amdaie.npu.dma_wait(%10 : !amdaie.async_token) + amdaie.end + } + } + return + } +} + +// ----- + +// CHECK-LABEL: @simplify_dma_waits_two_connections +// CHECK: %[[OBJECT_FIFO_0:.+]] = amdaie.logicalobjectfifo.from_buffers +// CHECK: %[[OBJECT_FIFO_1:.+]] = amdaie.logicalobjectfifo.from_buffers +// CHECK: %[[CHANNEL_0:.+]] = amdaie.channel +// CHECK: %[[CHANNEL_1:.+]] = amdaie.channel +// CHECK: %[[CHANNEL_2:.+]] = amdaie.channel +// CHECK: %[[CHANNEL_3:.+]] = amdaie.channel +// CHECK: %[[CONNECTION_0:.+]] = amdaie.connection +// CHECK: %[[CONNECTION_1:.+]] = amdaie.connection +// CHECK: %[[OBJECT_FIFO_2:.+]] = amdaie.logicalobjectfifo.from_memref +// CHECK: %[[OBJECT_FIFO_3:.+]] = amdaie.logicalobjectfifo.from_memref +// CHECK: %[[BD_ID_0:.+]] = amdaie.bd_id +// CHECK: amdaie.npu.half_dma_cpy_nd %[[CONNECTION_0]](%[[OBJECT_FIFO_2]] [] [] [] bd_id = %[[BD_ID_0]] channel = %[[CHANNEL_0]]) : !amdaie.logicalobjectfifo> +// CHECK: %[[BD_ID_1:.+]] = amdaie.bd_id +// CHECK: amdaie.npu.half_dma_cpy_nd %[[CONNECTION_1]](%[[OBJECT_FIFO_3]] [] [] [] bd_id = %[[BD_ID_1]] channel = %[[CHANNEL_2]]) : !amdaie.logicalobjectfifo> +// CHECK: %[[BD_ID_2:.+]] = amdaie.bd_id +// CHECK: %[[TOKEN_0:.+]] = amdaie.npu.half_dma_cpy_nd async %[[CONNECTION_0]](%[[OBJECT_FIFO_2]] [] [] [] bd_id = %[[BD_ID_2]] channel = %[[CHANNEL_0]]) : !amdaie.logicalobjectfifo> +// CHECK: amdaie.npu.dma_wait(%[[TOKEN_0]] : !amdaie.async_token) +// CHECK: %[[BD_ID_3:.+]] = amdaie.bd_id +// CHECK: %[[TOKEN_1:.+]] = amdaie.npu.half_dma_cpy_nd async %[[CONNECTION_1]](%[[OBJECT_FIFO_3]] [] [] [] bd_id = %[[BD_ID_3]] channel = %[[CHANNEL_2]]) : !amdaie.logicalobjectfifo> +// CHECK: amdaie.npu.dma_wait(%[[TOKEN_1]] : !amdaie.async_token) +#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> +#pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect> +module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { + func.func @simplify_dma_waits_two_connections() { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c2 = arith.constant 2 : index + %c3 = arith.constant 3 : index + amdaie.workgroup { + %tile = amdaie.tile(%c0, %c1) + %tile_0 = amdaie.tile(%c0, %c0) + %buffer = amdaie.buffer(%tile) : memref<2048xi32, 1 : i32> + %buffer_1 = amdaie.buffer(%tile) : memref<2048xi32, 1 : i32> + %buffer_2 = amdaie.buffer(%tile) : memref<2048xi32, 1 : i32> + %buffer_3 = amdaie.buffer(%tile) : memref<2048xi32, 1 : i32> + %lock = amdaie.lock(%tile(4), 4) + %lock_4 = amdaie.lock(%tile(5), 0) + %lock_5 = amdaie.lock(%tile(6), 4) + %lock_6 = amdaie.lock(%tile(7), 0) + %0 = amdaie.logicalobjectfifo.from_buffers({%buffer, %buffer_1}, {%lock}, {%lock_4}) : memref<2048xi32, 1 : i32>, memref<2048xi32, 1 : i32> -> !amdaie.logicalobjectfifo, 2> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<64x32xi32> + %2 = amdaie.logicalobjectfifo.placeholder{%tile_0} : !amdaie.logicalobjectfifo> + %3 = amdaie.logicalobjectfifo.from_buffers({%buffer_2, %buffer_3}, {%lock_5}, {%lock_6}) : memref<2048xi32, 1 : i32>, memref<2048xi32, 1 : i32> -> !amdaie.logicalobjectfifo, 2> + %4 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<64x32xi32> + %5 = amdaie.logicalobjectfifo.placeholder{%tile_0} : !amdaie.logicalobjectfifo> + %channel = amdaie.channel(%tile_0, 0, port_type = DMA, direction = MM2S) + %channel_7 = amdaie.channel(%tile_0, 1, port_type = DMA, direction = MM2S) + %channel_8 = amdaie.channel(%tile, 0, port_type = DMA, direction = S2MM) + %channel_9 = amdaie.channel(%tile, 1, port_type = DMA, direction = S2MM) + %6 = amdaie.flow({%channel} -> {%channel_7}) {is_packet_flow = false} + %7 = amdaie.flow({%channel_8} -> {%channel_9}) {is_packet_flow = false} + %8 = amdaie.connection(%0 {%channel_7}, %2 {%channel}, flow = %6) {connection_type = #amdaie} : (!amdaie.logicalobjectfifo, 2>, !amdaie.logicalobjectfifo>) + %9 = amdaie.connection(%3 {%channel_9}, %5 {%channel_8}, flow = %7) {connection_type = #amdaie} : (!amdaie.logicalobjectfifo, 2>, !amdaie.logicalobjectfifo>) + amdaie.controlcode { + %10 = amdaie.logicalobjectfifo.from_memref %1, {%tile_0} : memref<64x32xi32> -> !amdaie.logicalobjectfifo> + memref.assume_alignment %1, 64 : memref<64x32xi32> + %11 = amdaie.logicalobjectfifo.from_memref %4, {%tile_0} : memref<64x32xi32> -> !amdaie.logicalobjectfifo> + memref.assume_alignment %4, 64 : memref<64x32xi32> + %bd_id = amdaie.bd_id(%tile_0, %c0) + %12 = amdaie.npu.half_dma_cpy_nd async %8(%10 [] [] [] bd_id = %bd_id channel = %channel) : !amdaie.logicalobjectfifo> + amdaie.npu.dma_wait(%12 : !amdaie.async_token) + %bd_id_1 = amdaie.bd_id(%tile_0, %c1) + %13 = amdaie.npu.half_dma_cpy_nd async %9(%11 [] [] [] bd_id = %bd_id_1 channel = %channel_8) : !amdaie.logicalobjectfifo> + amdaie.npu.dma_wait(%13 : !amdaie.async_token) + %bd_id_2 = amdaie.bd_id(%tile_0, %c2) + %14 = amdaie.npu.half_dma_cpy_nd async %8(%10 [] [] [] bd_id = %bd_id_2 channel = %channel) : !amdaie.logicalobjectfifo> + amdaie.npu.dma_wait(%14 : !amdaie.async_token) + %bd_id_3 = amdaie.bd_id(%tile_0, %c3) + %15 = amdaie.npu.half_dma_cpy_nd async %9(%11 [] [] [] bd_id = %bd_id_3 channel = %channel_8) : !amdaie.logicalobjectfifo> + amdaie.npu.dma_wait(%15 : !amdaie.async_token) + amdaie.end + } + } + return + } +} diff --git a/runtime/src/iree-amd-aie/aie_runtime/iree_aie_runtime.cc b/runtime/src/iree-amd-aie/aie_runtime/iree_aie_runtime.cc index a29fe0898..dd7badbcb 100644 --- a/runtime/src/iree-amd-aie/aie_runtime/iree_aie_runtime.cc +++ b/runtime/src/iree-amd-aie/aie_runtime/iree_aie_runtime.cc @@ -495,6 +495,13 @@ AMDAIEDeviceModel::getChannelToValidBdIds(AMDAIETileType tileType) const { llvm::report_fatal_error("Unhandled AMDAIETileType case"); } +uint8_t AMDAIEDeviceModel::getDmaMaxQueueSize(uint8_t col, uint8_t row) { + uint8_t maxQueueSize = 0; + TRY_XAIE_API_FATAL_ERROR(XAie_DmaGetMaxQueueSize, &devInst, + XAie_TileLoc(col, row), &maxQueueSize); + return maxQueueSize; +} + struct AMDAIEDeviceModel getDeviceModel(AMDAIEDevice device) { switch (device) { case AMDAIEDevice::xcvc1902: { diff --git a/runtime/src/iree-amd-aie/aie_runtime/iree_aie_runtime.h b/runtime/src/iree-amd-aie/aie_runtime/iree_aie_runtime.h index 144e1dc62..81cafa118 100644 --- a/runtime/src/iree-amd-aie/aie_runtime/iree_aie_runtime.h +++ b/runtime/src/iree-amd-aie/aie_runtime/iree_aie_runtime.h @@ -375,6 +375,8 @@ struct AMDAIEDeviceModel { DenseMap> getChannelToValidBdIds( AMDAIETileType tileType) const; + uint8_t getDmaMaxQueueSize(uint8_t col, uint8_t row); + AMDAIEDevice device; // mlir-air legacy From d59184e7877cca5ecff74d0eac06d930a2c9c80a Mon Sep 17 00:00:00 2001 From: Yu-Zhewen Date: Thu, 5 Dec 2024 17:28:58 +0000 Subject: [PATCH 2/5] bugfix and resolve comments --- ...ifyDmaWaits.cpp => AMDAIEFoldDmaWaits.cpp} | 58 ++++++++++--------- .../iree-amd-aie/Transforms/CMakeLists.txt | 2 +- .../iree-amd-aie/Transforms/PassDetail.h | 2 +- .../iree-amd-aie/Transforms/Passes.cpp | 2 +- .../AMD-AIE/iree-amd-aie/Transforms/Passes.h | 6 +- .../AMD-AIE/iree-amd-aie/Transforms/Passes.td | 12 ++-- .../Transforms/test/CMakeLists.txt | 2 +- ...ify_dma_waits.mlir => fold_dma_waits.mlir} | 14 ++--- 8 files changed, 52 insertions(+), 46 deletions(-) rename compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/{AMDAIESimplifyDmaWaits.cpp => AMDAIEFoldDmaWaits.cpp} (74%) rename compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/{simplify_dma_waits.mlir => fold_dma_waits.mlir} (97%) diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESimplifyDmaWaits.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEFoldDmaWaits.cpp similarity index 74% rename from compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESimplifyDmaWaits.cpp rename to compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEFoldDmaWaits.cpp index 462fdc8f9..c5705f0d9 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESimplifyDmaWaits.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEFoldDmaWaits.cpp @@ -10,7 +10,7 @@ #include "iree-amd-aie/Transforms/Passes.h" #include "iree-amd-aie/aie_runtime/iree_aie_runtime.h" #include "mlir/IR/Iterators.h" -#define DEBUG_TYPE "iree-amdaie-simplify-dma-waits" +#define DEBUG_TYPE "iree-amdaie-fold-dma-waits" namespace mlir::iree_compiler::AMDAIE { @@ -18,12 +18,13 @@ namespace { /// Traverses the control code in reverse, ensuring that for each connection, /// only one DMA wait op is retained for every maximum queue size. -LogicalResult simplifyDmaWaits(AMDAIE::AMDAIEDeviceModel deviceModel, - AMDAIE::WorkgroupOp workgroupOp) { - IRRewriter rewriter(workgroupOp->getContext()); +LogicalResult foldDmaWaits(AMDAIE::AMDAIEDeviceModel deviceModel, + AMDAIE::ControlCodeOp controlCodeOp) { + IRRewriter rewriter(controlCodeOp->getContext()); std::vector waitOpsToErase; - DenseMap> connectionToBdIdQueues; - AMDAIE::ControlCodeOp controlCodeOp = workgroupOp.getControlCode(); + DenseMap, + SmallVector> + tileConnectionToBdIdQueueMap; WalkResult res = controlCodeOp->walk( [&](AMDAIE::NpuDmaWaitOp waitOp) { bool toErase = true; @@ -70,17 +71,21 @@ LogicalResult simplifyDmaWaits(AMDAIE::AMDAIEDeviceModel deviceModel, uint32_t col = getConstantIndexOrAssert(tileOp.getCol()); uint32_t row = getConstantIndexOrAssert(tileOp.getRow()); uint32_t maxQueueSize = deviceModel.getDmaMaxQueueSize(col, row); - // Keep wait op if reaches the maximum queue size or there is a - // duplicate BD ID. + // Keep wait op if, either reaches the maximum queue size, or there + // is a duplicate BD ID in the same tile. uint32_t bdId = getConstantIndexOrAssert(bdIdOp.getValue()); - auto &bdIdQueue = connectionToBdIdQueues[connectionOp]; - if (bdIdQueue.size() >= maxQueueSize) bdIdQueue.clear(); - if (bdIdQueue.empty() || llvm::is_contained(bdIdQueue, bdId)) { - toErase = false; - bdIdQueue = {bdId}; - } else { - bdIdQueue.push_back(bdId); - } + bool isDuplicateBdId = llvm::any_of( + tileConnectionToBdIdQueueMap, [&](const auto &entry) { + return entry.first.first == tileOp && + llvm::is_contained(entry.second, bdId); + }); + SmallVector &bdIdQueue = + tileConnectionToBdIdQueueMap[std::make_pair(tileOp, + connectionOp)]; + if (bdIdQueue.size() >= maxQueueSize || isDuplicateBdId) + bdIdQueue.clear(); + if (bdIdQueue.empty()) toErase = false; + bdIdQueue.push_back(bdId); } } // Erase later to avoid invalidating the iterator. @@ -113,19 +118,19 @@ LogicalResult simplifyDmaWaits(AMDAIE::AMDAIEDeviceModel deviceModel, return success(); } -class AMDAIESimplifyDmaWaitsPass - : public impl::AMDAIESimplifyDmaWaitsBase { +class AMDAIEFoldDmaWaitsPass + : public impl::AMDAIEFoldDmaWaitsBase { public: void getDependentDialects(DialectRegistry ®istry) const override { registry.insert(); } - AMDAIESimplifyDmaWaitsPass() = default; - AMDAIESimplifyDmaWaitsPass(const AMDAIESimplifyDmaWaitsPass &pass){}; + AMDAIEFoldDmaWaitsPass() = default; + AMDAIEFoldDmaWaitsPass(const AMDAIEFoldDmaWaitsPass &pass){}; void runOnOperation() override; }; -void AMDAIESimplifyDmaWaitsPass::runOnOperation() { +void AMDAIEFoldDmaWaitsPass::runOnOperation() { Operation *parentOp = getOperation(); auto targetAttr = IREE::HAL::ExecutableTargetAttr::lookup(parentOp); @@ -133,7 +138,7 @@ void AMDAIESimplifyDmaWaitsPass::runOnOperation() { if (!maybeDevice) { parentOp->emitOpError() << "has no AMDAIEDevice in the target attribute configuration. This " - "device-specific information is required to simplify DMA wait " + "device-specific information is required to fold DMA wait " "ops."; return signalPassFailure(); } @@ -141,7 +146,8 @@ void AMDAIESimplifyDmaWaitsPass::runOnOperation() { AMDAIE::getDeviceModel(maybeDevice.value()); WalkResult res = parentOp->walk([&](AMDAIE::WorkgroupOp workgroupOp) { - if (failed(simplifyDmaWaits(deviceModel, workgroupOp))) { + AMDAIE::ControlCodeOp controlCodeOp = workgroupOp.getControlCode(); + if (failed(foldDmaWaits(deviceModel, controlCodeOp))) { return WalkResult::interrupt(); } return WalkResult::advance(); @@ -151,8 +157,8 @@ void AMDAIESimplifyDmaWaitsPass::runOnOperation() { } // namespace -std::unique_ptr createAMDAIESimplifyDmaWaitsPass() { - return std::make_unique(); +std::unique_ptr createAMDAIEFoldDmaWaitsPass() { + return std::make_unique(); } -} // namespace mlir::iree_compiler::AMDAIE \ No newline at end of file +} // namespace mlir::iree_compiler::AMDAIE diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/CMakeLists.txt b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/CMakeLists.txt index f545c33d4..98fe292fd 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/CMakeLists.txt +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/CMakeLists.txt @@ -75,6 +75,7 @@ iree_cc_library( "AMDAIEDmaUtils.cpp" "AMDAIEFlattenLogicalObjectFifo.cpp" "AMDAIELinalgFunctionOutlining.cpp" + "AMDAIEFoldDmaWaits.cpp" "AMDAIEFuseConsumerIntoLoop.cpp" "AMDAIEFuseFillIntoForall.cpp" "AMDAIEFusePackIntoLoop.cpp" @@ -102,7 +103,6 @@ iree_cc_library( "AMDAIEPeelForLoop.cpp" "AMDAIEPropagateDataLayout.cpp" "AMDAIERemoveMemorySpace.cpp" - "AMDAIESimplifyDmaWaits.cpp" "AMDAIESinkIntoCore.cpp" "AMDAIESplitLogicalObjFifos.cpp" "AMDAIESplitLogicalObjFifosForConnectionReuse.cpp" diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h index 172658802..6cdf14d1b 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h @@ -52,6 +52,7 @@ namespace mlir::iree_compiler::AMDAIE { #define GEN_PASS_DEF_AMDAIEDMATOCIRCULARDMA #define GEN_PASS_DEF_AMDAIEFLATTENLOGICALOBJECTFIFO #define GEN_PASS_DEF_AMDAIELINALGFUNCTIONOUTLINING +#define GEN_PASS_DEF_AMDAIEFOLDDMAWAITS #define GEN_PASS_DEF_AMDAIEFUSECONSUMERINTOLOOP #define GEN_PASS_DEF_AMDAIEFUSEFILLINTOFORALL #define GEN_PASS_DEF_AMDAIEFUSEPACKINTOLOOP @@ -81,7 +82,6 @@ namespace mlir::iree_compiler::AMDAIE { #define GEN_PASS_DEF_AMDAIEPEELFORLOOP #define GEN_PASS_DEF_AMDAIEPROPAGATEDATALAYOUT #define GEN_PASS_DEF_AMDAIEREMOVEMEMORYSPACE -#define GEN_PASS_DEF_AMDAIESIMPLIFYDMAWAITS #define GEN_PASS_DEF_AMDAIESINKINTOCORE #define GEN_PASS_DEF_AMDAIESPLITLOGICALOBJFIFOS #define GEN_PASS_DEF_AMDAIESPLITLOGICALOBJFIFOSFORCONNECTIONREUSE diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp index 932833632..c8964fc90 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp @@ -665,7 +665,7 @@ void addAMDAIEObjectFifoLoweringPasses( passManager.addPass(createAMDAIEAssignPacketIdsPass()); passManager.addPass(createAMDAIENpuDmaToHalfDmaCpyNdPass()); - passManager.addPass(createAMDAIESimplifyDmaWaitsPass()); + passManager.addPass(createAMDAIEFoldDmaWaitsPass()); passManager.addPass(createAMDAIEControlCodeLoweringPass()); passManager.addPass(createAMDAIEControlCodeToTransactionPass()); diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h index 4b1614f90..6609be0b6 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h @@ -204,6 +204,9 @@ std::unique_ptr createAMDAIEHoistLogicalObjFifoPass(); std::unique_ptr createAMDAIEInsertLoopsForVectorizationPass( AMDAIEInsertLoopsForVectorizationOptions options = {}); +/// Create a pass to remove redundant DMA wait operations. +std::unique_ptr createAMDAIEFoldDmaWaitsPass(); + /// Create a pass to fuse the pack operations into the for loops. std::unique_ptr createAMDAIEFusePackIntoLoopPass( AMDAIEFusePackIntoLoopOptions options = {}); @@ -279,9 +282,6 @@ std::unique_ptr createAMDAIEPeelForLoopPass( /// Create a pass to remove memory space annotation from all types. std::unique_ptr createAMDAIERemoveMemorySpacePass(); -/// Create a pass to remove redundant DMA wait operations. -std::unique_ptr createAMDAIESimplifyDmaWaitsPass(); - /// Create a pass to sink all dependencies into `amdaie.core` operations. std::unique_ptr createAMDAIESinkIntoCorePass(); diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td index ee419500c..286aece65 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td @@ -284,6 +284,12 @@ def AMDAIELinalgFunctionOutlining : let constructor = "mlir::iree_compiler::AMDAIE::createAMDAIELinalgFunctionOutliningPass()"; } +def AMDAIEFoldDmaWaits : + Pass<"iree-amdaie-fold-dma-waits", ""> { + let summary = "Remove redundant dma wait operations in controlcode."; + let constructor = "mlir::iree_compiler::AMDAIE::createAMDAIEFoldDmaWaitsPass()"; +} + def AMDAIEFuseConsumerIntoLoop : InterfacePass<"iree-amdaie-fuse-consumer-into-loop", "mlir::FunctionOpInterface"> { let summary = "Fuse the consumer operation into the innermost last scf loop."; @@ -663,12 +669,6 @@ def AMDAIERemoveMemorySpace : Pass<"iree-amdaie-remove-memoryspace"> { let constructor = "mlir::iree_compiler::AMDAIE::createAMDAIERemoveMemorySpacePass()"; } -def AMDAIESimplifyDmaWaits : - Pass<"iree-amdaie-simplify-dma-waits", ""> { - let summary = "Remove redundant dma wait operations in controlcode."; - let constructor = "mlir::iree_compiler::AMDAIE::createAMDAIESimplifyDmaWaitsPass()"; -} - def AMDAIESinkIntoCore : Pass<"iree-amdaie-sink-into-core", "ModuleOp"> { let summary = "Clone constants and other ops into amdaie.cores"; diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt index c08a82daf..702217ede 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt @@ -40,6 +40,7 @@ iree_lit_test_suite( "dma_loop_subsumption_circular.mlir" "dma_loop_subsumption.mlir" "dma_to_circular_dma.mlir" + "fold_dma_waits.mlir" "flatten_logical_objectfifo.mlir" "linalg_function_outlining.mlir" "fuse_consumer_into_loop.mlir" @@ -74,7 +75,6 @@ iree_lit_test_suite( "peel_for_loop.mlir" "propagate_data_layout.mlir" "remove_memory_space.mlir" - "simplify_dma_waits.mlir" "sink_into_core.mlir" "split_logicalobjfifos.mlir" "split_logicalobjfifos_for_connection_reuse.mlir" diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/simplify_dma_waits.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/fold_dma_waits.mlir similarity index 97% rename from compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/simplify_dma_waits.mlir rename to compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/fold_dma_waits.mlir index 80d631653..32cf42d4b 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/simplify_dma_waits.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/fold_dma_waits.mlir @@ -1,4 +1,4 @@ -// RUN: iree-opt --pass-pipeline="builtin.module(iree-amdaie-simplify-dma-waits)" --split-input-file --verify-diagnostics %s | FileCheck %s +// RUN: iree-opt --pass-pipeline="builtin.module(iree-amdaie-fold-dma-waits)" --split-input-file --verify-diagnostics %s | FileCheck %s // expected-error @+1 {{op has no AMDAIEDevice in the target attribute configuration}} module { @@ -29,7 +29,7 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} // ----- -// CHECK-LABEL: @simplify_dma_waits_same_bd_id +// CHECK-LABEL: @fold_dma_waits_same_bd_id // CHECK: %[[OBJECT_FIFO_0:.+]] = amdaie.logicalobjectfifo.from_buffers // CHECK: %[[CHANNEL_0:.+]] = amdaie.channel // CHECK: %[[CHANNEL_1:.+]] = amdaie.channel @@ -43,7 +43,7 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} #executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> #pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect> module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { - func.func @simplify_dma_waits_same_bd_id() { + func.func @fold_dma_waits_same_bd_id() { %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index amdaie.workgroup { @@ -77,7 +77,7 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} // ----- -// CHECK-LABEL: @simplify_dma_waits_max_queue_size +// CHECK-LABEL: @fold_dma_waits_max_queue_size // CHECK: %[[OBJECT_FIFO_0:.+]] = amdaie.logicalobjectfifo.from_buffers // CHECK: %[[CHANNEL_0:.+]] = amdaie.channel // CHECK: %[[CHANNEL_1:.+]] = amdaie.channel @@ -98,7 +98,7 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} #executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> #pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect> module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { - func.func @simplify_dma_waits_max_queue_size() { + func.func @fold_dma_waits_max_queue_size() { %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index @@ -145,7 +145,7 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} // ----- -// CHECK-LABEL: @simplify_dma_waits_two_connections +// CHECK-LABEL: @fold_dma_waits_two_connections // CHECK: %[[OBJECT_FIFO_0:.+]] = amdaie.logicalobjectfifo.from_buffers // CHECK: %[[OBJECT_FIFO_1:.+]] = amdaie.logicalobjectfifo.from_buffers // CHECK: %[[CHANNEL_0:.+]] = amdaie.channel @@ -169,7 +169,7 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} #executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> #pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect> module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { - func.func @simplify_dma_waits_two_connections() { + func.func @fold_dma_waits_two_connections() { %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index From 5f247c5ea4075fb73a73557c60510023fac75ec8 Mon Sep 17 00:00:00 2001 From: Yu-Zhewen Date: Fri, 6 Dec 2024 10:25:45 +0000 Subject: [PATCH 3/5] resolve comments --- .../Transforms/AMDAIEFoldDmaWaits.cpp | 9 ++++++--- .../Transforms/test/fold_dma_waits.mlir | 6 ++++++ .../iree-amd-aie/aie_runtime/iree_aie_runtime.cc | 15 ++++++++------- .../iree-amd-aie/aie_runtime/iree_aie_runtime.h | 4 ++-- 4 files changed, 22 insertions(+), 12 deletions(-) diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEFoldDmaWaits.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEFoldDmaWaits.cpp index c5705f0d9..ce0536cc7 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEFoldDmaWaits.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEFoldDmaWaits.cpp @@ -18,13 +18,14 @@ namespace { /// Traverses the control code in reverse, ensuring that for each connection, /// only one DMA wait op is retained for every maximum queue size. -LogicalResult foldDmaWaits(AMDAIE::AMDAIEDeviceModel deviceModel, +LogicalResult foldDmaWaits(const AMDAIE::AMDAIEDeviceModel &deviceModel, AMDAIE::ControlCodeOp controlCodeOp) { IRRewriter rewriter(controlCodeOp->getContext()); std::vector waitOpsToErase; DenseMap, SmallVector> tileConnectionToBdIdQueueMap; + // Traverse the control code in reverse. WalkResult res = controlCodeOp->walk( [&](AMDAIE::NpuDmaWaitOp waitOp) { bool toErase = true; @@ -49,7 +50,7 @@ LogicalResult foldDmaWaits(AMDAIE::AMDAIEDeviceModel deviceModel, << "expected to operate on an `amdaie.flow`"; return WalkResult::interrupt(); } - if (maybeFlowOp->getIsPacketFlow()) return WalkResult::advance(); + bool isPacketFlow = maybeFlowOp->getIsPacketFlow(); // Retrieve the BD ID op. std::optional maybeBdIdOp = npuHalfDmaCpyNdOp.getBdIdOp(); @@ -82,8 +83,10 @@ LogicalResult foldDmaWaits(AMDAIE::AMDAIEDeviceModel deviceModel, SmallVector &bdIdQueue = tileConnectionToBdIdQueueMap[std::make_pair(tileOp, connectionOp)]; - if (bdIdQueue.size() >= maxQueueSize || isDuplicateBdId) + if (isDuplicateBdId || isPacketFlow || + bdIdQueue.size() >= maxQueueSize) { bdIdQueue.clear(); + } if (bdIdQueue.empty()) toErase = false; bdIdQueue.push_back(bdId); } diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/fold_dma_waits.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/fold_dma_waits.mlir index 32cf42d4b..a3f74d20e 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/fold_dma_waits.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/fold_dma_waits.mlir @@ -29,6 +29,7 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} // ----- +// Expect no DMA waits to be folded, since the same BD ID is used. // CHECK-LABEL: @fold_dma_waits_same_bd_id // CHECK: %[[OBJECT_FIFO_0:.+]] = amdaie.logicalobjectfifo.from_buffers // CHECK: %[[CHANNEL_0:.+]] = amdaie.channel @@ -77,6 +78,9 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} // ----- +// DMA queue has a maximum size of 4. To optimize, starting from +// the end of the control code, retain every 4th DMA wait operation +// while folding the others. // CHECK-LABEL: @fold_dma_waits_max_queue_size // CHECK: %[[OBJECT_FIFO_0:.+]] = amdaie.logicalobjectfifo.from_buffers // CHECK: %[[CHANNEL_0:.+]] = amdaie.channel @@ -145,6 +149,8 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} // ----- +// Two circuit connections are used, corresponding to two separate channels. +// Each channel operates with its own independent queue. // CHECK-LABEL: @fold_dma_waits_two_connections // CHECK: %[[OBJECT_FIFO_0:.+]] = amdaie.logicalobjectfifo.from_buffers // CHECK: %[[OBJECT_FIFO_1:.+]] = amdaie.logicalobjectfifo.from_buffers diff --git a/runtime/src/iree-amd-aie/aie_runtime/iree_aie_runtime.cc b/runtime/src/iree-amd-aie/aie_runtime/iree_aie_runtime.cc index dd7badbcb..f2ab3d11a 100644 --- a/runtime/src/iree-amd-aie/aie_runtime/iree_aie_runtime.cc +++ b/runtime/src/iree-amd-aie/aie_runtime/iree_aie_runtime.cc @@ -225,6 +225,14 @@ bool AMDAIEDeviceModel::isShimTile(uint8_t col, uint8_t row) const { return row == configPtr.ShimRowNum; } +uint8_t AMDAIEDeviceModel::getDmaMaxQueueSize(uint8_t col, uint8_t row) const { + uint8_t maxQueueSize = 0; + TRY_XAIE_API_FATAL_ERROR(XAie_DmaGetMaxQueueSize, + const_cast(&devInst), + XAie_TileLoc(col, row), &maxQueueSize); + return maxQueueSize; +} + // TODO(max): these should be optionals instead of returning 0. uint32_t AMDAIEDeviceModel::getNumLocks(uint8_t col, uint8_t row) const { AMDAIETileType tileType = getTileType(col, row); @@ -495,13 +503,6 @@ AMDAIEDeviceModel::getChannelToValidBdIds(AMDAIETileType tileType) const { llvm::report_fatal_error("Unhandled AMDAIETileType case"); } -uint8_t AMDAIEDeviceModel::getDmaMaxQueueSize(uint8_t col, uint8_t row) { - uint8_t maxQueueSize = 0; - TRY_XAIE_API_FATAL_ERROR(XAie_DmaGetMaxQueueSize, &devInst, - XAie_TileLoc(col, row), &maxQueueSize); - return maxQueueSize; -} - struct AMDAIEDeviceModel getDeviceModel(AMDAIEDevice device) { switch (device) { case AMDAIEDevice::xcvc1902: { diff --git a/runtime/src/iree-amd-aie/aie_runtime/iree_aie_runtime.h b/runtime/src/iree-amd-aie/aie_runtime/iree_aie_runtime.h index 81cafa118..e365b554e 100644 --- a/runtime/src/iree-amd-aie/aie_runtime/iree_aie_runtime.h +++ b/runtime/src/iree-amd-aie/aie_runtime/iree_aie_runtime.h @@ -312,6 +312,8 @@ struct AMDAIEDeviceModel { return *((const T *)(dmaBdMod + static_cast(dmaBdProp))); } + uint8_t getDmaMaxQueueSize(uint8_t col, uint8_t row) const; + uint32_t getNumLocks(uint8_t col, uint8_t row) const; std::optional getMemWest(TileLoc src) const; @@ -375,8 +377,6 @@ struct AMDAIEDeviceModel { DenseMap> getChannelToValidBdIds( AMDAIETileType tileType) const; - uint8_t getDmaMaxQueueSize(uint8_t col, uint8_t row); - AMDAIEDevice device; // mlir-air legacy From f820d0831cb5a50e6747f5878e685a0e6ea09fcb Mon Sep 17 00:00:00 2001 From: Yu-Zhewen Date: Fri, 6 Dec 2024 11:43:26 +0000 Subject: [PATCH 4/5] fix merge --- .../AMD-AIE/iree-amd-aie/Transforms/AMDAIEFoldDmaWaits.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEFoldDmaWaits.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEFoldDmaWaits.cpp index ce0536cc7..775a7b9a9 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEFoldDmaWaits.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEFoldDmaWaits.cpp @@ -5,9 +5,9 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception #include "iree-amd-aie/IR/AMDAIEOps.h" -#include "iree-amd-aie/Transforms/AMDAIEDmaUtils.h" -#include "iree-amd-aie/Transforms/AMDAIEUtils.h" #include "iree-amd-aie/Transforms/Passes.h" +#include "iree-amd-aie/Transforms/Utils/AMDAIEDmaUtils.h" +#include "iree-amd-aie/Transforms/Utils/AMDAIEUtils.h" #include "iree-amd-aie/aie_runtime/iree_aie_runtime.h" #include "mlir/IR/Iterators.h" #define DEBUG_TYPE "iree-amdaie-fold-dma-waits" From 6fedb1dddd3c9cd13eda0ab4fcfb96b34a5d66b6 Mon Sep 17 00:00:00 2001 From: Yu-Zhewen Date: Mon, 9 Dec 2024 20:24:37 +0000 Subject: [PATCH 5/5] resolve comments --- .../Transforms/AMDAIEFoldDmaWaits.cpp | 145 +++++++++++------- .../Transforms/test/fold_dma_waits.mlir | 12 +- 2 files changed, 90 insertions(+), 67 deletions(-) diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEFoldDmaWaits.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEFoldDmaWaits.cpp index 775a7b9a9..670edeab4 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEFoldDmaWaits.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEFoldDmaWaits.cpp @@ -16,15 +16,98 @@ namespace mlir::iree_compiler::AMDAIE { namespace { +/// Utility function to determine whether a DMA wait op can be folded based on +/// its half DMA copy operation. +FailureOr canFoldBasedOnHalfDmaCpy( + const AMDAIE::AMDAIEDeviceModel &deviceModel, + AMDAIE::NpuHalfDmaCpyNdOp &npuHalfDmaCpyNdOp, + DenseMap, + SmallVector> &tileConnectToBdIdQueue) { + // Retrieve the connection op. + std::optional maybeConnectionOp = + npuHalfDmaCpyNdOp.getConnectionOp(); + if (!maybeConnectionOp) { + return npuHalfDmaCpyNdOp.emitOpError() + << "expected to operate on an `amdaie.connection`"; + } + AMDAIE::ConnectionOp connectionOp = maybeConnectionOp.value(); + + // Retrieve the flow op. + std::optional maybeFlowOp = connectionOp.getFlowOp(); + if (!maybeFlowOp) { + return connectionOp->emitOpError() + << "expected to operate on an `amdaie.flow`"; + } + AMDAIE::FlowOp flowOp = maybeFlowOp.value(); + bool isPacketFlow = flowOp.getIsPacketFlow(); + + // Retrieve the BD ID op. + std::optional maybeBdIdOp = npuHalfDmaCpyNdOp.getBdIdOp(); + if (!maybeBdIdOp) { + return npuHalfDmaCpyNdOp.emitOpError() + << "must have a BD ID op to lower to " + "`amdaie.npu.write_bd`"; + } + AMDAIE::BdIdOp bdIdOp = maybeBdIdOp.value(); + + // Retrieve the tile op. + AMDAIE::TileOp tileOp = + dyn_cast_if_present(bdIdOp.getTile().getDefiningOp()); + if (!tileOp) { + return bdIdOp.emitOpError() << "must operate on an `amdaie.tile`"; + } + + // Get the maximum queue size. + uint32_t col = getConstantIndexOrAssert(tileOp.getCol()); + uint32_t row = getConstantIndexOrAssert(tileOp.getRow()); + uint32_t maxQueueSize = deviceModel.getDmaMaxQueueSize(col, row); + + // Keep wait op if, either reaches the maximum queue size, or a + // duplicate BD ID in the same tile, or packet flow, or the queue is + // empty + uint32_t bdId = getConstantIndexOrAssert(bdIdOp.getValue()); + bool isDuplicateBdId = + llvm::any_of(tileConnectToBdIdQueue, [&](const auto &entry) { + return entry.first.first == tileOp && + llvm::is_contained(entry.second, bdId); + }); + SmallVector &bdIdQueue = + tileConnectToBdIdQueue[{tileOp, connectionOp}]; + bool canFold = true; + if (isDuplicateBdId || isPacketFlow || bdIdQueue.size() >= maxQueueSize || + bdIdQueue.empty()) { + bdIdQueue.clear(); + canFold = false; + } + bdIdQueue.push_back(bdId); + return canFold; +} + /// Traverses the control code in reverse, ensuring that for each connection, /// only one DMA wait op is retained for every maximum queue size. +/// +/// Example Output: assuming a maximum queue size of 4. +/// dma_cpy_nd +/// %0 = dma_cpy_nd +/// dma_wait(%0) +/// dma_cpy_nd +/// dma_cpy_nd +/// dma_cpy_nd +/// %1 = dma_cpy_nd +/// dma_wait(%1) +/// From the bottom up, for every four DMA copy operations, only one DMA wait +/// operation is retained. +/// +/// Reverse traversal simplifies handling duplicate BD IDs, preventing +/// the need to revisit and modify earlier operations after processing later +/// ones. LogicalResult foldDmaWaits(const AMDAIE::AMDAIEDeviceModel &deviceModel, AMDAIE::ControlCodeOp controlCodeOp) { IRRewriter rewriter(controlCodeOp->getContext()); std::vector waitOpsToErase; DenseMap, SmallVector> - tileConnectionToBdIdQueueMap; + tileConnectToBdIdQueue; // Traverse the control code in reverse. WalkResult res = controlCodeOp->walk( [&](AMDAIE::NpuDmaWaitOp waitOp) { @@ -33,62 +116,10 @@ LogicalResult foldDmaWaits(const AMDAIE::AMDAIEDeviceModel &deviceModel, if (auto npuHalfDmaCpyNdOp = dyn_cast_if_present( token.getDefiningOp())) { - // Retrieve the connection op. - std::optional maybeConnectionOp = - npuHalfDmaCpyNdOp.getConnectionOp(); - if (!maybeConnectionOp) { - npuHalfDmaCpyNdOp.emitOpError() - << "expected to operate on an `amdaie.connection`"; - return WalkResult::interrupt(); - } - AMDAIE::ConnectionOp connectionOp = maybeConnectionOp.value(); - // Retrieve the flow op. - std::optional maybeFlowOp = - maybeConnectionOp->getFlowOp(); - if (!maybeFlowOp) { - maybeConnectionOp->emitOpError() - << "expected to operate on an `amdaie.flow`"; - return WalkResult::interrupt(); - } - bool isPacketFlow = maybeFlowOp->getIsPacketFlow(); - // Retrieve the BD ID op. - std::optional maybeBdIdOp = - npuHalfDmaCpyNdOp.getBdIdOp(); - if (!maybeBdIdOp) { - npuHalfDmaCpyNdOp.emitOpError() - << "must have a BD ID op to lower to " - "`amdaie.npu.write_bd`"; - return WalkResult::interrupt(); - } - AMDAIE::BdIdOp bdIdOp = maybeBdIdOp.value(); - // Retrieve the tile op. - AMDAIE::TileOp tileOp = dyn_cast_if_present( - bdIdOp.getTile().getDefiningOp()); - if (!tileOp) { - bdIdOp.emitOpError() << "must operate on an `amdaie.tile`"; - return WalkResult::interrupt(); - } - // Get the maximum queue size. - uint32_t col = getConstantIndexOrAssert(tileOp.getCol()); - uint32_t row = getConstantIndexOrAssert(tileOp.getRow()); - uint32_t maxQueueSize = deviceModel.getDmaMaxQueueSize(col, row); - // Keep wait op if, either reaches the maximum queue size, or there - // is a duplicate BD ID in the same tile. - uint32_t bdId = getConstantIndexOrAssert(bdIdOp.getValue()); - bool isDuplicateBdId = llvm::any_of( - tileConnectionToBdIdQueueMap, [&](const auto &entry) { - return entry.first.first == tileOp && - llvm::is_contained(entry.second, bdId); - }); - SmallVector &bdIdQueue = - tileConnectionToBdIdQueueMap[std::make_pair(tileOp, - connectionOp)]; - if (isDuplicateBdId || isPacketFlow || - bdIdQueue.size() >= maxQueueSize) { - bdIdQueue.clear(); - } - if (bdIdQueue.empty()) toErase = false; - bdIdQueue.push_back(bdId); + FailureOr result = canFoldBasedOnHalfDmaCpy( + deviceModel, npuHalfDmaCpyNdOp, tileConnectToBdIdQueue); + if (failed(result)) return WalkResult::interrupt(); + toErase &= *result; } } // Erase later to avoid invalidating the iterator. diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/fold_dma_waits.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/fold_dma_waits.mlir index a3f74d20e..4032221cc 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/fold_dma_waits.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/fold_dma_waits.mlir @@ -31,16 +31,8 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} // Expect no DMA waits to be folded, since the same BD ID is used. // CHECK-LABEL: @fold_dma_waits_same_bd_id -// CHECK: %[[OBJECT_FIFO_0:.+]] = amdaie.logicalobjectfifo.from_buffers -// CHECK: %[[CHANNEL_0:.+]] = amdaie.channel -// CHECK: %[[CHANNEL_1:.+]] = amdaie.channel -// CHECK: %[[CONNECTION:.+]] = amdaie.connection -// CHECK: %[[OBJECT_FIFO_1:.+]] = amdaie.logicalobjectfifo.from_memref -// CHECK: %[[BD_ID:.+]] = amdaie.bd_id -// CHECK: %[[TOKEN_0:.+]] = amdaie.npu.half_dma_cpy_nd async %[[CONNECTION]](%[[OBJECT_FIFO_1]] [] [] [] bd_id = %[[BD_ID]] channel = %[[CHANNEL_0]]) : !amdaie.logicalobjectfifo> -// CHECK: amdaie.npu.dma_wait(%[[TOKEN_0]] : !amdaie.async_token) -// CHECK: %[[TOKEN_1:.+]] = amdaie.npu.half_dma_cpy_nd async %[[CONNECTION]](%[[OBJECT_FIFO_1]] [] [] [] bd_id = %[[BD_ID]] channel = %[[CHANNEL_0]]) : !amdaie.logicalobjectfifo> -// CHECK: amdaie.npu.dma_wait(%[[TOKEN_1]] : !amdaie.async_token) +// CHECK-COUNT-2: amdaie.npu.dma_wait +// CHECK-NOT: amdaie.npu.dma_wait #executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> #pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect> module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} {