From 2243dd86d316568247e678c227fa41a249af03cb Mon Sep 17 00:00:00 2001
From: Zhewen Yu
Date: Mon, 9 Dec 2024 22:53:25 +0000
Subject: [PATCH] Add a pass to fold DMA waits (#962)

Each DMA channel has a task queue with a depth of 4. DMA wait is only
required for every 4 pushes, reducing unnecessary synchronization.

Example:
https://gist.github.com/Yu-Zhewen/5f569b56c7b1f1a8715a7c4c3bf9e609

Results compared to 7c4b98571d86d7ec25daa20f1d0ac9e2be8a8c05:

| Test (MxKxN)  | Instruction Size Before (Words) | Instruction Size After (Words) |
|---------------|---------------------------------|--------------------------------|
| 512x4096x512  | 1228                            | 1132                           |
| 512x512x4096  | 820                             | 772                            |
| 4096x512x512  | 4628                            | 4244                           |

This optimization is orthogonal to DMA chaining #931.

---------

Co-authored-by: James Newling
---
 .../AMDAIEControlCodeToTransaction.cpp        |   6 +-
 .../Transforms/AMDAIEFoldDmaWaits.cpp         | 198 ++++++++++++++++
 .../iree-amd-aie/Transforms/CMakeLists.txt    |   1 +
 .../iree-amd-aie/Transforms/PassDetail.h      |   1 +
 .../iree-amd-aie/Transforms/Passes.cpp        |   1 +
 .../AMD-AIE/iree-amd-aie/Transforms/Passes.h  |   3 +
 .../AMD-AIE/iree-amd-aie/Transforms/Passes.td |   6 +
 .../Transforms/test/CMakeLists.txt            |   1 +
 .../test/controlcode_to_transaction.mlir      |   4 +-
 .../Transforms/test/fold_dma_waits.mlir       | 222 ++++++++++++++++++
 .../aie_runtime/iree_aie_runtime.cc           |   8 +
 .../aie_runtime/iree_aie_runtime.h            |   2 +
 12 files changed, 448 insertions(+), 5 deletions(-)
 create mode 100644 compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEFoldDmaWaits.cpp
 create mode 100644 compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/fold_dma_waits.mlir

diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEControlCodeToTransaction.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEControlCodeToTransaction.cpp
index db8477976..665ea08a8 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEControlCodeToTransaction.cpp
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEControlCodeToTransaction.cpp
@@ -220,9 +220,9 @@ LogicalResult convertOp(AMDAIE::NpuDmaWaitOp op, TransactionBuilder &builder) {
 LogicalResult convertOp(AMDAIE::NpuPushToQueueOp op,
                         TransactionBuilder &builder) {
   uint32_t repeatCount = op.getRepeatCount() - 1;
-  if (failed(builder.appendPushToQueueOp(op.getCol(), op.getRow(),
-                                         op.getDirection(), op.getChannel(),
-                                         op.getBdId(), repeatCount, true))) {
+  if (failed(builder.appendPushToQueueOp(
+          op.getCol(), op.getRow(), op.getDirection(), op.getChannel(),
+          op.getBdId(), repeatCount, static_cast<bool>(op.getAsyncToken())))) {
     return failure();
   }
   return success();
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEFoldDmaWaits.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEFoldDmaWaits.cpp
new file mode 100644
index 000000000..670edeab4
--- /dev/null
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEFoldDmaWaits.cpp
@@ -0,0 +1,198 @@
+// Copyright 2024 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree-amd-aie/IR/AMDAIEOps.h"
+#include "iree-amd-aie/Transforms/Passes.h"
+#include "iree-amd-aie/Transforms/Utils/AMDAIEDmaUtils.h"
+#include "iree-amd-aie/Transforms/Utils/AMDAIEUtils.h"
+#include "iree-amd-aie/aie_runtime/iree_aie_runtime.h"
+#include "mlir/IR/Iterators.h"
+
+#define DEBUG_TYPE "iree-amdaie-fold-dma-waits"
+
+namespace mlir::iree_compiler::AMDAIE {
+
+namespace {
+
+/// Utility function to determine whether a DMA wait op can be folded based on
+/// its half DMA copy operation.
+FailureOr<bool> canFoldBasedOnHalfDmaCpy(
+    const AMDAIE::AMDAIEDeviceModel &deviceModel,
+    AMDAIE::NpuHalfDmaCpyNdOp &npuHalfDmaCpyNdOp,
+    DenseMap<std::pair<AMDAIE::TileOp, AMDAIE::ConnectionOp>,
+             SmallVector<uint32_t>> &tileConnectToBdIdQueue) {
+  // Retrieve the connection op.
+  std::optional<AMDAIE::ConnectionOp> maybeConnectionOp =
+      npuHalfDmaCpyNdOp.getConnectionOp();
+  if (!maybeConnectionOp) {
+    return npuHalfDmaCpyNdOp.emitOpError()
+           << "expected to operate on an `amdaie.connection`";
+  }
+  AMDAIE::ConnectionOp connectionOp = maybeConnectionOp.value();
+
+  // Retrieve the flow op.
+  std::optional<AMDAIE::FlowOp> maybeFlowOp = connectionOp.getFlowOp();
+  if (!maybeFlowOp) {
+    return connectionOp->emitOpError()
+           << "expected to operate on an `amdaie.flow`";
+  }
+  AMDAIE::FlowOp flowOp = maybeFlowOp.value();
+  bool isPacketFlow = flowOp.getIsPacketFlow();
+
+  // Retrieve the BD ID op.
+  std::optional<AMDAIE::BdIdOp> maybeBdIdOp = npuHalfDmaCpyNdOp.getBdIdOp();
+  if (!maybeBdIdOp) {
+    return npuHalfDmaCpyNdOp.emitOpError()
+           << "must have a BD ID op to lower to "
+              "`amdaie.npu.write_bd`";
+  }
+  AMDAIE::BdIdOp bdIdOp = maybeBdIdOp.value();
+
+  // Retrieve the tile op.
+  AMDAIE::TileOp tileOp =
+      dyn_cast_if_present<AMDAIE::TileOp>(bdIdOp.getTile().getDefiningOp());
+  if (!tileOp) {
+    return bdIdOp.emitOpError() << "must operate on an `amdaie.tile`";
+  }
+
+  // Get the maximum queue size.
+  uint32_t col = getConstantIndexOrAssert(tileOp.getCol());
+  uint32_t row = getConstantIndexOrAssert(tileOp.getRow());
+  uint32_t maxQueueSize = deviceModel.getDmaMaxQueueSize(col, row);
+
+  // Keep the wait op if any of the following holds: the queue has reached
+  // its maximum size, the BD ID is a duplicate within the same tile, the
+  // flow is a packet flow, or the queue is empty.
+  uint32_t bdId = getConstantIndexOrAssert(bdIdOp.getValue());
+  bool isDuplicateBdId =
+      llvm::any_of(tileConnectToBdIdQueue, [&](const auto &entry) {
+        return entry.first.first == tileOp &&
+               llvm::is_contained(entry.second, bdId);
+      });
+  SmallVector<uint32_t> &bdIdQueue =
+      tileConnectToBdIdQueue[{tileOp, connectionOp}];
+  bool canFold = true;
+  if (isDuplicateBdId || isPacketFlow || bdIdQueue.size() >= maxQueueSize ||
+      bdIdQueue.empty()) {
+    bdIdQueue.clear();
+    canFold = false;
+  }
+  bdIdQueue.push_back(bdId);
+  return canFold;
+}
+
+/// Traverses the control code in reverse, ensuring that for each connection,
+/// only one DMA wait op is retained for every maximum queue size.
+///
+/// Example Output: assuming a maximum queue size of 4.
+///   dma_cpy_nd
+///   %0 = dma_cpy_nd
+///   dma_wait(%0)
+///   dma_cpy_nd
+///   dma_cpy_nd
+///   dma_cpy_nd
+///   %1 = dma_cpy_nd
+///   dma_wait(%1)
+/// From the bottom up, for every four DMA copy operations, only one DMA wait
+/// operation is retained.
+///
+/// Reverse traversal simplifies handling duplicate BD IDs, preventing
+/// the need to revisit and modify earlier operations after processing later
+/// ones.
+LogicalResult foldDmaWaits(const AMDAIE::AMDAIEDeviceModel &deviceModel,
+                           AMDAIE::ControlCodeOp controlCodeOp) {
+  IRRewriter rewriter(controlCodeOp->getContext());
+  std::vector<AMDAIE::NpuDmaWaitOp> waitOpsToErase;
+  DenseMap<std::pair<AMDAIE::TileOp, AMDAIE::ConnectionOp>,
+           SmallVector<uint32_t>>
+      tileConnectToBdIdQueue;
+  // Traverse the control code in reverse.
+  WalkResult res = controlCodeOp->walk<WalkOrder::PostOrder, ReverseIterator>(
+      [&](AMDAIE::NpuDmaWaitOp waitOp) {
+        bool toErase = true;
+        for (Value token : waitOp.getAsyncTokens()) {
+          if (auto npuHalfDmaCpyNdOp =
+                  dyn_cast_if_present<AMDAIE::NpuHalfDmaCpyNdOp>(
+                      token.getDefiningOp())) {
+            FailureOr<bool> result = canFoldBasedOnHalfDmaCpy(
+                deviceModel, npuHalfDmaCpyNdOp, tileConnectToBdIdQueue);
+            if (failed(result)) return WalkResult::interrupt();
+            toErase &= *result;
+          }
+        }
+        // Erase later to avoid invalidating the iterator.
+        if (toErase) waitOpsToErase.push_back(waitOp);
+        return WalkResult::advance();
+      });
+  if (res.wasInterrupted()) return failure();
+
+  for (AMDAIE::NpuDmaWaitOp waitOp : waitOpsToErase) {
+    SmallVector<Value> asyncTokens(waitOp.getAsyncTokens());
+    // Erase the wait op.
+    rewriter.eraseOp(waitOp);
+    for (Value token : asyncTokens) {
+      if (auto op = dyn_cast_if_present<AMDAIE::NpuHalfDmaCpyNdOp>(
+              token.getDefiningOp())) {
+        if (op.use_empty()) {
+          rewriter.setInsertionPoint(op);
+          TypeRange resultTypeRange = TypeRange{};
+          // Nullify the result to avoid issuing a token.
+          rewriter.create<AMDAIE::NpuHalfDmaCpyNdOp>(
+              op.getLoc(), resultTypeRange, op.getConnection(), op.getInput(),
+              op.getMixedOffsets(), op.getMixedSizes(), op.getMixedStrides(),
+              op.getBdId(), op.getChannel());
+          rewriter.eraseOp(op);
+        }
+      }
+    }
+  }
+
+  return success();
+}
+
+class AMDAIEFoldDmaWaitsPass
+    : public impl::AMDAIEFoldDmaWaitsBase<AMDAIEFoldDmaWaitsPass> {
+ public:
+  void getDependentDialects(DialectRegistry &registry) const override {
+    registry.insert<AMDAIEDialect>();
+  }
+
+  AMDAIEFoldDmaWaitsPass() = default;
+  AMDAIEFoldDmaWaitsPass(const AMDAIEFoldDmaWaitsPass &pass){};
+  void runOnOperation() override;
+};
+
+void AMDAIEFoldDmaWaitsPass::runOnOperation() {
+  Operation *parentOp = getOperation();
+
+  auto targetAttr = IREE::HAL::ExecutableTargetAttr::lookup(parentOp);
+  std::optional<AMDAIEDevice> maybeDevice = getConfigAMDAIEDevice(targetAttr);
+  if (!maybeDevice) {
+    parentOp->emitOpError()
+        << "has no AMDAIEDevice in the target attribute configuration. This "
+           "device-specific information is required to fold DMA wait "
+           "ops.";
+    return signalPassFailure();
+  }
+  AMDAIE::AMDAIEDeviceModel deviceModel =
+      AMDAIE::getDeviceModel(maybeDevice.value());
+
+  WalkResult res = parentOp->walk([&](AMDAIE::WorkgroupOp workgroupOp) {
+    AMDAIE::ControlCodeOp controlCodeOp = workgroupOp.getControlCode();
+    if (failed(foldDmaWaits(deviceModel, controlCodeOp))) {
+      return WalkResult::interrupt();
+    }
+    return WalkResult::advance();
+  });
+  if (res.wasInterrupted()) return signalPassFailure();
+}
+
+}  // namespace
+
+std::unique_ptr<Pass> createAMDAIEFoldDmaWaitsPass() {
+  return std::make_unique<AMDAIEFoldDmaWaitsPass>();
+}
+
+}  // namespace mlir::iree_compiler::AMDAIE
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/CMakeLists.txt b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/CMakeLists.txt
index 0e031e6c1..81f020f00 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/CMakeLists.txt
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/CMakeLists.txt
@@ -71,6 +71,7 @@ iree_cc_library(
     "AMDAIEDmaToCircularDma.cpp"
     "AMDAIEFlattenLogicalObjectFifo.cpp"
     "AMDAIELinalgFunctionOutlining.cpp"
+    "AMDAIEFoldDmaWaits.cpp"
    "AMDAIEFuseConsumerIntoLoop.cpp"
    "AMDAIEFuseFillIntoForall.cpp"
    "AMDAIEFusePackIntoLoop.cpp"
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h
index f921f6d47..6cdf14d1b 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h
@@ -52,6 +52,7 @@ namespace mlir::iree_compiler::AMDAIE {
 #define GEN_PASS_DEF_AMDAIEDMATOCIRCULARDMA
 #define GEN_PASS_DEF_AMDAIEFLATTENLOGICALOBJECTFIFO
 #define GEN_PASS_DEF_AMDAIELINALGFUNCTIONOUTLINING
+#define GEN_PASS_DEF_AMDAIEFOLDDMAWAITS
 #define GEN_PASS_DEF_AMDAIEFUSECONSUMERINTOLOOP
 #define GEN_PASS_DEF_AMDAIEFUSEFILLINTOFORALL
 #define GEN_PASS_DEF_AMDAIEFUSEPACKINTOLOOP
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp
index 040fa2b59..9ece915fe 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp
@@ -660,6 +660,7 @@ void addAMDAIEObjectFifoLoweringPasses(
   passManager.addPass(createAMDAIEAssignPacketIdsPass());
 
   passManager.addPass(createAMDAIENpuDmaToHalfDmaCpyNdPass());
+  passManager.addPass(createAMDAIEFoldDmaWaitsPass());
   passManager.addPass(createAMDAIEControlCodeLoweringPass());
   passManager.addPass(createAMDAIEControlCodeToTransactionPass());
 
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h
index cc8a794bc..5fefdf02f 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h
@@ -204,6 +204,9 @@ std::unique_ptr<Pass> createAMDAIEHoistLogicalObjFifoPass();
 std::unique_ptr<Pass> createAMDAIEInsertLoopsForVectorizationPass(
     AMDAIEInsertLoopsForVectorizationOptions options = {});
 
+/// Create a pass to remove redundant DMA wait operations.
+std::unique_ptr<Pass> createAMDAIEFoldDmaWaitsPass();
+
 /// Create a pass to fuse the pack operations into the for loops.
std::unique_ptr createAMDAIEFusePackIntoLoopPass( AMDAIEFusePackIntoLoopOptions options = {}); diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td index f2680ae5e..9d01a5bf5 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td @@ -287,6 +287,12 @@ def AMDAIELinalgFunctionOutlining : let constructor = "mlir::iree_compiler::AMDAIE::createAMDAIELinalgFunctionOutliningPass()"; } +def AMDAIEFoldDmaWaits : + Pass<"iree-amdaie-fold-dma-waits", ""> { + let summary = "Remove redundant dma wait operations in controlcode."; + let constructor = "mlir::iree_compiler::AMDAIE::createAMDAIEFoldDmaWaitsPass()"; +} + def AMDAIEFuseConsumerIntoLoop : InterfacePass<"iree-amdaie-fuse-consumer-into-loop", "mlir::FunctionOpInterface"> { let summary = "Fuse the consumer operation into the innermost last scf loop."; diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt index 7b604e160..5191f1c50 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt @@ -41,6 +41,7 @@ iree_lit_test_suite( "dma_loop_subsumption_circular.mlir" "dma_loop_subsumption.mlir" "dma_to_circular_dma.mlir" + "fold_dma_waits.mlir" "flatten_logical_objectfifo.mlir" "linalg_function_outlining.mlir" "fuse_consumer_into_loop.mlir" diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/controlcode_to_transaction.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/controlcode_to_transaction.mlir index f7704d4db..fa83b2028 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/controlcode_to_transaction.mlir +++ 
b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/controlcode_to_transaction.mlir @@ -75,7 +75,7 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} // CHECK: 0x00000000 // CHECK: 0x0001D214 // CHECK: 0x00000000 -// CHECK: 0x80000000 +// CHECK: 0x00000000 // CHECK: 0x00000018 // CHECK-LABEL: @push_to_queue_default_values // CHECK: npu_instructions = dense_resource : tensor<10xui32> @@ -102,7 +102,7 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} // CHECK: 0x00000000 // CHECK: 0x0601D21C // CHECK: 0x00000000 -// CHECK: 0x803F0002 +// CHECK: 0x003F0002 // CHECK: 0x00000018 // CHECK-LABEL: @push_to_queue // CHECK: npu_instructions = dense_resource : tensor<10xui32> diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/fold_dma_waits.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/fold_dma_waits.mlir new file mode 100644 index 000000000..4032221cc --- /dev/null +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/fold_dma_waits.mlir @@ -0,0 +1,222 @@ +// RUN: iree-opt --pass-pipeline="builtin.module(iree-amdaie-fold-dma-waits)" --split-input-file --verify-diagnostics %s | FileCheck %s + +// expected-error @+1 {{op has no AMDAIEDevice in the target attribute configuration}} +module { + func.func @no_amdaie_device() { + amdaie.workgroup { + amdaie.controlcode { + amdaie.end + } + } + return + } +} + +// ----- + +// CHECK-LABEL: @no_ops +#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> +module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { + func.func @no_ops() { + amdaie.workgroup { + amdaie.controlcode { + amdaie.end + } + } + return + } +} + +// ----- + +// Expect no DMA waits to be folded, since the same BD ID is used. 
+// CHECK-LABEL: @fold_dma_waits_same_bd_id +// CHECK-COUNT-2: amdaie.npu.dma_wait +// CHECK-NOT: amdaie.npu.dma_wait +#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> +#pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect> +module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { + func.func @fold_dma_waits_same_bd_id() { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + amdaie.workgroup { + %tile = amdaie.tile(%c0, %c1) + %tile_0 = amdaie.tile(%c0, %c0) + %buffer = amdaie.buffer(%tile) : memref<2048xi32, 1 : i32> + %buffer_1 = amdaie.buffer(%tile) : memref<2048xi32, 1 : i32> + %lock = amdaie.lock(%tile(4), 4) + %lock_2 = amdaie.lock(%tile(5), 0) + %0 = amdaie.logicalobjectfifo.from_buffers({%buffer, %buffer_1}, {%lock}, {%lock_2}) : memref<2048xi32, 1 : i32>, memref<2048xi32, 1 : i32> -> !amdaie.logicalobjectfifo, 2> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<64x32xi32> + %2 = amdaie.logicalobjectfifo.placeholder{%tile_0} : !amdaie.logicalobjectfifo> + %channel = amdaie.channel(%tile_0, 0, port_type = DMA, direction = MM2S) + %channel_3 = amdaie.channel(%tile, 0, port_type = DMA, direction = S2MM) + %3 = amdaie.flow({%channel} -> {%channel_3}) {is_packet_flow = false} + %4 = amdaie.connection(%0 {%channel_3}, %2 {%channel}, flow = %3) {connection_type = #amdaie} : (!amdaie.logicalobjectfifo, 2>, !amdaie.logicalobjectfifo>) + amdaie.controlcode { + %5 = amdaie.logicalobjectfifo.from_memref %1, {%tile_0} : memref<64x32xi32> -> !amdaie.logicalobjectfifo> + memref.assume_alignment %1, 64 : memref<64x32xi32> + %bd_id = amdaie.bd_id(%tile_0, %c0) + %6 = amdaie.npu.half_dma_cpy_nd async %4(%5 [] [] [] bd_id = %bd_id channel = %channel) : !amdaie.logicalobjectfifo> + amdaie.npu.dma_wait(%6 : 
!amdaie.async_token) + %7 = amdaie.npu.half_dma_cpy_nd async %4(%5 [] [] [] bd_id = %bd_id channel = %channel) : !amdaie.logicalobjectfifo> + amdaie.npu.dma_wait(%7 : !amdaie.async_token) + amdaie.end + } + } + return + } +} + +// ----- + +// DMA queue has a maximum size of 4. To optimize, starting from +// the end of the control code, retain every 4th DMA wait operation +// while folding the others. +// CHECK-LABEL: @fold_dma_waits_max_queue_size +// CHECK: %[[OBJECT_FIFO_0:.+]] = amdaie.logicalobjectfifo.from_buffers +// CHECK: %[[CHANNEL_0:.+]] = amdaie.channel +// CHECK: %[[CHANNEL_1:.+]] = amdaie.channel +// CHECK: %[[CONNECTION:.+]] = amdaie.connection +// CHECK: %[[OBJECT_FIFO_1:.+]] = amdaie.logicalobjectfifo.from_memref +// CHECK: %[[BD_ID_0:.+]] = amdaie.bd_id +// CHECK: %[[TOKEN_0:.+]] = amdaie.npu.half_dma_cpy_nd async %[[CONNECTION]](%[[OBJECT_FIFO_1]] [] [] [] bd_id = %[[BD_ID_0]] channel = %[[CHANNEL_0]]) : !amdaie.logicalobjectfifo> +// CHECK: amdaie.npu.dma_wait(%[[TOKEN_0]] : !amdaie.async_token) +// CHECK: %[[BD_ID_1:.+]] = amdaie.bd_id +// CHECK: amdaie.npu.half_dma_cpy_nd %[[CONNECTION]](%[[OBJECT_FIFO_1]] [] [] [] bd_id = %[[BD_ID_1]] channel = %[[CHANNEL_0]]) : !amdaie.logicalobjectfifo> +// CHECK: %[[BD_ID_2:.+]] = amdaie.bd_id +// CHECK: amdaie.npu.half_dma_cpy_nd %[[CONNECTION]](%[[OBJECT_FIFO_1]] [] [] [] bd_id = %[[BD_ID_2]] channel = %[[CHANNEL_0]]) : !amdaie.logicalobjectfifo> +// CHECK: %[[BD_ID_3:.+]] = amdaie.bd_id +// CHECK: amdaie.npu.half_dma_cpy_nd %[[CONNECTION]](%[[OBJECT_FIFO_1]] [] [] [] bd_id = %[[BD_ID_3]] channel = %[[CHANNEL_0]]) : !amdaie.logicalobjectfifo> +// CHECK: %[[BD_ID_4:.+]] = amdaie.bd_id +// CHECK: %[[TOKEN_1:.+]] = amdaie.npu.half_dma_cpy_nd async %[[CONNECTION]](%[[OBJECT_FIFO_1]] [] [] [] bd_id = %[[BD_ID_4]] channel = %[[CHANNEL_0]]) : !amdaie.logicalobjectfifo> +// CHECK: amdaie.npu.dma_wait(%[[TOKEN_1]] : !amdaie.async_token) +#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", 
"amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> +#pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect> +module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { + func.func @fold_dma_waits_max_queue_size() { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c2 = arith.constant 2 : index + %c3 = arith.constant 3 : index + %c4 = arith.constant 4 : index + amdaie.workgroup { + %tile = amdaie.tile(%c0, %c1) + %tile_0 = amdaie.tile(%c0, %c0) + %buffer = amdaie.buffer(%tile) : memref<2048xi32, 1 : i32> + %buffer_1 = amdaie.buffer(%tile) : memref<2048xi32, 1 : i32> + %lock = amdaie.lock(%tile(4), 4) + %lock_2 = amdaie.lock(%tile(5), 0) + %0 = amdaie.logicalobjectfifo.from_buffers({%buffer, %buffer_1}, {%lock}, {%lock_2}) : memref<2048xi32, 1 : i32>, memref<2048xi32, 1 : i32> -> !amdaie.logicalobjectfifo, 2> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<64x32xi32> + %2 = amdaie.logicalobjectfifo.placeholder{%tile_0} : !amdaie.logicalobjectfifo> + %channel = amdaie.channel(%tile_0, 0, port_type = DMA, direction = MM2S) + %channel_3 = amdaie.channel(%tile, 0, port_type = DMA, direction = S2MM) + %3 = amdaie.flow({%channel} -> {%channel_3}) {is_packet_flow = false} + %4 = amdaie.connection(%0 {%channel_3}, %2 {%channel}, flow = %3) {connection_type = #amdaie} : (!amdaie.logicalobjectfifo, 2>, !amdaie.logicalobjectfifo>) + amdaie.controlcode { + %5 = amdaie.logicalobjectfifo.from_memref %1, {%tile_0} : memref<64x32xi32> -> !amdaie.logicalobjectfifo> + memref.assume_alignment %1, 64 : memref<64x32xi32> + %bd_id = amdaie.bd_id(%tile_0, %c0) + %6 = amdaie.npu.half_dma_cpy_nd async %4(%5 [] [] [] bd_id = %bd_id channel = %channel) : !amdaie.logicalobjectfifo> + amdaie.npu.dma_wait(%6 : !amdaie.async_token) + %bd_id_1 = amdaie.bd_id(%tile_0, %c1) + %7 = 
amdaie.npu.half_dma_cpy_nd async %4(%5 [] [] [] bd_id = %bd_id_1 channel = %channel) : !amdaie.logicalobjectfifo> + amdaie.npu.dma_wait(%7 : !amdaie.async_token) + %bd_id_2 = amdaie.bd_id(%tile_0, %c2) + %8 = amdaie.npu.half_dma_cpy_nd async %4(%5 [] [] [] bd_id = %bd_id_2 channel = %channel) : !amdaie.logicalobjectfifo> + amdaie.npu.dma_wait(%8 : !amdaie.async_token) + %bd_id_3 = amdaie.bd_id(%tile_0, %c3) + %9 = amdaie.npu.half_dma_cpy_nd async %4(%5 [] [] [] bd_id = %bd_id_3 channel = %channel) : !amdaie.logicalobjectfifo> + amdaie.npu.dma_wait(%9 : !amdaie.async_token) + %bd_id_4 = amdaie.bd_id(%tile_0, %c4) + %10 = amdaie.npu.half_dma_cpy_nd async %4(%5 [] [] [] bd_id = %bd_id_4 channel = %channel) : !amdaie.logicalobjectfifo> + amdaie.npu.dma_wait(%10 : !amdaie.async_token) + amdaie.end + } + } + return + } +} + +// ----- + +// Two circuit connections are used, corresponding to two separate channels. +// Each channel operates with its own independent queue. +// CHECK-LABEL: @fold_dma_waits_two_connections +// CHECK: %[[OBJECT_FIFO_0:.+]] = amdaie.logicalobjectfifo.from_buffers +// CHECK: %[[OBJECT_FIFO_1:.+]] = amdaie.logicalobjectfifo.from_buffers +// CHECK: %[[CHANNEL_0:.+]] = amdaie.channel +// CHECK: %[[CHANNEL_1:.+]] = amdaie.channel +// CHECK: %[[CHANNEL_2:.+]] = amdaie.channel +// CHECK: %[[CHANNEL_3:.+]] = amdaie.channel +// CHECK: %[[CONNECTION_0:.+]] = amdaie.connection +// CHECK: %[[CONNECTION_1:.+]] = amdaie.connection +// CHECK: %[[OBJECT_FIFO_2:.+]] = amdaie.logicalobjectfifo.from_memref +// CHECK: %[[OBJECT_FIFO_3:.+]] = amdaie.logicalobjectfifo.from_memref +// CHECK: %[[BD_ID_0:.+]] = amdaie.bd_id +// CHECK: amdaie.npu.half_dma_cpy_nd %[[CONNECTION_0]](%[[OBJECT_FIFO_2]] [] [] [] bd_id = %[[BD_ID_0]] channel = %[[CHANNEL_0]]) : !amdaie.logicalobjectfifo> +// CHECK: %[[BD_ID_1:.+]] = amdaie.bd_id +// CHECK: amdaie.npu.half_dma_cpy_nd %[[CONNECTION_1]](%[[OBJECT_FIFO_3]] [] [] [] bd_id = %[[BD_ID_1]] channel = %[[CHANNEL_2]]) : 
!amdaie.logicalobjectfifo> +// CHECK: %[[BD_ID_2:.+]] = amdaie.bd_id +// CHECK: %[[TOKEN_0:.+]] = amdaie.npu.half_dma_cpy_nd async %[[CONNECTION_0]](%[[OBJECT_FIFO_2]] [] [] [] bd_id = %[[BD_ID_2]] channel = %[[CHANNEL_0]]) : !amdaie.logicalobjectfifo> +// CHECK: amdaie.npu.dma_wait(%[[TOKEN_0]] : !amdaie.async_token) +// CHECK: %[[BD_ID_3:.+]] = amdaie.bd_id +// CHECK: %[[TOKEN_1:.+]] = amdaie.npu.half_dma_cpy_nd async %[[CONNECTION_1]](%[[OBJECT_FIFO_3]] [] [] [] bd_id = %[[BD_ID_3]] channel = %[[CHANNEL_2]]) : !amdaie.logicalobjectfifo> +// CHECK: amdaie.npu.dma_wait(%[[TOKEN_1]] : !amdaie.async_token) +#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> +#pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect> +module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { + func.func @fold_dma_waits_two_connections() { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c2 = arith.constant 2 : index + %c3 = arith.constant 3 : index + amdaie.workgroup { + %tile = amdaie.tile(%c0, %c1) + %tile_0 = amdaie.tile(%c0, %c0) + %buffer = amdaie.buffer(%tile) : memref<2048xi32, 1 : i32> + %buffer_1 = amdaie.buffer(%tile) : memref<2048xi32, 1 : i32> + %buffer_2 = amdaie.buffer(%tile) : memref<2048xi32, 1 : i32> + %buffer_3 = amdaie.buffer(%tile) : memref<2048xi32, 1 : i32> + %lock = amdaie.lock(%tile(4), 4) + %lock_4 = amdaie.lock(%tile(5), 0) + %lock_5 = amdaie.lock(%tile(6), 4) + %lock_6 = amdaie.lock(%tile(7), 0) + %0 = amdaie.logicalobjectfifo.from_buffers({%buffer, %buffer_1}, {%lock}, {%lock_4}) : memref<2048xi32, 1 : i32>, memref<2048xi32, 1 : i32> -> !amdaie.logicalobjectfifo, 2> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<64x32xi32> + %2 = amdaie.logicalobjectfifo.placeholder{%tile_0} : 
!amdaie.logicalobjectfifo> + %3 = amdaie.logicalobjectfifo.from_buffers({%buffer_2, %buffer_3}, {%lock_5}, {%lock_6}) : memref<2048xi32, 1 : i32>, memref<2048xi32, 1 : i32> -> !amdaie.logicalobjectfifo, 2> + %4 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<64x32xi32> + %5 = amdaie.logicalobjectfifo.placeholder{%tile_0} : !amdaie.logicalobjectfifo> + %channel = amdaie.channel(%tile_0, 0, port_type = DMA, direction = MM2S) + %channel_7 = amdaie.channel(%tile_0, 1, port_type = DMA, direction = MM2S) + %channel_8 = amdaie.channel(%tile, 0, port_type = DMA, direction = S2MM) + %channel_9 = amdaie.channel(%tile, 1, port_type = DMA, direction = S2MM) + %6 = amdaie.flow({%channel} -> {%channel_7}) {is_packet_flow = false} + %7 = amdaie.flow({%channel_8} -> {%channel_9}) {is_packet_flow = false} + %8 = amdaie.connection(%0 {%channel_7}, %2 {%channel}, flow = %6) {connection_type = #amdaie} : (!amdaie.logicalobjectfifo, 2>, !amdaie.logicalobjectfifo>) + %9 = amdaie.connection(%3 {%channel_9}, %5 {%channel_8}, flow = %7) {connection_type = #amdaie} : (!amdaie.logicalobjectfifo, 2>, !amdaie.logicalobjectfifo>) + amdaie.controlcode { + %10 = amdaie.logicalobjectfifo.from_memref %1, {%tile_0} : memref<64x32xi32> -> !amdaie.logicalobjectfifo> + memref.assume_alignment %1, 64 : memref<64x32xi32> + %11 = amdaie.logicalobjectfifo.from_memref %4, {%tile_0} : memref<64x32xi32> -> !amdaie.logicalobjectfifo> + memref.assume_alignment %4, 64 : memref<64x32xi32> + %bd_id = amdaie.bd_id(%tile_0, %c0) + %12 = amdaie.npu.half_dma_cpy_nd async %8(%10 [] [] [] bd_id = %bd_id channel = %channel) : !amdaie.logicalobjectfifo> + amdaie.npu.dma_wait(%12 : !amdaie.async_token) + %bd_id_1 = amdaie.bd_id(%tile_0, %c1) + %13 = amdaie.npu.half_dma_cpy_nd async %9(%11 [] [] [] bd_id = %bd_id_1 channel = %channel_8) : !amdaie.logicalobjectfifo> + amdaie.npu.dma_wait(%13 : !amdaie.async_token) + %bd_id_2 = 
amdaie.bd_id(%tile_0, %c2)
+        %14 = amdaie.npu.half_dma_cpy_nd async %8(%10 [] [] [] bd_id = %bd_id_2 channel = %channel) : !amdaie.logicalobjectfifo<memref<64x32xi32>>
+        amdaie.npu.dma_wait(%14 : !amdaie.async_token)
+        %bd_id_3 = amdaie.bd_id(%tile_0, %c3)
+        %15 = amdaie.npu.half_dma_cpy_nd async %9(%11 [] [] [] bd_id = %bd_id_3 channel = %channel_8) : !amdaie.logicalobjectfifo<memref<64x32xi32>>
+        amdaie.npu.dma_wait(%15 : !amdaie.async_token)
+        amdaie.end
+      }
+    }
+    return
+  }
+}
diff --git a/runtime/src/iree-amd-aie/aie_runtime/iree_aie_runtime.cc b/runtime/src/iree-amd-aie/aie_runtime/iree_aie_runtime.cc
index 58d068743..bc5bca39d 100644
--- a/runtime/src/iree-amd-aie/aie_runtime/iree_aie_runtime.cc
+++ b/runtime/src/iree-amd-aie/aie_runtime/iree_aie_runtime.cc
@@ -225,6 +225,14 @@ bool AMDAIEDeviceModel::isShimTile(uint8_t col, uint8_t row) const {
   return row == configPtr.ShimRowNum;
 }
 
+uint8_t AMDAIEDeviceModel::getDmaMaxQueueSize(uint8_t col, uint8_t row) const {
+  uint8_t maxQueueSize = 0;
+  TRY_XAIE_API_FATAL_ERROR(XAie_DmaGetMaxQueueSize,
+                           const_cast<XAie_DevInst *>(&devInst),
+                           XAie_TileLoc(col, row), &maxQueueSize);
+  return maxQueueSize;
+}
+
 // TODO(max): these should be optionals instead of returning 0.
 uint32_t AMDAIEDeviceModel::getNumLocks(uint8_t col, uint8_t row) const {
   AMDAIETileType tileType = getTileType(col, row);
diff --git a/runtime/src/iree-amd-aie/aie_runtime/iree_aie_runtime.h b/runtime/src/iree-amd-aie/aie_runtime/iree_aie_runtime.h
index 278a208bc..742eb8a59 100644
--- a/runtime/src/iree-amd-aie/aie_runtime/iree_aie_runtime.h
+++ b/runtime/src/iree-amd-aie/aie_runtime/iree_aie_runtime.h
@@ -315,6 +315,8 @@ struct AMDAIEDeviceModel {
     return *((const T *)(dmaBdMod + static_cast<uint8_t>(dmaBdProp)));
   }
 
+  uint8_t getDmaMaxQueueSize(uint8_t col, uint8_t row) const;
+
   uint32_t getNumLocks(uint8_t col, uint8_t row) const;
 
   std::optional<TileLoc> getMemWest(TileLoc src) const;