diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.cpp index 72ff124af..09f890879 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.cpp @@ -1123,7 +1123,7 @@ void NpuHalfDmaCpyNdOp::build(OpBuilder &b, OperationState &result, Value input, ArrayRef offsets, ArrayRef sizes, ArrayRef strides, Value bdId, - Value channel) { + Value channel, Value nextBd, Value startBd) { SmallVector staticOffsets, staticSizes, staticStrides; SmallVector dynamicOffsets, dynamicSizes, dynamicStrides; dispatchIndexOpFoldResults(offsets, dynamicOffsets, staticOffsets); @@ -1131,7 +1131,7 @@ void NpuHalfDmaCpyNdOp::build(OpBuilder &b, OperationState &result, dispatchIndexOpFoldResults(strides, dynamicStrides, staticStrides); build(b, result, resultTypes, connection, input, dynamicOffsets, dynamicSizes, dynamicStrides, staticOffsets, staticSizes, staticStrides, bdId, - channel); + channel, nextBd, startBd); } // Build a NpuHalfDmaCpyNdOp with static entries. @@ -1140,7 +1140,7 @@ void NpuHalfDmaCpyNdOp::build(OpBuilder &b, OperationState &result, Value input, ArrayRef offsets, ArrayRef sizes, ArrayRef strides, mlir::Value bdId, - Value channel) { + Value channel, Value nextBd, Value startBd) { SmallVector offsetValues = llvm::to_vector<4>(llvm::map_range( offsets, [&](int64_t v) -> OpFoldResult { return b.getI64IntegerAttr(v); })); @@ -1152,7 +1152,7 @@ void NpuHalfDmaCpyNdOp::build(OpBuilder &b, OperationState &result, strides, [&](int64_t v) -> OpFoldResult { return b.getI64IntegerAttr(v); })); build(b, result, resultTypes, connection, input, offsetValues, sizeValues, - strideValues, bdId, channel); + strideValues, bdId, channel, nextBd, startBd); } // Build a NpuHalfDmaCpyNdOp with dynamic entries. 
@@ -1160,7 +1160,7 @@ void NpuHalfDmaCpyNdOp::build(OpBuilder &b, OperationState &result, TypeRange resultTypes, Value connection, Value input, ValueRange offsets, ValueRange sizes, ValueRange strides, mlir::Value bdId, - Value channel) { + Value channel, Value nextBd, Value startBd) { SmallVector offsetValues = llvm::to_vector<4>( llvm::map_range(offsets, [](Value v) -> OpFoldResult { return v; })); SmallVector sizeValues = llvm::to_vector<4>( @@ -1168,7 +1168,7 @@ void NpuHalfDmaCpyNdOp::build(OpBuilder &b, OperationState &result, SmallVector strideValues = llvm::to_vector<4>( llvm::map_range(strides, [](Value v) -> OpFoldResult { return v; })); build(b, result, resultTypes, connection, input, offsetValues, sizeValues, - strideValues, bdId, channel); + strideValues, bdId, channel, nextBd, startBd); } std::optional NpuHalfDmaCpyNdOp::getStaticBaseOffset() { diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.td b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.td index 36985c8b9..371945da7 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.td +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.td @@ -595,20 +595,32 @@ def AMDAIE_NpuHalfDmaCpyNdOp ShapedType::kDynamic encodes that the corresponding entry has a dynamic value. + It also supports the representation of DMA BD chaining using the + `next_bd` and `start_bd` operands. The `next_bd` operand specifies + the BD ID of the next DMA operation in the chain, if there is any. + + The `start_bd` operand specifies the BD ID of the first DMA operation in a sequence. + - If `start_bd` is the same as `bd_id`, it marks the start of a chain. + - If `start_bd` differs from `bd_id` and `next_bd` is set, it represents + an intermediate operation in the chain. + - If `start_bd` differs from `bd_id` and `next_bd` is not set, it represents + the end of the chain.
+ Example: ```mlir %2 = amdaie.connection(%1, %0) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - %bd_id = amdaie.bd_id(%tile_0_0, 0) + %bd_id_0 = amdaie.bd_id(%tile_0_0, 0) + %bd_id_1 = amdaie.bd_id(%tile_0_0, 1) %channel = amdaie.channel(%tile_0_0, 0, port_type = DMA, direction = MM2S) ... amdaie.controlcode { %5 = amdaie.logicalobjectfifo.from_memref %0, {%tile_0_0} : memref<32x1024xi32> -> !amdaie.logicalobjectfifo> %4 = amdaie.npu.half_dma_cpy_nd async %2(%0[0, 0] [32, 64] [1024, 1] - bd_id = %bd_id channel = %channel) + bd_id = %bd_id_0 channel = %channel next_bd = %bd_id_1 start_bd = %bd_id_0) ... } ``` @@ -624,7 +636,9 @@ def AMDAIE_NpuHalfDmaCpyNdOp DenseI64ArrayAttr:$static_sizes, DenseI64ArrayAttr:$static_strides, Optional:$bd_id, - Optional:$channel + Optional:$channel, + Optional:$next_bd, + Optional:$start_bd ); let results = (outs Optional:$async_token); @@ -639,6 +653,8 @@ def AMDAIE_NpuHalfDmaCpyNdOp custom($strides, $static_strides) (`bd_id` `=` $bd_id^)? (`channel` `=` $channel^)? + (`next_bd` `=` $next_bd^)? + (`start_bd` `=` $start_bd^)? `)` attr-dict `:` type($input) @@ -649,16 +665,19 @@ def AMDAIE_NpuHalfDmaCpyNdOp OpBuilder<(ins "::mlir::TypeRange":$result_types, "Value":$connection, "::mlir::Value":$input, "ArrayRef":$offsets, "ArrayRef":$sizes, "ArrayRef":$strides, - "::mlir::Value":$bd_id, "::mlir::Value":$channel)>, + "::mlir::Value":$bd_id, "::mlir::Value":$channel, + CArg<"::mlir::Value", "nullptr">:$next_bd, CArg<"::mlir::Value", "nullptr">:$start_bd)>, // Build a NpuHalfDmaCpyNdOp with static entries. OpBuilder<(ins "::mlir::TypeRange":$result_types, "Value":$connection, "::mlir::Value":$target, "ArrayRef":$offsets, "ArrayRef":$sizes, "ArrayRef":$strides, - "::mlir::Value":$bd_id, "::mlir::Value":$channel)>, + "::mlir::Value":$bd_id, "::mlir::Value":$channel, + CArg<"::mlir::Value", "nullptr">:$next_bd, CArg<"::mlir::Value", "nullptr">:$start_bd)>, // Build a NpuHalfDmaCpyNdOp with dynamic entries. 
OpBuilder<(ins "::mlir::TypeRange":$result_types, "Value":$connection, "::mlir::Value":$input, "ValueRange":$offsets, "ValueRange":$sizes, - "ValueRange":$strides, "::mlir::Value":$bd_id, "::mlir::Value":$channel)> + "ValueRange":$strides, "::mlir::Value":$bd_id, "::mlir::Value":$channel, + CArg<"::mlir::Value", "nullptr">:$next_bd, CArg<"::mlir::Value", "nullptr">:$start_bd)>, ]; let extraClassDeclaration = [{ @@ -674,9 +693,20 @@ def AMDAIE_NpuHalfDmaCpyNdOp } std::optional getBdIdOp() { + if (!getBdId()) return std::nullopt; return dyn_cast_if_present(getBdId().getDefiningOp()); } + std::optional getNextBdIdOp() { + if (!getNextBd()) return std::nullopt; + return dyn_cast_if_present(getNextBd().getDefiningOp()); + } + + std::optional getStartBdIdOp() { + if (!getStartBd()) return std::nullopt; + return dyn_cast_if_present(getStartBd().getDefiningOp()); + } + // Return the input `amdaie.connection` operation. std::optional getConnectionOp() { return dyn_cast_if_present(getConnection().getDefiningOp()); diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/test/roundtrip.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/test/roundtrip.mlir index c261f099a..c542e2627 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/test/roundtrip.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/test/roundtrip.mlir @@ -397,6 +397,7 @@ func.func @npu_dma_cpy_nd_all_operands(%arg0: !amdaie.logicalobjectfifo>, %arg1: !amdaie.logicalobjectfifo>) { @@ -404,6 +405,7 @@ func.func @npu_half_dma_cpy_nd(%arg0: !amdaie.logicalobjectfifo %c1 = arith.constant 1 : index %tile_0_0 = amdaie.tile(%c0, %c0) %bd_id = amdaie.bd_id(%tile_0_0, %c0) + %bd_id_1 = amdaie.bd_id(%tile_0_0, %c1) %channel = amdaie.channel(%tile_0_0, 0, port_type = DMA, direction = S2MM) %0 = amdaie.connection(%arg0, %arg1) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) // CHECK: amdaie.npu.half_dma_cpy_nd %[[CONNECTION_0]](%[[ARG0]] [] [] []) : !amdaie.logicalobjectfifo> @@ -416,6 
+418,8 @@ func.func @npu_half_dma_cpy_nd(%arg0: !amdaie.logicalobjectfifo amdaie.npu.half_dma_cpy_nd %0(%arg0[%c0, 0] [%c0, 64] [%c0, 1] channel = %channel) : !amdaie.logicalobjectfifo> // CHECK: amdaie.npu.half_dma_cpy_nd %[[CONNECTION_0]](%[[ARG0]] [] [] [] bd_id = %[[BD_ID]] channel = %[[CHANNEL]]) : !amdaie.logicalobjectfifo> amdaie.npu.half_dma_cpy_nd %0(%arg0[] [] [] bd_id = %bd_id channel = %channel) : !amdaie.logicalobjectfifo> +// CHECK: amdaie.npu.half_dma_cpy_nd %[[CONNECTION_0]](%[[ARG0]] [] [] [] bd_id = %[[BD_ID]] channel = %[[CHANNEL]] next_bd = %[[BD_ID_1]] start_bd = %[[BD_ID]]) : !amdaie.logicalobjectfifo> + amdaie.npu.half_dma_cpy_nd %0(%arg0[] [] [] bd_id = %bd_id channel = %channel next_bd = %bd_id_1 start_bd = %bd_id) : !amdaie.logicalobjectfifo> return } diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEControlCodeLowering.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEControlCodeLowering.cpp index 60bb8144a..cf40de2b0 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEControlCodeLowering.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEControlCodeLowering.cpp @@ -110,8 +110,13 @@ struct HalfDmaCpyNdToNpuConverter final staticStrides.insert(staticStrides.begin(), numIntraAddrDim - staticStrides.size(), 0); - bool useNextBd{false}; + bool useNextBd = false; int32_t nextBd{0}; + if (std::optional nextBdIdOp = op.getNextBdIdOp()) { + nextBd = getConstantIndexOrAssert(nextBdIdOp.value().getValue()); + useNextBd = true; + } + bool validBd{true}; int32_t lockRelVal{0}; int32_t lockRelId{0}; @@ -208,6 +213,21 @@ struct HalfDmaCpyNdToNpuConverter final strides); if (failed(npuPushToQueueOp)) return failure(); rewriter.replaceOp(op, *npuPushToQueueOp); + + std::optional nextBdIdOp = op.getNextBdIdOp(); + if (nextBdIdOp) { + // `next_bd` is set, so either at the beginning or middle of a chain. + // No need to push to the queue, just erase the op. 
+ rewriter.eraseOp(*npuPushToQueueOp); + } else { + std::optional maybeStartBdIdOp = op.getStartBdIdOp(); + if (maybeStartBdIdOp) { + // Update with the BD ID at the start of the chain. + AMDAIE::BdIdOp startBdIdOp = maybeStartBdIdOp.value(); + uint32_t startBdId = getConstantIndexOrAssert(startBdIdOp.getValue()); + npuPushToQueueOp->setBdId(startBdId); + } + } return success(); } diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEFoldDmaWaits.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEFoldDmaWaits.cpp index 670edeab4..2f0c6030d 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEFoldDmaWaits.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEFoldDmaWaits.cpp @@ -142,7 +142,7 @@ LogicalResult foldDmaWaits(const AMDAIE::AMDAIEDeviceModel &deviceModel, rewriter.create( op.getLoc(), resultTypeRange, op.getConnection(), op.getInput(), op.getMixedOffsets(), op.getMixedSizes(), op.getMixedStrides(), - op.getBdId(), op.getChannel()); + op.getBdId(), op.getChannel(), op.getNextBd(), op.getStartBd()); rewriter.eraseOp(op); } } diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEInsertDmaBdChain.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEInsertDmaBdChain.cpp new file mode 100644 index 000000000..b21ceb025 --- /dev/null +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEInsertDmaBdChain.cpp @@ -0,0 +1,270 @@ +// Copyright 2024 The IREE Authors +// +// Licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "iree-amd-aie/IR/AMDAIEOps.h" +#include "iree-amd-aie/Transforms/Passes.h" +#include "iree-amd-aie/Transforms/Utils/AMDAIEDmaUtils.h" +#include "iree-amd-aie/Transforms/Utils/AMDAIEUtils.h" +#include "iree-amd-aie/aie_runtime/Utils/ChannelBdIdGenerator.h" +#include "iree-amd-aie/aie_runtime/iree_aie_runtime.h" +#include "mlir/IR/Iterators.h" +#define DEBUG_TYPE "iree-amdaie-insert-dma-bd-chain" + +namespace mlir::iree_compiler::AMDAIE { + +namespace { + +using DmaChain = std::pair; + +/// Utility function to update `next_bd` and `start_bd` operands. +LogicalResult updateChainOperands( + IRRewriter &rewriter, SmallVector &dmaOps) { + // Nothing to do if the DMA chain length is one or less. + if (dmaOps.size() < 2) return success(); + + Value startBdId = dmaOps[0].getBdId(); + Operation *parentOp = dmaOps[0]->getParentOp(); + // Chain the DMA ops. + for (unsigned i = 0; i < dmaOps.size() - 1; ++i) { + AMDAIE::NpuHalfDmaCpyNdOp currDmaOp = dmaOps[i]; + if (currDmaOp->getParentOp() != parentOp) { + return currDmaOp.emitError( + "DMA operations to be chained must belong to the same scope"); + } + Value nextBdId = dmaOps[i + 1].getBdId(); + // No token is produced at the beginning or middle of a chain. + TypeRange token = TypeRange{}; + rewriter.setInsertionPointAfter(currDmaOp); + rewriter.create( + currDmaOp.getLoc(), token, currDmaOp.getConnection(), + currDmaOp.getInput(), currDmaOp.getMixedOffsets(), + currDmaOp.getMixedSizes(), currDmaOp.getMixedStrides(), + currDmaOp.getBdId(), currDmaOp.getChannel(), nextBdId, startBdId); + for (auto &use : currDmaOp->getUses()) rewriter.eraseOp(use.getOwner()); + rewriter.eraseOp(currDmaOp); + } + // Last DMA op in the chain. 
+ AMDAIE::NpuHalfDmaCpyNdOp lastDmaOp = dmaOps.back(); + if (lastDmaOp->getParentOp() != parentOp) { + return lastDmaOp.emitError( + "DMA operations to be chained must belong to the same scope"); + } + Value nextBdId = nullptr; + rewriter.setInsertionPointAfter(lastDmaOp); + auto lastDmaOpChained = rewriter.create( + lastDmaOp.getLoc(), lastDmaOp.getResultTypes(), lastDmaOp.getConnection(), + lastDmaOp.getInput(), lastDmaOp.getMixedOffsets(), + lastDmaOp.getMixedSizes(), lastDmaOp.getMixedStrides(), + lastDmaOp.getBdId(), lastDmaOp.getChannel(), nextBdId, startBdId); + rewriter.replaceOp(lastDmaOp, lastDmaOpChained.getResults()); + return success(); +} + +/// Utility function to determine if chains can grow further +/// or require breaking. +/// +/// Example: +/// - Chain X currently holds BD IDs: [4, 5, 6, 7] +/// - Chain Y currently holds BD IDs: [0, 1, 2, 3] +/// - A new BD ID (0) needs to be added to the front (due to reverse +/// traversing) of chain X. +/// +/// Conflict resolution: +/// - Chain Y must be broken because BD ID 0 is already assigned to it +/// and must be released. +/// - Chain X is also broken to prevent the new added BD ID (0) from +/// invalidating chain Y. +/// +/// Result: +/// - Break both chains X and Y. +/// - Chain X: [0] (the newly added BD ID). +/// - Chain Y: [] (emptied after breaking). +void checkForChainsToBeBroken( + uint32_t currBdId, const DmaChain &currDmaChain, + const DenseMap> &dmaChainToBdIds, + SmallVector &chainsToBreak) { + for (auto &[entry, bdIds] : dmaChainToBdIds) { + if (entry.first == currDmaChain.first && bdIds.contains(currBdId)) { + // Break the chain that contains the duplicate BD ID. + chainsToBreak.push_back(entry); + if (entry != currDmaChain) { + // Break the current chain as well. + chainsToBreak.push_back(currDmaChain); + } + break; + } + } +} + +/// Traverse the control code in reverse order to create DMA BD chains. 
Reverse +/// traversal simplifies handling duplicate BD IDs, preventing the need to +/// revisit and modify earlier operations after processing later ones. +LogicalResult insertDmaBdChain(const AMDAIE::AMDAIEDeviceModel &deviceModel, + AMDAIE::ControlCodeOp controlCodeOp) { + IRRewriter rewriter(controlCodeOp->getContext()); + + // Move all BdIdOps to the beginning of the control code. + // This is to avoid dominance issues when chaining BD IDs. + SmallVector bdIdOps; + WalkResult res = controlCodeOp->walk([&](Operation *op) { + if (auto bdIdOp = dyn_cast(op)) { + bdIdOps.push_back(op); + } + return WalkResult::advance(); + }); + for (Operation *op : llvm::reverse(bdIdOps)) { + op->moveBefore(&controlCodeOp.front()); + } + + // BD IDs that have been assigned in each tile. + DenseMap> dmaChainToBdIds; + // Buffers the DMA ops that will be chained. + DenseMap> dmaChainToDmaOps; + + res = controlCodeOp->walk([&](Operation *op) { + if (auto npuHalfDmaCpyNdOp = dyn_cast(op)) { + // Not shim, will be erased at ControlcodeLowering, ignore. + if (npuHalfDmaCpyNdOp.getMemorySpaceAsUInt() != 0) { + return WalkResult::advance(); + } + + // Get the connection op. + std::optional maybeConnectionOp = + npuHalfDmaCpyNdOp.getConnectionOp(); + if (!maybeConnectionOp) { + npuHalfDmaCpyNdOp.emitOpError() + << "expected to operate on an `amdaie.connection`"; + return WalkResult::interrupt(); + } + AMDAIE::ConnectionOp connectionOp = maybeConnectionOp.value(); + + // Packet flow, do not chain BDs. + std::optional maybeFlowOp = connectionOp.getFlowOp(); + if (!maybeFlowOp) { + connectionOp->emitOpError() + << "expected to operate on an `amdaie.flow`"; + return WalkResult::interrupt(); + } + AMDAIE::FlowOp flowOp = maybeFlowOp.value(); + bool isPacketFlow = flowOp.getIsPacketFlow(); + if (isPacketFlow) return WalkResult::advance(); + + // Repeat count > 1, do not chain BDs. 
+ int32_t repeatCount = 1; + uint8_t numAddrDim = DmaDimConfig(deviceModel, 0).maxNbDims; + SmallVector sizes = npuHalfDmaCpyNdOp.getMixedSizes(); + SmallVector strides = npuHalfDmaCpyNdOp.getMixedStrides(); + if (!sizes.empty() && !strides.empty()) { + int64_t size = getConstantIndexOrAssert(sizes[0]); + int64_t stride = getConstantIndexOrAssert(strides[0]); + if (sizes.size() == numAddrDim || stride == 0) { + repeatCount = size; + } + } + if (repeatCount > 1) return WalkResult::advance(); + + // Get the BD ID and tile op. + std::optional maybeBdIdOp = npuHalfDmaCpyNdOp.getBdIdOp(); + if (!maybeBdIdOp) { + npuHalfDmaCpyNdOp.emitOpError() << "must have a BD ID op"; + return WalkResult::interrupt(); + } + AMDAIE::BdIdOp bdIdOp = maybeBdIdOp.value(); + uint32_t bdId = getConstantIndexOrAssert(bdIdOp.getValue()); + AMDAIE::TileOp tileOp = + dyn_cast_if_present(bdIdOp.getTile().getDefiningOp()); + if (!tileOp) { + bdIdOp.emitOpError() << "must operate on an `amdaie.tile`"; + return WalkResult::interrupt(); + } + + // Any duplicate BD ID from the same tile indicates that the chain + // cannot grow further and requires breaking to release the + // conflicting BD ID. + SmallVector chainsToBreak; + DmaChain currDmaChain = {tileOp, connectionOp}; + checkForChainsToBeBroken(bdId, currDmaChain, dmaChainToBdIds, + chainsToBreak); + + // If the chains are not to be continued, update DMA operands using + // the `updateChainOperands` function. + if (!chainsToBreak.empty()) { + for (auto &entry : chainsToBreak) { + // Since the controlcode is traversed in reverse order, we need to + // restore the original order of the DMA operations. 
+ std::reverse(dmaChainToDmaOps[entry].begin(), + dmaChainToDmaOps[entry].end()); + if (failed(updateChainOperands(rewriter, dmaChainToDmaOps[entry]))) + return WalkResult::interrupt(); + dmaChainToBdIds[entry].clear(); + dmaChainToDmaOps[entry].clear(); + } + } + dmaChainToBdIds[currDmaChain].insert(bdId); + dmaChainToDmaOps[currDmaChain].push_back(npuHalfDmaCpyNdOp); + } + return WalkResult::advance(); + }); + + // Build the remaining chains. + for (auto &[entry, _] : dmaChainToBdIds) { + // Since the controlcode is traversed in reverse order, we need to + // restore the original order of the DMA operations. + std::reverse(dmaChainToDmaOps[entry].begin(), + dmaChainToDmaOps[entry].end()); + if (failed(updateChainOperands(rewriter, dmaChainToDmaOps[entry]))) + return failure(); + } + + if (res.wasInterrupted()) return failure(); + return success(); +} + +class AMDAIEInsertDmaBdChainPass + : public impl::AMDAIEInsertDmaBdChainBase { + public: + void getDependentDialects(DialectRegistry &registry) const override { + registry.insert(); + } + + AMDAIEInsertDmaBdChainPass() = default; + AMDAIEInsertDmaBdChainPass(const AMDAIEInsertDmaBdChainPass &pass){}; + void runOnOperation() override; +}; + +void AMDAIEInsertDmaBdChainPass::runOnOperation() { + Operation *parentOp = getOperation(); + + auto targetAttr = IREE::HAL::ExecutableTargetAttr::lookup(parentOp); + std::optional maybeDevice = getConfigAMDAIEDevice(targetAttr); + if (!maybeDevice) { + parentOp->emitOpError() + << "has no AMDAIEDevice in the target attribute configuration.
This " + "device-specific information is required to lower control code " + "ops."; + return signalPassFailure(); + } + AMDAIE::AMDAIEDeviceModel deviceModel = + AMDAIE::getDeviceModel(maybeDevice.value()); + + WalkResult res = parentOp->walk([&](AMDAIE::WorkgroupOp workgroupOp) { + AMDAIE::ControlCodeOp controlCodeOp = workgroupOp.getControlCode(); + if (failed(insertDmaBdChain(deviceModel, controlCodeOp))) { + return WalkResult::interrupt(); + } + return WalkResult::advance(); + }); + if (res.wasInterrupted()) return signalPassFailure(); +} + +} // namespace + +std::unique_ptr createAMDAIEInsertDmaBdChainPass() { + return std::make_unique(); +} + +} // namespace mlir::iree_compiler::AMDAIE diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/CMakeLists.txt b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/CMakeLists.txt index 81f020f00..012c004c0 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/CMakeLists.txt +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/CMakeLists.txt @@ -78,6 +78,7 @@ iree_cc_library( "AMDAIEHoistForAffineApply.cpp" "AMDAIEHoistLogicalObjFifo.cpp" "AMDAIEInsertCores.cpp" + "AMDAIEInsertDmaBdChain.cpp" "AMDAIEInsertInfiniteLoopAroundCoreBlock.cpp" "AMDAIEInsertLoopsForVectorization.cpp" "AMDAIELinkExecutables.cpp" diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h index 6cdf14d1b..0fbed8c81 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h @@ -60,6 +60,7 @@ namespace mlir::iree_compiler::AMDAIE { #define GEN_PASS_DEF_AMDAIEHOISTLOGICALOBJFIFO #define GEN_PASS_DEF_AMDAIEINSERTAIEWORKGROUP #define GEN_PASS_DEF_AMDAIEINSERTCORES +#define GEN_PASS_DEF_AMDAIEINSERTDMABDCHAIN #define GEN_PASS_DEF_AMDAIEINSERTINFINITELOOPAROUNDCOREBLOCK #define 
GEN_PASS_DEF_AMDAIEINSERTLOOPSFORVECTORIZATION #define GEN_PASS_DEF_AMDAIELINKEXECUTABLES diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp index c657cef51..55d2bd4d9 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp @@ -660,6 +660,7 @@ void addAMDAIEObjectFifoLoweringPasses( passManager.addPass(createAMDAIEAssignPacketIdsPass()); passManager.addPass(createAMDAIENpuDmaToHalfDmaCpyNdPass()); + passManager.addPass(createAMDAIEInsertDmaBdChainPass()); passManager.addPass(createAMDAIEFoldDmaWaitsPass()); passManager.addPass(createAMDAIEControlCodeLoweringPass()); passManager.addPass(createAMDAIEControlCodeToTransactionPass()); diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h index 5fefdf02f..c2ce74ac7 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h @@ -199,6 +199,9 @@ std::unique_ptr createAMDAIEHoistForLoopAffineApplyPass(); /// operands. std::unique_ptr createAMDAIEHoistLogicalObjFifoPass(); +/// Create pass to chain DMA BD IDs by updating next_bd operands. +std::unique_ptr createAMDAIEInsertDmaBdChainPass(); + /// Create a pass to transform linalg.generics into a form which benefits later /// vectorization passes (to vector and aievec dialects). 
std::unique_ptr createAMDAIEInsertLoopsForVectorizationPass( diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td index f7ac2c8a1..96d80a184 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td @@ -336,6 +336,13 @@ def AMDAIEInsertCores : let constructor = "mlir::iree_compiler::AMDAIE::createAMDAIEInsertCoresPass()"; } +def AMDAIEInsertDmaBdChain : + Pass<"iree-amdaie-insert-dma-bd-chain"> { + let summary = "Chain DMA BD IDs by updating next_bd operands."; + let constructor = "mlir::iree_compiler::AMDAIE::createAMDAIEInsertDmaBdChainPass()"; +} + + def AMDAIEInsertInfiniteLoopAroundCoreBlock : Pass<"iree-amdaie-insert-infinite-loop-around-core-block", ""> { let summary = "Inserts an infinite loop around each `amdaie.core`'s block."; diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt index 5191f1c50..943aa2dcc 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt @@ -50,6 +50,7 @@ iree_lit_test_suite( "hoist_for_affine_apply.mlir" "hoist_logical_obj_fifo.mlir" "insert_cores.mlir" + "insert_dma_bd_chain.mlir" "insert_infinite_loop_around_core_block.mlir" "insert_loops_for_vectorization.mlir" "localize_logical_objectfifo.mlir" diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/controlcode_lowering.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/controlcode_lowering.mlir index 6a9bfb85d..26bad8b3b 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/controlcode_lowering.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/controlcode_lowering.mlir @@ -128,3 +128,53 @@ 
module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} return } } + +// ----- + +// CHECK-LABEL: @half_npu_dma_cpy_nd_chain +// CHECK: amdaie.controlcode +#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> +#pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect> +module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { + func.func @half_npu_dma_cpy_nd_chain() { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c2 = arith.constant 2 : index + amdaie.workgroup { + %tile = amdaie.tile(%c0, %c1) + %tile_0 = amdaie.tile(%c0, %c0) + %buffer = amdaie.buffer(%tile) : memref<2048xi32, 1 : i32> + %buffer_1 = amdaie.buffer(%tile) : memref<2048xi32, 1 : i32> + %lock = amdaie.lock(%tile(4), 4) + %lock_2 = amdaie.lock(%tile(5), 0) + %0 = amdaie.logicalobjectfifo.from_buffers({%buffer, %buffer_1}, {%lock}, {%lock_2}) : memref<2048xi32, 1 : i32>, memref<2048xi32, 1 : i32> -> !amdaie.logicalobjectfifo, 2> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<64x32xi32> + %2 = amdaie.logicalobjectfifo.placeholder{%tile_0} : !amdaie.logicalobjectfifo> + %channel = amdaie.channel(%tile_0, 0, port_type = DMA, direction = MM2S) + %channel_3 = amdaie.channel(%tile, 0, port_type = DMA, direction = S2MM) + %3 = amdaie.flow({%channel} -> {%channel_3}) {is_packet_flow = false} + %4 = amdaie.connection(%0 {%channel_3}, %2 {%channel}, flow = %3) {connection_type = #amdaie} : (!amdaie.logicalobjectfifo, 2>, !amdaie.logicalobjectfifo>) + amdaie.controlcode { + %5 = amdaie.logicalobjectfifo.from_memref %1, {%tile_0} : memref<64x32xi32> -> !amdaie.logicalobjectfifo> + memref.assume_alignment %1, 64 : memref<64x32xi32> + %bd_id = amdaie.bd_id(%tile_0, %c0) + %bd_id_1 = amdaie.bd_id(%tile_0, %c1) + %bd_id_2 = 
amdaie.bd_id(%tile_0, %c2) +// CHECK: amdaie.npu.write_bd {bd_id = 0 : ui32, buffer_length = 0 : ui32, buffer_offset = 0 : ui32, col = 0 : ui32, enable_packet = false, iteration_current = 0 : ui32, iteration_size = 0 : ui32, iteration_stride = 0 : ui32, lock_acq_enable = false, lock_acq_id = 0 : ui32, lock_acq_val = 0 : i32, lock_rel_id = 0 : ui32, lock_rel_val = 0 : i32, next_bd = 1 : ui32, out_of_order_id = 0 : ui32, packet_id = 0 : ui32, packet_type = 0 : ui32, paddings_after = array, paddings_before = array, row = 0 : ui32, sizes = array, strides = array, use_next_bd = true, valid_bd = true} +// CHECK: amdaie.npu.address_patch {arg_idx = 0 : ui32, bd_id = 0 : ui32, col = 0 : ui32, offset = 0 : ui32} + amdaie.npu.half_dma_cpy_nd %4(%5[] [] [] bd_id = %bd_id channel = %channel next_bd = %bd_id_1 start_bd = %bd_id) : !amdaie.logicalobjectfifo> +// CHECK: amdaie.npu.write_bd {bd_id = 1 : ui32, buffer_length = 2048 : ui32, buffer_offset = 0 : ui32, col = 0 : ui32, enable_packet = false, iteration_current = 0 : ui32, iteration_size = 0 : ui32, iteration_stride = 0 : ui32, lock_acq_enable = false, lock_acq_id = 0 : ui32, lock_acq_val = 0 : i32, lock_rel_id = 0 : ui32, lock_rel_val = 0 : i32, next_bd = 2 : ui32, out_of_order_id = 0 : ui32, packet_id = 0 : ui32, packet_type = 0 : ui32, paddings_after = array, paddings_before = array, row = 0 : ui32, sizes = array, strides = array, use_next_bd = true, valid_bd = true} +// CHECK: amdaie.npu.address_patch {arg_idx = 0 : ui32, bd_id = 1 : ui32, col = 0 : ui32, offset = 0 : ui32} + amdaie.npu.half_dma_cpy_nd async %4(%5[0] [2048] [1] bd_id = %bd_id_1 channel = %channel next_bd = %bd_id_2 start_bd = %bd_id) : !amdaie.logicalobjectfifo> +// CHECK: amdaie.npu.write_bd {bd_id = 2 : ui32, buffer_length = 1024 : ui32, buffer_offset = 0 : ui32, col = 0 : ui32, enable_packet = false, iteration_current = 0 : ui32, iteration_size = 0 : ui32, iteration_stride = 0 : ui32, lock_acq_enable = false, lock_acq_id = 0 : ui32, lock_acq_val = 0 
: i32, lock_rel_id = 0 : ui32, lock_rel_val = 0 : i32, next_bd = 0 : ui32, out_of_order_id = 0 : ui32, packet_id = 0 : ui32, packet_type = 0 : ui32, paddings_after = array, paddings_before = array, row = 0 : ui32, sizes = array, strides = array, use_next_bd = false, valid_bd = true} +// CHECK: amdaie.npu.address_patch {arg_idx = 0 : ui32, bd_id = 2 : ui32, col = 0 : ui32, offset = 0 : ui32} +// CHECK: %[[TOKEN_0:.+]] = amdaie.npu.push_to_queue async {bd_id = 0 : ui32, channel = 0 : ui32, col = 0 : ui32, direction = 1 : i32, repeat_count = 2 : ui32, row = 0 : ui32} +// CHECK: amdaie.npu.dma_wait(%[[TOKEN_0]] : !amdaie.async_token) + %6 = amdaie.npu.half_dma_cpy_nd async %4(%5[0, 0, 0, 0] [2, 4, 16, 16] [0, 64, 8, 1] bd_id = %bd_id_2 channel = %channel start_bd = %bd_id) : !amdaie.logicalobjectfifo> + amdaie.npu.dma_wait(%6 : !amdaie.async_token) + amdaie.end + } + } + return + } +} diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/insert_dma_bd_chain.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/insert_dma_bd_chain.mlir new file mode 100644 index 000000000..b3e85ab1f --- /dev/null +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/insert_dma_bd_chain.mlir @@ -0,0 +1,284 @@ +// RUN: iree-opt --pass-pipeline="builtin.module(iree-amdaie-insert-dma-bd-chain)" --split-input-file --verify-diagnostics %s | FileCheck %s + +// Expect a single DMA BD chain, containing the IDs: [0, 1]. 
+// CHECK-LABEL: @single_bd_chain
+// CHECK: %[[CHANNEL:.+]] = amdaie.channel
+// CHECK: %[[CONNECTION:.+]] = amdaie.connection
+// CHECK: amdaie.controlcode
+// CHECK: %[[BD_ID_0:.+]] = amdaie.bd_id
+// CHECK: %[[BD_ID_1:.+]] = amdaie.bd_id
+// CHECK: %[[OBJECT_FIFO:.+]] = amdaie.logicalobjectfifo.from_memref
+// CHECK: amdaie.npu.half_dma_cpy_nd %[[CONNECTION]](%[[OBJECT_FIFO]] [] [] [] bd_id = %[[BD_ID_0]] channel = %[[CHANNEL]] next_bd = %[[BD_ID_1]] start_bd = %[[BD_ID_0]])
+// CHECK: %[[TOKEN_0:.+]] = amdaie.npu.half_dma_cpy_nd async %[[CONNECTION]](%[[OBJECT_FIFO]] [] [] [] bd_id = %[[BD_ID_1]] channel = %[[CHANNEL]] start_bd = %[[BD_ID_0]])
+// CHECK: amdaie.npu.dma_wait(%[[TOKEN_0]] : !amdaie.async_token)
+#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}>
+#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>], flags = Indirect>
+module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} {
+  func.func @single_bd_chain() {
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    amdaie.workgroup {
+      %tile = amdaie.tile(%c0, %c0)
+      %tile_0 = amdaie.tile(%c0, %c1)
+      %buffer = amdaie.buffer(%tile_0) : memref<1024xbf16, 1 : i32>
+      %buffer_2 = amdaie.buffer(%tile_0) : memref<1024xbf16, 1 : i32>
+      %lock = amdaie.lock(%tile_0(0), 0)
+      %lock_3 = amdaie.lock(%tile_0(1), 0)
+      %channel = amdaie.channel(%tile, 0, port_type = DMA, direction = MM2S)
+      %channel_4 = amdaie.channel(%tile_0, 0, port_type = DMA, direction = S2MM)
+      %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<512x512xbf16>
+      %1 = amdaie.logicalobjectfifo.from_buffers({%buffer, %buffer_2}, {%lock}, {%lock_3}) : memref<1024xbf16, 1 : i32>, memref<1024xbf16, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1024xbf16, 1 : i32>, 2>
+      %2 = amdaie.logicalobjectfifo.placeholder{%tile} : !amdaie.logicalobjectfifo<memref<512x512xbf16>>
+      %3 = amdaie.flow({%channel} -> {%channel_4}) {is_packet_flow = false}
+      %4 = amdaie.connection(%1 {%channel_4}, %2 {%channel}, flow = %3) {connection_type = #amdaie<connection_type Circuit>} : (!amdaie.logicalobjectfifo<memref<1024xbf16, 1 : i32>, 2>, !amdaie.logicalobjectfifo<memref<512x512xbf16>>)
+      amdaie.controlcode {
+        memref.assume_alignment %0, 64 : memref<512x512xbf16>
+        %5 = amdaie.logicalobjectfifo.from_memref %0, {%tile} : memref<512x512xbf16> -> !amdaie.logicalobjectfifo<memref<512x512xbf16>>
+        %bd_id = amdaie.bd_id(%tile, %c0)
+        %6 = amdaie.npu.half_dma_cpy_nd async %4(%5 [] [] [] bd_id = %bd_id channel = %channel start_bd = %bd_id) : !amdaie.logicalobjectfifo<memref<512x512xbf16>>
+        amdaie.npu.dma_wait(%6 : !amdaie.async_token)
+        %bd_id_1 = amdaie.bd_id(%tile, %c1)
+        %7 = amdaie.npu.half_dma_cpy_nd async %4(%5 [] [] [] bd_id = %bd_id_1 channel = %channel start_bd = %bd_id_1) : !amdaie.logicalobjectfifo<memref<512x512xbf16>>
+        amdaie.npu.dma_wait(%7 : !amdaie.async_token)
+        amdaie.end
+      }
+    }
+    return
+  }
+}
+
+// -----
+
+// Expect no chaining to happen when repeat_count > 1.
+// CHECK-LABEL: @no_bd_chain_repeat_count
+// CHECK: %[[CHANNEL:.+]] = amdaie.channel
+// CHECK: %[[CONNECTION:.+]] = amdaie.connection
+// CHECK: amdaie.controlcode
+// CHECK: %[[BD_ID_0:.+]] = amdaie.bd_id
+// CHECK: %[[BD_ID_1:.+]] = amdaie.bd_id
+// CHECK: %[[OBJECT_FIFO:.+]] = amdaie.logicalobjectfifo.from_memref
+// CHECK: %[[TOKEN_0:.+]] = amdaie.npu.half_dma_cpy_nd async %[[CONNECTION]](%[[OBJECT_FIFO]] [0, 0] [2, 1] [0, 1] bd_id = %[[BD_ID_0]] channel = %[[CHANNEL]] start_bd = %[[BD_ID_0]])
+// CHECK: amdaie.npu.dma_wait(%[[TOKEN_0]] : !amdaie.async_token)
+// CHECK: %[[TOKEN_1:.+]] = amdaie.npu.half_dma_cpy_nd async %[[CONNECTION]](%[[OBJECT_FIFO]] [0, 0] [2, 1] [0, 1] bd_id = %[[BD_ID_1]] channel = %[[CHANNEL]] start_bd = %[[BD_ID_1]])
+// CHECK: amdaie.npu.dma_wait(%[[TOKEN_1]] : !amdaie.async_token)
+#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}>
+#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>], flags = Indirect>
+module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} {
+  func.func @no_bd_chain_repeat_count() {
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    amdaie.workgroup {
+      %tile = amdaie.tile(%c0, %c0)
+      %tile_0 = amdaie.tile(%c0, %c1)
+      %buffer = amdaie.buffer(%tile_0) : memref<1024xbf16, 1 : i32>
+      %buffer_2 = amdaie.buffer(%tile_0) : memref<1024xbf16, 1 : i32>
+      %lock = amdaie.lock(%tile_0(0), 0)
+      %lock_3 = amdaie.lock(%tile_0(1), 0)
+      %channel = amdaie.channel(%tile, 0, port_type = DMA, direction = MM2S)
+      %channel_4 = amdaie.channel(%tile_0, 0, port_type = DMA, direction = S2MM)
+      %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<512x512xbf16>
+      %1 = amdaie.logicalobjectfifo.from_buffers({%buffer, %buffer_2}, {%lock}, {%lock_3}) : memref<1024xbf16, 1 : i32>, memref<1024xbf16, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1024xbf16, 1 : i32>, 2>
+      %2 = amdaie.logicalobjectfifo.placeholder{%tile} : !amdaie.logicalobjectfifo<memref<512x512xbf16>>
+      %3 = amdaie.flow({%channel} -> {%channel_4}) {is_packet_flow = false}
+      %4 = amdaie.connection(%1 {%channel_4}, %2 {%channel}, flow = %3) {connection_type = #amdaie<connection_type Circuit>} : (!amdaie.logicalobjectfifo<memref<1024xbf16, 1 : i32>, 2>, !amdaie.logicalobjectfifo<memref<512x512xbf16>>)
+      amdaie.controlcode {
+        memref.assume_alignment %0, 64 : memref<512x512xbf16>
+        %5 = amdaie.logicalobjectfifo.from_memref %0, {%tile} : memref<512x512xbf16> -> !amdaie.logicalobjectfifo<memref<512x512xbf16>>
+        %bd_id = amdaie.bd_id(%tile, %c0)
+        %6 = amdaie.npu.half_dma_cpy_nd async %4(%5 [0, 0] [2, 1] [0, 1] bd_id = %bd_id channel = %channel start_bd = %bd_id) : !amdaie.logicalobjectfifo<memref<512x512xbf16>>
+        amdaie.npu.dma_wait(%6 : !amdaie.async_token)
+        %bd_id_1 = amdaie.bd_id(%tile, %c1)
+        %7 = amdaie.npu.half_dma_cpy_nd async %4(%5 [0, 0] [2, 1] [0, 1] bd_id = %bd_id_1 channel = %channel start_bd = %bd_id_1) : !amdaie.logicalobjectfifo<memref<512x512xbf16>>
+        amdaie.npu.dma_wait(%7 : !amdaie.async_token)
+        amdaie.end
+      }
+    }
+    return
+  }
+}
+
+// -----
+
+// Expect the test to fail, as the controlcode loop is not unrolled.
+#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}>
+#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>], flags = Indirect>
+module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} {
+  func.func @error_different_scopes() {
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %c8 = arith.constant 8 : index
+    amdaie.workgroup {
+      %tile = amdaie.tile(%c0, %c0)
+      %tile_0 = amdaie.tile(%c0, %c1)
+      %buffer = amdaie.buffer(%tile_0) : memref<1024xbf16, 1 : i32>
+      %buffer_2 = amdaie.buffer(%tile_0) : memref<1024xbf16, 1 : i32>
+      %lock = amdaie.lock(%tile_0(0), 0)
+      %lock_3 = amdaie.lock(%tile_0(1), 0)
+      %channel = amdaie.channel(%tile, 0, port_type = DMA, direction = MM2S)
+      %channel_4 = amdaie.channel(%tile_0, 0, port_type = DMA, direction = S2MM)
+      %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<512x512xbf16>
+      %1 = amdaie.logicalobjectfifo.from_buffers({%buffer, %buffer_2}, {%lock}, {%lock_3}) : memref<1024xbf16, 1 : i32>, memref<1024xbf16, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1024xbf16, 1 : i32>, 2>
+      %2 = amdaie.logicalobjectfifo.placeholder{%tile} : !amdaie.logicalobjectfifo<memref<512x512xbf16>>
+      %3 = amdaie.flow({%channel} -> {%channel_4}) {is_packet_flow = false}
+      %4 = amdaie.connection(%1 {%channel_4}, %2 {%channel}, flow = %3) {connection_type = #amdaie<connection_type Circuit>} : (!amdaie.logicalobjectfifo<memref<1024xbf16, 1 : i32>, 2>, !amdaie.logicalobjectfifo<memref<512x512xbf16>>)
+      amdaie.controlcode {
+        memref.assume_alignment %0, 64 : memref<512x512xbf16>
+        %5 = amdaie.logicalobjectfifo.from_memref %0, {%tile} : memref<512x512xbf16> -> !amdaie.logicalobjectfifo<memref<512x512xbf16>>
+        %bd_id = amdaie.bd_id(%tile, %c0)
+        %6 = amdaie.npu.half_dma_cpy_nd async %4(%5 [] [] [] bd_id = %bd_id channel = %channel start_bd = %bd_id) : !amdaie.logicalobjectfifo<memref<512x512xbf16>>
+        amdaie.npu.dma_wait(%6 : !amdaie.async_token)
+        scf.for %i = %c0 to %c1 step %c8 {
+          %bd_id_1 = amdaie.bd_id(%tile, %c1)
+          // expected-error @+1 {{DMA operations to be chained must belong to the same scope}}
+          %7 = amdaie.npu.half_dma_cpy_nd async %4(%5 [] [] [] bd_id = %bd_id_1 channel = %channel start_bd = %bd_id_1) : !amdaie.logicalobjectfifo<memref<512x512xbf16>>
+          amdaie.npu.dma_wait(%7 : !amdaie.async_token)
+        }
+        amdaie.end
+      }
+    }
+    return
+  }
+}
+
+// -----
+
+// Expect two BD ID chains, as the chain breaks whenever a duplicate BD ID occurs.
+// The first chain: [0, 1, 2]. The second chain: [1, 2].
+// CHECK-LABEL: @duplicate_bd_id
+// CHECK: %[[CHANNEL:.+]] = amdaie.channel
+// CHECK: %[[CONNECTION:.+]] = amdaie.connection
+// CHECK: amdaie.controlcode
+// CHECK: %[[BD_ID_0:.+]] = amdaie.bd_id
+// CHECK: %[[BD_ID_1:.+]] = amdaie.bd_id
+// CHECK: %[[BD_ID_2:.+]] = amdaie.bd_id
+// CHECK: %[[OBJECT_FIFO:.+]] = amdaie.logicalobjectfifo.from_memref
+// CHECK: amdaie.npu.half_dma_cpy_nd %[[CONNECTION]](%[[OBJECT_FIFO]] [] [] [] bd_id = %[[BD_ID_0]] channel = %[[CHANNEL]] next_bd = %[[BD_ID_1]] start_bd = %[[BD_ID_0]])
+// CHECK: amdaie.npu.half_dma_cpy_nd %[[CONNECTION]](%[[OBJECT_FIFO]] [] [] [] bd_id = %[[BD_ID_1]] channel = %[[CHANNEL]] next_bd = %[[BD_ID_2]] start_bd = %[[BD_ID_0]])
+// CHECK: %[[TOKEN_0:.+]] = amdaie.npu.half_dma_cpy_nd async %[[CONNECTION]](%[[OBJECT_FIFO]] [] [] [] bd_id = %[[BD_ID_2]] channel = %[[CHANNEL]] start_bd = %[[BD_ID_0]])
+// CHECK: amdaie.npu.dma_wait(%[[TOKEN_0]] : !amdaie.async_token)
+// CHECK: amdaie.npu.half_dma_cpy_nd %[[CONNECTION]](%[[OBJECT_FIFO]] [] [] [] bd_id = %[[BD_ID_1]] channel = %[[CHANNEL]] next_bd = %[[BD_ID_2]] start_bd = %[[BD_ID_1]])
+// CHECK: %[[TOKEN_1:.+]] = amdaie.npu.half_dma_cpy_nd async %[[CONNECTION]](%[[OBJECT_FIFO]] [] [] [] bd_id = %[[BD_ID_2]] channel = %[[CHANNEL]] start_bd = %[[BD_ID_1]])
+// CHECK: amdaie.npu.dma_wait(%[[TOKEN_1]] : !amdaie.async_token)
+#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}>
+#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>], flags = Indirect>
+module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} {
+  func.func @duplicate_bd_id() {
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %c2 = arith.constant 2 : index
+    amdaie.workgroup {
+      %tile = amdaie.tile(%c0, %c0)
+      %tile_0 = amdaie.tile(%c0, %c1)
+      %buffer = amdaie.buffer(%tile_0) : memref<1024xbf16, 1 : i32>
+      %buffer_2 = amdaie.buffer(%tile_0) : memref<1024xbf16, 1 : i32>
+      %lock = amdaie.lock(%tile_0(0), 0)
+      %lock_3 = amdaie.lock(%tile_0(1), 0)
+      %channel = amdaie.channel(%tile, 0, port_type = DMA, direction = MM2S)
+      %channel_4 = amdaie.channel(%tile_0, 0, port_type = DMA, direction = S2MM)
+      %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<512x512xbf16>
+      %1 = amdaie.logicalobjectfifo.from_buffers({%buffer, %buffer_2}, {%lock}, {%lock_3}) : memref<1024xbf16, 1 : i32>, memref<1024xbf16, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1024xbf16, 1 : i32>, 2>
+      %2 = amdaie.logicalobjectfifo.placeholder{%tile} : !amdaie.logicalobjectfifo<memref<512x512xbf16>>
+      %3 = amdaie.flow({%channel} -> {%channel_4}) {is_packet_flow = false}
+      %4 = amdaie.connection(%1 {%channel_4}, %2 {%channel}, flow = %3) {connection_type = #amdaie<connection_type Circuit>} : (!amdaie.logicalobjectfifo<memref<1024xbf16, 1 : i32>, 2>, !amdaie.logicalobjectfifo<memref<512x512xbf16>>)
+      amdaie.controlcode {
+        memref.assume_alignment %0, 64 : memref<512x512xbf16>
+        %5 = amdaie.logicalobjectfifo.from_memref %0, {%tile} : memref<512x512xbf16> -> !amdaie.logicalobjectfifo<memref<512x512xbf16>>
+        %bd_id = amdaie.bd_id(%tile, %c0)
+        %6 = amdaie.npu.half_dma_cpy_nd async %4(%5 [] [] [] bd_id = %bd_id channel = %channel start_bd = %bd_id) : !amdaie.logicalobjectfifo<memref<512x512xbf16>>
+        amdaie.npu.dma_wait(%6 : !amdaie.async_token)
+        %bd_id_1 = amdaie.bd_id(%tile, %c1)
+        %7 = amdaie.npu.half_dma_cpy_nd async %4(%5 [] [] [] bd_id = %bd_id_1 channel = %channel start_bd = %bd_id_1) : !amdaie.logicalobjectfifo<memref<512x512xbf16>>
+        amdaie.npu.dma_wait(%7 : !amdaie.async_token)
+        %bd_id_2 = amdaie.bd_id(%tile, %c2)
+        %8 = amdaie.npu.half_dma_cpy_nd async %4(%5 [] [] [] bd_id = %bd_id_2 channel = %channel start_bd = %bd_id_2) : !amdaie.logicalobjectfifo<memref<512x512xbf16>>
+        amdaie.npu.dma_wait(%8 : !amdaie.async_token)
+        %9 = amdaie.npu.half_dma_cpy_nd async %4(%5 [] [] [] bd_id = %bd_id_1 channel = %channel start_bd = %bd_id_1) : !amdaie.logicalobjectfifo<memref<512x512xbf16>>
+        amdaie.npu.dma_wait(%9 : !amdaie.async_token)
+        %10 = amdaie.npu.half_dma_cpy_nd async %4(%5 [] [] [] bd_id = %bd_id_2 channel = %channel start_bd = %bd_id_2) : !amdaie.logicalobjectfifo<memref<512x512xbf16>>
+        amdaie.npu.dma_wait(%10 : !amdaie.async_token)
+        amdaie.end
+      }
+    }
+    return
+  }
+}
+
+// -----
+
+// Expect two DMA BD chains interleaved, as they belong to different connections.
+// One chain contains the IDs: [0, 2], the other chain contains: [1, 3].
+// CHECK-LABEL: @two_connections
+// CHECK: %[[CHANNEL_0:.+]] = amdaie.channel
+// CHECK: %[[CHANNEL_1:.+]] = amdaie.channel
+// CHECK: %[[CHANNEL_2:.+]] = amdaie.channel
+// CHECK: %[[CHANNEL_3:.+]] = amdaie.channel
+// CHECK: %[[CONNECTION_0:.+]] = amdaie.connection
+// CHECK: %[[CONNECTION_1:.+]] = amdaie.connection
+// CHECK: amdaie.controlcode
+// CHECK: %[[BD_ID_0:.+]] = amdaie.bd_id
+// CHECK: %[[BD_ID_1:.+]] = amdaie.bd_id
+// CHECK: %[[BD_ID_2:.+]] = amdaie.bd_id
+// CHECK: %[[BD_ID_3:.+]] = amdaie.bd_id
+// CHECK: %[[OBJECT_FIFO_0:.+]] = amdaie.logicalobjectfifo.from_memref
+// CHECK: %[[OBJECT_FIFO_1:.+]] = amdaie.logicalobjectfifo.from_memref
+// CHECK: amdaie.npu.half_dma_cpy_nd %[[CONNECTION_0]](%[[OBJECT_FIFO_0]] [] [] [] bd_id = %[[BD_ID_0]] channel = %[[CHANNEL_0]] next_bd = %[[BD_ID_2]] start_bd = %[[BD_ID_0]])
+// CHECK: amdaie.npu.half_dma_cpy_nd %[[CONNECTION_1]](%[[OBJECT_FIFO_1]] [] [] [] bd_id = %[[BD_ID_1]] channel = %[[CHANNEL_2]] next_bd = %[[BD_ID_3]] start_bd = %[[BD_ID_1]])
+// CHECK: %[[TOKEN_0:.+]] = amdaie.npu.half_dma_cpy_nd async %[[CONNECTION_0]](%[[OBJECT_FIFO_0]] [] [] [] bd_id = %[[BD_ID_2]] channel = %[[CHANNEL_0]] start_bd = %[[BD_ID_0]])
+// CHECK: %[[TOKEN_1:.+]] = amdaie.npu.half_dma_cpy_nd async %[[CONNECTION_1]](%[[OBJECT_FIFO_1]] [] [] [] bd_id = %[[BD_ID_3]] channel = %[[CHANNEL_2]] start_bd = %[[BD_ID_1]])
+// CHECK: amdaie.npu.dma_wait(%[[TOKEN_0]] : !amdaie.async_token)
+// CHECK: amdaie.npu.dma_wait(%[[TOKEN_1]] : !amdaie.async_token)
+#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}>
+#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>], flags = Indirect>
+module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} {
+  func.func @two_connections() {
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %c2 = arith.constant 2 : index
+    %c3 = arith.constant 3 : index
+    amdaie.workgroup {
+      %tile = amdaie.tile(%c0, %c0)
+      %tile_0 = amdaie.tile(%c0, %c1)
+      %buffer = amdaie.buffer(%tile_0) : memref<1024xbf16, 1 : i32>
+      %buffer_4 = amdaie.buffer(%tile_0) : memref<1024xbf16, 1 : i32>
+      %buffer_5 = amdaie.buffer(%tile_0) : memref<1024xbf16, 1 : i32>
+      %buffer_6 = amdaie.buffer(%tile_0) : memref<1024xbf16, 1 : i32>
+      %lock = amdaie.lock(%tile_0(0), 0)
+      %lock_7 = amdaie.lock(%tile_0(1), 0)
+      %lock_8 = amdaie.lock(%tile_0(2), 0)
+      %lock_9 = amdaie.lock(%tile_0(3), 0)
+      %channel = amdaie.channel(%tile, 0, port_type = DMA, direction = MM2S)
+      %channel_10 = amdaie.channel(%tile_0, 0, port_type = DMA, direction = S2MM)
+      %channel_11 = amdaie.channel(%tile, 1, port_type = DMA, direction = MM2S)
+      %channel_12 = amdaie.channel(%tile_0, 1, port_type = DMA, direction = S2MM)
+      %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<512x512xbf16>
+      %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<512x512xbf16>
+      %2 = amdaie.logicalobjectfifo.from_buffers({%buffer, %buffer_4}, {%lock}, {%lock_7}) : memref<1024xbf16, 1 : i32>, memref<1024xbf16, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1024xbf16, 1 : i32>, 2>
+      %3 = amdaie.logicalobjectfifo.placeholder{%tile} : !amdaie.logicalobjectfifo<memref<512x512xbf16>>
+      %4 = amdaie.flow({%channel} -> {%channel_10}) {is_packet_flow = false}
+      %5 = amdaie.connection(%2 {%channel_10}, %3 {%channel}, flow = %4) {connection_type = #amdaie<connection_type Circuit>} : (!amdaie.logicalobjectfifo<memref<1024xbf16, 1 : i32>, 2>, !amdaie.logicalobjectfifo<memref<512x512xbf16>>)
+      %6 = amdaie.logicalobjectfifo.from_buffers({%buffer_5, %buffer_6}, {%lock_8}, {%lock_9}) : memref<1024xbf16, 1 : i32>, memref<1024xbf16, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1024xbf16, 1 : i32>, 2>
+      %7 = amdaie.logicalobjectfifo.placeholder{%tile} : !amdaie.logicalobjectfifo<memref<512x512xbf16>>
+      %8 = amdaie.flow({%channel_11} -> {%channel_12}) {is_packet_flow = false}
+      %9 = amdaie.connection(%6 {%channel_12}, %7 {%channel_11}, flow = %8) {connection_type = #amdaie<connection_type Circuit>} : (!amdaie.logicalobjectfifo<memref<1024xbf16, 1 : i32>, 2>, !amdaie.logicalobjectfifo<memref<512x512xbf16>>)
+      amdaie.controlcode {
+        memref.assume_alignment %0, 64 : memref<512x512xbf16>
+        %10 = amdaie.logicalobjectfifo.from_memref %0, {%tile} : memref<512x512xbf16> -> !amdaie.logicalobjectfifo<memref<512x512xbf16>>
+        %11 = amdaie.logicalobjectfifo.from_memref %1, {%tile} : memref<512x512xbf16> -> !amdaie.logicalobjectfifo<memref<512x512xbf16>>
+        %bd_id = amdaie.bd_id(%tile, %c0)
+        %12 = amdaie.npu.half_dma_cpy_nd async %5(%10 [] [] [] bd_id = %bd_id channel = %channel start_bd = %bd_id) : !amdaie.logicalobjectfifo<memref<512x512xbf16>>
+        %bd_id_1 = amdaie.bd_id(%tile, %c1)
+        %13 = amdaie.npu.half_dma_cpy_nd async %9(%11 [] [] [] bd_id = %bd_id_1 channel = %channel_11 start_bd = %bd_id_1) : !amdaie.logicalobjectfifo<memref<512x512xbf16>>
+        amdaie.npu.dma_wait(%12 : !amdaie.async_token)
+        amdaie.npu.dma_wait(%13 : !amdaie.async_token)
+        %bd_id_2 = amdaie.bd_id(%tile, %c2)
+        %14 = amdaie.npu.half_dma_cpy_nd async %5(%10 [] [] [] bd_id = %bd_id_2 channel = %channel start_bd = %bd_id_2) : !amdaie.logicalobjectfifo<memref<512x512xbf16>>
+        %bd_id_3 = amdaie.bd_id(%tile, %c3)
+        %15 = amdaie.npu.half_dma_cpy_nd async %9(%11 [] [] [] bd_id = %bd_id_3 channel = %channel_11 start_bd = %bd_id_3) : !amdaie.logicalobjectfifo<memref<512x512xbf16>>
+        amdaie.npu.dma_wait(%14 : !amdaie.async_token)
+        amdaie.npu.dma_wait(%15 : !amdaie.async_token)
+        amdaie.end
+      }
+    }
+    return
+  }
+}