diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.cpp index be241be1c..ac0cfb3cc 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.cpp @@ -1101,7 +1101,7 @@ void NpuHalfDmaCpyNdOp::build(OpBuilder &b, OperationState &result, Value input, ArrayRef offsets, ArrayRef sizes, ArrayRef strides, Value bdId, - Value channel, bool useNextBd, Value nextBd, + Value channel, BoolAttr useNextBd, Value nextBd, Value startBd) { SmallVector staticOffsets, staticSizes, staticStrides; SmallVector dynamicOffsets, dynamicSizes, dynamicStrides; @@ -1119,7 +1119,7 @@ void NpuHalfDmaCpyNdOp::build(OpBuilder &b, OperationState &result, Value input, ArrayRef offsets, ArrayRef sizes, ArrayRef strides, mlir::Value bdId, - Value channel, bool useNextBd, Value nextBd, + Value channel, BoolAttr useNextBd, Value nextBd, Value startBd) { SmallVector offsetValues = llvm::to_vector<4>(llvm::map_range( offsets, @@ -1140,7 +1140,7 @@ void NpuHalfDmaCpyNdOp::build(OpBuilder &b, OperationState &result, TypeRange resultTypes, Value connection, Value input, ValueRange offsets, ValueRange sizes, ValueRange strides, mlir::Value bdId, - Value channel, bool useNextBd, Value nextBd, + Value channel, BoolAttr useNextBd, Value nextBd, Value startBd) { SmallVector offsetValues = llvm::to_vector<4>( llvm::map_range(offsets, [](Value v) -> OpFoldResult { return v; })); diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.td b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.td index d75bd8341..5e26c9a68 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.td +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.td @@ -595,10 +595,15 @@ def AMDAIE_NpuHalfDmaCpyNdOp `next_bd`, and `start_bd` operands. 
The `use_next_bd` operand indicates whether another DMA operation is chained to follow this one. If `use_next_bd` is `true`, the `next_bd` operand specifies the BD ID of - the next DMA operation in the chain. Within a chain, the `start_bd` operand - identifies the BD ID of the first DMA operation in the sequence. - When `use_next_bd` is `false`, the `start_bd` is set to the same value as `bd_id`. - + the next DMA operation in the chain. + + The `start_bd` operand specifies the BD ID of the first DMA operation in a sequence. + - If `start_bd` is the same as `bd_id`, it marks the start of a chain. + - If `start_bd` differs from `bd_id` and `use_next_bd` is `true`, it represents + an intermediate operation in the chain. + - If `start_bd` differs from `bd_id` and `use_next_bd` is `false`, it represents + the end of the chain. + Example: ```mlir @@ -629,7 +634,7 @@ def AMDAIE_NpuHalfDmaCpyNdOp DenseI64ArrayAttr:$static_strides, Optional:$bd_id, Optional:$channel, - BoolAttr:$use_next_bd, + OptionalAttr:$use_next_bd, Optional:$next_bd, Optional:$start_bd ); @@ -646,7 +651,7 @@ def AMDAIE_NpuHalfDmaCpyNdOp custom($strides, $static_strides) (`bd_id` `=` $bd_id^)? (`channel` `=` $channel^)? - `use_next_bd` `=` $use_next_bd + (`use_next_bd` `=` $use_next_bd^)? (`next_bd` `=` $next_bd^)? (`start_bd` `=` $start_bd^)? `)` @@ -660,18 +665,18 @@ def AMDAIE_NpuHalfDmaCpyNdOp "::mlir::Value":$input, "ArrayRef":$offsets, "ArrayRef":$sizes, "ArrayRef":$strides, "::mlir::Value":$bd_id, "::mlir::Value":$channel, - "bool":$use_next_bd, "::mlir::Value":$next_bd, "::mlir::Value":$start_bd)>, + "::mlir::BoolAttr":$use_next_bd, "::mlir::Value":$next_bd, "::mlir::Value":$start_bd)>, // Build a NpuHalfDmaCpyNdOp with static entries. 
OpBuilder<(ins "::mlir::TypeRange":$result_types, "Value":$connection, "::mlir::Value":$target, "ArrayRef":$offsets, "ArrayRef":$sizes, "ArrayRef":$strides, "::mlir::Value":$bd_id, "::mlir::Value":$channel, - "bool":$use_next_bd, "::mlir::Value":$next_bd, "::mlir::Value":$start_bd)>, + "::mlir::BoolAttr":$use_next_bd, "::mlir::Value":$next_bd, "::mlir::Value":$start_bd)>, // Build a NpuHalfDmaCpyNdOp with dynamic entries. OpBuilder<(ins "::mlir::TypeRange":$result_types, "Value":$connection, "::mlir::Value":$input, "ValueRange":$offsets, "ValueRange":$sizes, "ValueRange":$strides, "::mlir::Value":$bd_id, "::mlir::Value":$channel, - "bool":$use_next_bd, "::mlir::Value":$next_bd, "::mlir::Value":$start_bd)>, + "::mlir::BoolAttr":$use_next_bd, "::mlir::Value":$next_bd, "::mlir::Value":$start_bd)>, ]; let extraClassDeclaration = [{ @@ -687,6 +692,7 @@ def AMDAIE_NpuHalfDmaCpyNdOp } std::optional getBdIdOp() { + if (!getBdId()) return std::nullopt; return dyn_cast_if_present(getBdId().getDefiningOp()); } diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/test/roundtrip.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/test/roundtrip.mlir index 059bccaab..1e9d82ec5 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/test/roundtrip.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/test/roundtrip.mlir @@ -397,6 +397,7 @@ func.func @npu_dma_cpy_nd_all_operands(%arg0: !amdaie.logicalobjectfifo>, %arg1: !amdaie.logicalobjectfifo>) { @@ -404,18 +405,21 @@ func.func @npu_half_dma_cpy_nd(%arg0: !amdaie.logicalobjectfifo %c1 = arith.constant 1 : index %tile_0_0 = amdaie.tile(%c0, %c0) %bd_id = amdaie.bd_id(%tile_0_0, %c0) + %bd_id_1 = amdaie.bd_id(%tile_0_0, %c1) %channel = amdaie.channel(%tile_0_0, 0, port_type = DMA, direction = S2MM) %0 = amdaie.connection(%arg0, %arg1) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) -// CHECK: amdaie.npu.half_dma_cpy_nd %[[CONNECTION_0]](%[[ARG0]] [] [] [] use_next_bd = false) : 
!amdaie.logicalobjectfifo> - amdaie.npu.half_dma_cpy_nd %0(%arg0[] [] [] use_next_bd = false) : !amdaie.logicalobjectfifo> -// CHECK: %{{.+}} = amdaie.npu.half_dma_cpy_nd async %[[CONNECTION_0]](%[[ARG0]] [] [] [] use_next_bd = false) : !amdaie.logicalobjectfifo> - amdaie.npu.half_dma_cpy_nd async %0(%arg0[] [] [] use_next_bd = false) : !amdaie.logicalobjectfifo> -// CHECK: amdaie.npu.half_dma_cpy_nd %[[CONNECTION_0]](%[[ARG0]] [0] [1024] [1] bd_id = %[[BD_ID]] use_next_bd = false) : !amdaie.logicalobjectfifo> - amdaie.npu.half_dma_cpy_nd %0(%arg0[0] [1024] [1] bd_id = %bd_id use_next_bd = false) : !amdaie.logicalobjectfifo> -// CHECK: amdaie.npu.half_dma_cpy_nd %[[CONNECTION_0]](%[[ARG0]] [%[[C0]], 0] [%[[C0]], 64] [%[[C0]], 1] channel = %[[CHANNEL]] use_next_bd = false) : !amdaie.logicalobjectfifo> - amdaie.npu.half_dma_cpy_nd %0(%arg0[%c0, 0] [%c0, 64] [%c0, 1] channel = %channel use_next_bd = false) : !amdaie.logicalobjectfifo> +// CHECK: amdaie.npu.half_dma_cpy_nd %[[CONNECTION_0]](%[[ARG0]] [] [] []) : !amdaie.logicalobjectfifo> + amdaie.npu.half_dma_cpy_nd %0(%arg0[] [] []) : !amdaie.logicalobjectfifo> +// CHECK: %{{.+}} = amdaie.npu.half_dma_cpy_nd async %[[CONNECTION_0]](%[[ARG0]] [] [] []) : !amdaie.logicalobjectfifo> + amdaie.npu.half_dma_cpy_nd async %0(%arg0[] [] []) : !amdaie.logicalobjectfifo> +// CHECK: amdaie.npu.half_dma_cpy_nd %[[CONNECTION_0]](%[[ARG0]] [0] [1024] [1] bd_id = %[[BD_ID]]) : !amdaie.logicalobjectfifo> + amdaie.npu.half_dma_cpy_nd %0(%arg0[0] [1024] [1] bd_id = %bd_id) : !amdaie.logicalobjectfifo> +// CHECK: amdaie.npu.half_dma_cpy_nd %[[CONNECTION_0]](%[[ARG0]] [%[[C0]], 0] [%[[C0]], 64] [%[[C0]], 1] channel = %[[CHANNEL]]) : !amdaie.logicalobjectfifo> + amdaie.npu.half_dma_cpy_nd %0(%arg0[%c0, 0] [%c0, 64] [%c0, 1] channel = %channel) : !amdaie.logicalobjectfifo> // CHECK: amdaie.npu.half_dma_cpy_nd %[[CONNECTION_0]](%[[ARG0]] [] [] [] bd_id = %[[BD_ID]] channel = %[[CHANNEL]] use_next_bd = false) : !amdaie.logicalobjectfifo> 
amdaie.npu.half_dma_cpy_nd %0(%arg0[] [] [] bd_id = %bd_id channel = %channel use_next_bd = false) : !amdaie.logicalobjectfifo> +// CHECK: amdaie.npu.half_dma_cpy_nd %[[CONNECTION_0]](%[[ARG0]] [] [] [] bd_id = %[[BD_ID]] channel = %[[CHANNEL]] use_next_bd = true next_bd = %[[BD_ID_1]] start_bd = %[[BD_ID]]) : !amdaie.logicalobjectfifo> + amdaie.npu.half_dma_cpy_nd %0(%arg0[] [] [] bd_id = %bd_id channel = %channel use_next_bd = true next_bd = %bd_id_1 start_bd = %bd_id) : !amdaie.logicalobjectfifo> return } diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEControlCodeLowering.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEControlCodeLowering.cpp index 6c25389a6..e55d413fd 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEControlCodeLowering.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEControlCodeLowering.cpp @@ -109,7 +109,7 @@ struct HalfDmaCpyNdToNpuConverter final staticStrides.insert(staticStrides.begin(), numIntraAddrDim - staticStrides.size(), 0); - bool useNextBd = op.getUseNextBd(); + bool useNextBd = op.getUseNextBd().value_or(false); int32_t nextBd{0}; if (useNextBd) { std::optional nextBdIdOp = op.getNextBdIdOp(); @@ -216,19 +216,18 @@ struct HalfDmaCpyNdToNpuConverter final if (failed(npuPushToQueueOp)) return failure(); rewriter.replaceOp(op, *npuPushToQueueOp); - bool useNextBd = op.getUseNextBd(); - if (useNextBd) - // Erase if not end of chain. + bool useNextBd = op.getUseNextBd().value_or(false); + if (useNextBd) { + // `useNextBd` is true, so either at the beginning or middle of a chain. + // No need to push to the queue, just erase the op. rewriter.eraseOp(*npuPushToQueueOp); - else { + } else { std::optional maybeStartBdIdOp = op.getStartBdIdOp(); if (maybeStartBdIdOp) { - // Update the BD ID with the start of the chain. 
- uint32_t startBdId = - getConstantIndexOrAssert(maybeStartBdIdOp.value().getValue()); - uint32_t bdId = - getConstantIndexOrAssert(maybeBdIdOp.value().getValue()); - if (startBdId != bdId) npuPushToQueueOp->setBdId(startBdId); + // Update with the BD ID at the start of the chain. + AMDAIE::BdIdOp startBdIdOp = maybeStartBdIdOp.value(); + uint32_t startBdId = getConstantIndexOrAssert(startBdIdOp.getValue()); + npuPushToQueueOp->setBdId(startBdId); } } return success(); diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDmaComposition.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDmaComposition.cpp index 46e93f88a..beec230f9 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDmaComposition.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDmaComposition.cpp @@ -67,12 +67,6 @@ void AMDAIEDmaCompositionPass::runOnOperation() { "after strided op composition"; return signalPassFailure(); } - - if (failed(moveNpuSourceDmaSyncAfterTargetDmaCpy(rewriter, parentOp))) { - parentOp->emitOpError() - << "failed to move source DMA sync after target DMA copy"; - return signalPassFailure(); - } } } // namespace diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEInsertDmaBdChain.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEInsertDmaBdChain.cpp index 1ec0608ee..11ddd5b60 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEInsertDmaBdChain.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEInsertDmaBdChain.cpp @@ -5,9 +5,9 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception #include "iree-amd-aie/IR/AMDAIEOps.h" -#include "iree-amd-aie/Transforms/AMDAIEDmaUtils.h" -#include "iree-amd-aie/Transforms/AMDAIEUtils.h" #include "iree-amd-aie/Transforms/Passes.h" +#include "iree-amd-aie/Transforms/Utils/AMDAIEDmaUtils.h" +#include "iree-amd-aie/Transforms/Utils/AMDAIEUtils.h" #include 
"iree-amd-aie/aie_runtime/Utils/ChannelBdIdGenerator.h" #include "iree-amd-aie/aie_runtime/iree_aie_runtime.h" #include "mlir/IR/Iterators.h" @@ -17,30 +17,88 @@ namespace mlir::iree_compiler::AMDAIE { namespace { -LogicalResult insertDmaBdChain(AMDAIE::AMDAIEDeviceModel deviceModel, - AMDAIE::WorkgroupOp workgroupOp) { - IRRewriter rewriter(workgroupOp->getContext()); - - // TODO(Zhewen): to get rid of tileArgIdxToAssignedBdIdOps and - // tileArgIdxToDmaCount, integrate BD ID assignment and (partial) control code - // loop unrolling into this pass. - - // BD ID that are currenly assigned to DMA operations - DenseMap, SmallVector> - tileArgIdxToAssignedBdIdOps; - // Counter for the number of DMA operations, helping determine the dependency - DenseMap, uint32_t> tileArgIdxToDmaCount; +using TileConnect = std::pair; + +/// Utility function to update `use_next_bd`, `next_bd` and `start_bd` operands. +void updateChainOperands(IRRewriter &rewriter, + SmallVector &dmaChain) { + if (dmaChain.size() < 2) return; + + // Chain the DMA ops. + Value startBdId = dmaChain[0].getBdId(); + for (unsigned i = 0; i < dmaChain.size() - 1; ++i) { + AMDAIE::NpuHalfDmaCpyNdOp currDmaOp = dmaChain[i]; + Value nextBd = dmaChain[i + 1].getBdId(); + BoolAttr useNextBd = rewriter.getBoolAttr(true); + // No token is produced at the beginning or middle of a chain. + TypeRange token = TypeRange{}; + rewriter.setInsertionPointAfter(currDmaOp); + rewriter.create( + currDmaOp.getLoc(), token, currDmaOp.getConnection(), + currDmaOp.getInput(), currDmaOp.getMixedOffsets(), + currDmaOp.getMixedSizes(), currDmaOp.getMixedStrides(), + currDmaOp.getBdId(), currDmaOp.getChannel(), useNextBd, nextBd, + startBdId); + for (auto &use : currDmaOp->getUses()) { + rewriter.eraseOp(use.getOwner()); + } + rewriter.eraseOp(currDmaOp); + } + // Last DMA op in the chain. 
+ AMDAIE::NpuHalfDmaCpyNdOp lastDmaOp = dmaChain.back(); + Value nextBd = nullptr; + BoolAttr useNextBd = rewriter.getBoolAttr(false); + rewriter.setInsertionPointAfter(lastDmaOp); + auto lastDmaOpChained = rewriter.create( + lastDmaOp.getLoc(), lastDmaOp.getResultTypes(), lastDmaOp.getConnection(), + lastDmaOp.getInput(), lastDmaOp.getMixedOffsets(), + lastDmaOp.getMixedSizes(), lastDmaOp.getMixedStrides(), + lastDmaOp.getBdId(), lastDmaOp.getChannel(), useNextBd, nextBd, + startBdId); + rewriter.replaceOp(lastDmaOp, lastDmaOpChained.getResults()); +} - // Last DMA operation encountered, no matter if it is chained or not - DenseMap, AMDAIE::NpuHalfDmaCpyNdOp> - tileArgIdxToLastDmaOp; - // Last DMA operation that has been chained - DenseMap, AMDAIE::NpuHalfDmaCpyNdOp> - tileArgIdxToLastChainedDmaOp; - // Black list of tile argument index pairs that should not be chained - SmallVector> tileArgIdxsBlackList; +/// Utility function to determine if chains can grow further +/// or require breaking. +/// +/// Example: +/// - Chain X currently holds BD IDs: [4, 5, 6, 7] +/// - Chain Y currently holds BD IDs: [0, 1, 2, 3] +/// - A new BD ID (0) needs to be added to the front (due to reverse +/// traversing) of chain X. +/// +/// Conflict resolution: +/// - Chain Y must be broken because BD ID 0 is already assigned to it +/// and must be released. +/// - Chain X is also broken to prevent the new added BD ID (0) from +/// invalidating chain Y. +/// +/// Result: +/// - Break both chains X and Y. +/// - Chain X: [0] (the newly added BD ID). +/// - Chain Y: [] (emptied after breaking). +void canChainGrowFurther( + const uint32_t bdId, const TileConnect &currTileConnect, + const DenseMap> &tileConnectToBdIds, + SmallVector &chainsToBreak) { + for (auto &[entry, bdIds] : tileConnectToBdIds) { + if (entry.first == currTileConnect.first && + llvm::is_contained(bdIds, bdId)) { + // Break the chain that contains the duplicate BD ID. 
+ chainsToBreak.push_back(entry); + if (entry != currTileConnect) { + // Break the current chain as well. + chainsToBreak.push_back(currTileConnect); + } + break; + } + } +} - AMDAIE::ControlCodeOp controlCodeOp = workgroupOp.getControlCode(); +/// Traverse the control code in reverse order to create DMA BD chains. +LogicalResult insertDmaBdChain(const AMDAIE::AMDAIEDeviceModel &deviceModel, + AMDAIE::ControlCodeOp controlCodeOp) { + IRRewriter rewriter(controlCodeOp->getContext()); // Move all BdIdOps to the beginning of the control code. // This is to avoid dominance issues when chaining BD IDs. @@ -55,254 +113,106 @@ LogicalResult insertDmaBdChain(AMDAIE::AMDAIEDeviceModel deviceModel, op->moveBefore(&controlCodeOp.front()); } - // Find `NpuHalfDmaCpyNdOp` operations and chain BD IDs. - res = controlCodeOp->walk([&](Operation *op) { - if (auto npuHalfDmaCpyNdOp = dyn_cast(op)) { - // not shim, no need to chain, since it will be earsed when lowering to - // NPU instructions - if (npuHalfDmaCpyNdOp.getMemorySpaceAsUInt() != 0) { - return WalkResult::advance(); - } - - bool chaining = true; - // packet mode is enabled, do not chain BDs - std::optional maybeConnectionOp = - npuHalfDmaCpyNdOp.getConnectionOp(); - if (!maybeConnectionOp) { - npuHalfDmaCpyNdOp.emitOpError() - << "expected to operate on an `amdaie.connection`"; - return WalkResult::interrupt(); - } - std::optional maybeFlowOp = - maybeConnectionOp->getFlowOp(); - if (!maybeFlowOp) { - maybeConnectionOp->emitOpError() - << "expected to operate on an `amdaie.flow`"; - return WalkResult::interrupt(); - } - bool enablePacket = maybeFlowOp->getIsPacketFlow(); - if (enablePacket) { - chaining = false; - } - - // repeat count > 1, do not chain BDs - int32_t repeatCount = 1; - uint8_t numIntraAddrDim = deviceModel.getDmaProp( - AMDAIE::AMDAIETileType::SHIMNOC, AMDAIE::AMDAIEDmaProp::NumAddrDim); - uint8_t numAddrDim = numIntraAddrDim + kAMDAIEDmaNbInterDims; - auto sizes = npuHalfDmaCpyNdOp.getMixedSizes(); - 
auto strides = npuHalfDmaCpyNdOp.getMixedStrides(); - if (!sizes.empty() && !strides.empty()) { - int64_t size = getConstantIndexOrAssert(sizes[0]); - int64_t stride = getConstantIndexOrAssert(strides[0]); - if (sizes.size() == numAddrDim || stride == 0) { - repeatCount = size; - } - } - if (repeatCount > 1) { - chaining = false; - } - - // get current BD ID and tile - std::optional maybeBdIdOp = npuHalfDmaCpyNdOp.getBdIdOp(); - if (!maybeBdIdOp) { - npuHalfDmaCpyNdOp.emitOpError() << "must have a BD ID op"; - return WalkResult::interrupt(); - } - AMDAIE::BdIdOp bdIdOp = maybeBdIdOp.value(); - AMDAIE::TileOp tileOp = - dyn_cast_if_present(bdIdOp.getTile().getDefiningOp()); - if (!tileOp) { - bdIdOp.emitOpError() << "must operate on an `amdaie.tile`"; - return WalkResult::interrupt(); - } - - // get arg index - auto logicalObjFifo = - dyn_cast_if_present( - npuHalfDmaCpyNdOp.getInput().getDefiningOp()); - if (!logicalObjFifo) { - npuHalfDmaCpyNdOp.emitOpError() - << "expected input to be an " - "`amdaie.logicalobjectfifo.from_memref`"; - return WalkResult::interrupt(); - } - auto subspanOp = - dyn_cast_if_present( - logicalObjFifo.getMemref().getDefiningOp()); - if (!subspanOp) { - logicalObjFifo.emitOpError() - << "must operate on an `hal.interface.binding.subspan`"; - return WalkResult::interrupt(); - } - uint32_t argIdx = subspanOp.getBinding().getZExtValue(); - - // If the current DMA operation was previously part of the outer loop in - // the control code, force all DMA operations in the inner loop to be - // synchronized, by adding them to the black list. - tileArgIdxToDmaCount[{tileOp, argIdx}]++; - for (auto &[pair, count] : tileArgIdxToDmaCount) { - if (pair.first == tileOp && - count > tileArgIdxToDmaCount[{tileOp, argIdx}] + 1) { - if (!llvm::is_contained(tileArgIdxsBlackList, pair)) { - tileArgIdxsBlackList.push_back(pair); + // BD ID that are have been assigned in each tile. + DenseMap> tileConnectToBdIds; + // Buffers the DMA ops that will be chained. 
+ DenseMap> + tileConnectToDmaChain; + + res = controlCodeOp->walk( + [&](Operation *op) { + if (auto npuHalfDmaCpyNdOp = dyn_cast(op)) { + // Not shim, will be erased at ControlCodeLowering, ignore. + if (npuHalfDmaCpyNdOp.getMemorySpaceAsUInt() != 0) { + return WalkResult::advance(); } - // If the BD ID is currently used by another DMA op, stop the chain - for that DMA op from further growing, by adding it to the black list - for (auto &[pair, bdIdOps] : tileArgIdxToAssignedBdIdOps) { - if (pair.first == tileOp && llvm::is_contained(bdIdOps, bdIdOp)) { - if (!llvm::is_contained(tileArgIdxsBlackList, pair)) { - tileArgIdxsBlackList.push_back(pair); + // Get the connection op. + std::optional maybeConnectionOp = + npuHalfDmaCpyNdOp.getConnectionOp(); + if (!maybeConnectionOp) { + npuHalfDmaCpyNdOp.emitOpError() + << "expected to operate on an `amdaie.connection`"; + return WalkResult::interrupt(); } - break; - } - } - - // If the black list is not empty, there will be a synchronization. - // Make sure all other DMA chains also break at this point to avoid - // dependency issues. - if (tileArgIdxsBlackList.size() > 0) { - for (auto &[pair, bdIdOps] : tileArgIdxToAssignedBdIdOps) { - if (pair.first == tileOp && bdIdOps.size() > 1) { - if (!llvm::is_contained(tileArgIdxsBlackList, pair)) { - tileArgIdxsBlackList.push_back(pair); + AMDAIE::ConnectionOp connectionOp = maybeConnectionOp.value(); + + // Packet flow, do not chain BDs. + std::optional maybeFlowOp = connectionOp.getFlowOp(); + if (!maybeFlowOp) { + connectionOp->emitOpError() + << "expected to operate on an `amdaie.flow`"; + return WalkResult::interrupt(); + } + AMDAIE::FlowOp flowOp = maybeFlowOp.value(); + bool isPacketFlow = flowOp.getIsPacketFlow(); + if (isPacketFlow) return WalkResult::advance(); + + // Repeat count > 1, do not chain BDs.
+ int32_t repeatCount = 1; + uint8_t numIntraAddrDim = deviceModel.getDmaProp( + AMDAIE::AMDAIETileType::SHIMNOC, + AMDAIE::AMDAIEDmaProp::NumAddrDim); + uint8_t numAddrDim = numIntraAddrDim + kAMDAIEDmaNbInterDims; + auto sizes = npuHalfDmaCpyNdOp.getMixedSizes(); + auto strides = npuHalfDmaCpyNdOp.getMixedStrides(); + if (!sizes.empty() && !strides.empty()) { + int64_t size = getConstantIndexOrAssert(sizes[0]); + int64_t stride = getConstantIndexOrAssert(strides[0]); + if (sizes.size() == numAddrDim || stride == 0) { + repeatCount = size; } } - } - } - - // When current DMA has not been blacklisted and a previous DMA with same - // argIdx exists, chain them together - chaining &= !llvm::is_contained(tileArgIdxsBlackList, - std::make_pair(tileOp, argIdx)) && - tileArgIdxToLastDmaOp.contains({tileOp, argIdx}); - if (chaining) { - // update the previous DMA op by changing its useNextBd and - // nextBd - AMDAIE::NpuHalfDmaCpyNdOp lastDmaOp = - tileArgIdxToLastDmaOp[{tileOp, argIdx}]; - rewriter.setInsertionPointAfter(lastDmaOp); - auto chainedDmaOp = rewriter.create( - lastDmaOp.getLoc(), lastDmaOp.getResultTypes(), - lastDmaOp.getConnection(), lastDmaOp.getInput(), - lastDmaOp.getMixedOffsets(), lastDmaOp.getMixedSizes(), - lastDmaOp.getMixedStrides(), lastDmaOp.getBdId(), - lastDmaOp.getChannel(), true, bdIdOp, lastDmaOp.getStartBd()); - rewriter.replaceOp(lastDmaOp, chainedDmaOp.getResults()); - tileArgIdxToLastChainedDmaOp[{tileOp, argIdx}] = chainedDmaOp; - // update the current DMA op by changing its startBd - rewriter.setInsertionPoint(npuHalfDmaCpyNdOp); - auto npuHalfDmaCpyNdOpNew = rewriter.create( - npuHalfDmaCpyNdOp.getLoc(), npuHalfDmaCpyNdOp.getResultTypes(), - npuHalfDmaCpyNdOp.getConnection(), npuHalfDmaCpyNdOp.getInput(), - npuHalfDmaCpyNdOp.getMixedOffsets(), - npuHalfDmaCpyNdOp.getMixedSizes(), - npuHalfDmaCpyNdOp.getMixedStrides(), npuHalfDmaCpyNdOp.getBdId(), - npuHalfDmaCpyNdOp.getChannel(), npuHalfDmaCpyNdOp.getUseNextBd(), - 
npuHalfDmaCpyNdOp.getNextBd(), chainedDmaOp.getStartBd()); - rewriter.replaceOp(npuHalfDmaCpyNdOp, - npuHalfDmaCpyNdOpNew.getResults()); - npuHalfDmaCpyNdOp = npuHalfDmaCpyNdOpNew; - } - - // Update BD ID assignment, if it is chaining, safely release the BD IDs - // since a synchronization will happen - if (chaining && tileArgIdxToAssignedBdIdOps.contains({tileOp, argIdx})) { - tileArgIdxToAssignedBdIdOps[{tileOp, argIdx}].push_back(bdIdOp); - } else { - tileArgIdxToAssignedBdIdOps[{tileOp, argIdx}] = {bdIdOp}; - } - - // The current DMA op is not chained with the previous DMA op (i.e. - // synchroizaiton will happen between these two ops), removing from the - // black list - if (!chaining) { - auto it = - std::find(tileArgIdxsBlackList.begin(), tileArgIdxsBlackList.end(), - std::make_pair(tileOp, argIdx)); - if (it != tileArgIdxsBlackList.end()) { - tileArgIdxsBlackList.erase(it); - } - } - // Update the last encountered DMA op - tileArgIdxToLastDmaOp[{tileOp, argIdx}] = npuHalfDmaCpyNdOp; + if (repeatCount > 1) return WalkResult::advance(); + + // Get the BD ID and tile op. + std::optional maybeBdIdOp = + npuHalfDmaCpyNdOp.getBdIdOp(); + if (!maybeBdIdOp) { + npuHalfDmaCpyNdOp.emitOpError() << "must have a BD ID op"; + return WalkResult::interrupt(); + } + AMDAIE::BdIdOp bdIdOp = maybeBdIdOp.value(); + uint32_t bdId = getConstantIndexOrAssert(bdIdOp.getValue()); + AMDAIE::TileOp tileOp = dyn_cast_if_present( + bdIdOp.getTile().getDefiningOp()); + if (!tileOp) { + bdIdOp.emitOpError() << "must operate on an `amdaie.tile`"; + return WalkResult::interrupt(); + } - } else if (auto npuDmaWaitOp = dyn_cast(op)) { - // Handle the special case where there are multiple DMA ops preceding any - // Wait op. In such a case, some DMA ops may be chained first, before they - // are put onto the black list. Therefore, go over the black list and - // unchain the DMA ops when required. 
+ // Any duplicate BD ID from the same tile indicates the chain cannot + // grow further and requires breaking to release the conflicting BD + // ID. + SmallVector chainsToBreak; + TileConnect currTileConnect = {tileOp, connectionOp}; + canChainGrowFurther(bdId, currTileConnect, tileConnectToBdIds, + chainsToBreak); + + // If the chains are not to be continued, update DMA operands using + // the `updateChainOperands` function. + if (!chainsToBreak.empty()) { + for (auto &entry : chainsToBreak) { + updateChainOperands(rewriter, tileConnectToDmaChain[entry]); + tileConnectToBdIds[entry].clear(); + tileConnectToDmaChain[entry].clear(); + } + } - for (auto &[tileOp, argIdx] : tileArgIdxsBlackList) { - if (tileArgIdxToLastChainedDmaOp.contains({tileOp, argIdx}) && - tileArgIdxToLastDmaOp.contains({tileOp, argIdx})) { - // break the chain lastChainedDmaOp -> lastDmaOp - AMDAIE::NpuHalfDmaCpyNdOp lastChainedDmaOp = - tileArgIdxToLastChainedDmaOp[{tileOp, argIdx}]; - AMDAIE::NpuHalfDmaCpyNdOp lastDmaOp = - tileArgIdxToLastDmaOp[{tileOp, argIdx}]; - // revert useNextBd and nextBd in lastChainedDmaOp - bool useNextBd{false}; - Value nextBd{nullptr}; - rewriter.setInsertionPointAfter(lastChainedDmaOp); - auto unchainedDmaOp = rewriter.create( - lastChainedDmaOp.getLoc(), lastChainedDmaOp.getResultTypes(), - lastChainedDmaOp.getConnection(), lastChainedDmaOp.getInput(), - lastChainedDmaOp.getMixedOffsets(), - lastChainedDmaOp.getMixedSizes(), - lastChainedDmaOp.getMixedStrides(), lastChainedDmaOp.getBdId(), - lastChainedDmaOp.getChannel(), useNextBd, nextBd, - lastChainedDmaOp.getStartBd()); - rewriter.replaceOp(lastChainedDmaOp, unchainedDmaOp.getResults()); - tileArgIdxToLastChainedDmaOp.erase({tileOp, argIdx}); - // revert startBd in lastDmaOp - auto startBd = lastDmaOp.getBdId(); - rewriter.setInsertionPoint(lastDmaOp); - unchainedDmaOp = rewriter.create( - lastDmaOp.getLoc(), lastDmaOp.getResultTypes(), - lastDmaOp.getConnection(), lastDmaOp.getInput(), - 
lastDmaOp.getMixedOffsets(), lastDmaOp.getMixedSizes(), - lastDmaOp.getMixedStrides(), lastDmaOp.getBdId(), - lastDmaOp.getChannel(), lastDmaOp.getUseNextBd(), - lastDmaOp.getNextBd(), startBd); - tileArgIdxToAssignedBdIdOps[{tileOp, argIdx}] = { - lastDmaOp.getBdIdOp().value()}; - rewriter.replaceOp(lastDmaOp, unchainedDmaOp.getResults()); - tileArgIdxToLastDmaOp[{tileOp, argIdx}] = unchainedDmaOp; - } else { - npuDmaWaitOp.emitError() << "unhandled situation in DMA BD chaining, " - "please try to disable this pass"; - return WalkResult::interrupt(); + // Insert at the front, as we are walking in reverse order. + tileConnectToBdIds[currTileConnect].insert( + tileConnectToBdIds[currTileConnect].begin(), bdId); + tileConnectToDmaChain[currTileConnect].insert( + tileConnectToDmaChain[currTileConnect].begin(), + npuHalfDmaCpyNdOp); } - } - - tileArgIdxsBlackList.clear(); - } - return WalkResult::advance(); - }); + return WalkResult::advance(); + }); - // Only keep DMA Wait Ops if at the end of a chain, erase others - res = controlCodeOp->walk([&](Operation *op) { - if (auto npuDmaWaitOp = dyn_cast(op)) { - bool toErase = true; - for (Value token : npuDmaWaitOp.getAsyncTokens()) { - auto npuHalfDmaCpyNdOp = dyn_cast_if_present( - token.getDefiningOp()); - bool chaining = npuHalfDmaCpyNdOp && npuHalfDmaCpyNdOp.getUseNextBd(); - if (!chaining) { - toErase = false; - break; - } - } - if (toErase) { - rewriter.eraseOp(npuDmaWaitOp); - } - } - return WalkResult::advance(); - }); + // Build the remaining chains. 
+ for (auto &[entry, _] : tileConnectToBdIds) { + updateChainOperands(rewriter, tileConnectToDmaChain[entry]); + } if (res.wasInterrupted()) return failure(); return success(); @@ -336,7 +246,8 @@ void AMDAIEInsertDmaBdChainPass::runOnOperation() { AMDAIE::getDeviceModel(maybeDevice.value()); WalkResult res = parentOp->walk([&](AMDAIE::WorkgroupOp workgroupOp) { - if (failed(insertDmaBdChain(deviceModel, workgroupOp))) { + AMDAIE::ControlCodeOp controlCodeOp = workgroupOp.getControlCode(); + if (failed(insertDmaBdChain(deviceModel, controlCodeOp))) { return WalkResult::interrupt(); } return WalkResult::advance(); diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIENpuDmaToHalfDmaCpyNd.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIENpuDmaToHalfDmaCpyNd.cpp index a58d836cf..4aa9c6928 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIENpuDmaToHalfDmaCpyNd.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIENpuDmaToHalfDmaCpyNd.cpp @@ -31,7 +31,7 @@ struct NpuDmaToHalfDmaCpyNdConverter final return dmaOp.emitOpError() << "should operate on an `amdaie.connection` op"; } - bool useNextBd{false}; + BoolAttr useNextBd = rewriter.getBoolAttr(false); Value nextBd{nullptr}; // Convert source half. 
Value source = diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h index d10e20611..afd92e4cd 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h @@ -37,7 +37,6 @@ namespace mlir::iree_compiler::AMDAIE { #define GEN_PASS_DEF_AMDAIECONNECTIONTOFLOW #define GEN_PASS_DEF_AMDAIECONTROLCODEFORALLTOFOR #define GEN_PASS_DEF_AMDAIECONTROLCODELOOPUNROLL -#define GEN_PASS_DEF_AMDAIECONTROLCODETOHALFDMACPYND #define GEN_PASS_DEF_AMDAIECONTROLCODELOWERING #define GEN_PASS_DEF_AMDAIECONTROLCODETOTRANSACTION #define GEN_PASS_DEF_AMDAIECONVERTCOREFORALLTOFOR diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td index 87d02e260..72c075465 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td @@ -171,7 +171,6 @@ def AMDAIEControlCodeLowering : let constructor = "mlir::iree_compiler::AMDAIE::createAMDAIEControlCodeLoweringPass()"; } - def AMDAIEControlCodeToTransaction : Pass<"iree-amdaie-controlcode-to-transaction", ""> { let summary = "Convert controlcode instructions into a NPU instruction transaction."; @@ -233,6 +232,7 @@ def AMDAIEDistributeL1Allocations : let constructor = "mlir::iree_compiler::AMDAIE::createAMDAIEDistributeL1AllocationsPass()"; } + def AMDAIEDmaComposition : Pass<"iree-amdaie-dma-composition"> { let summary = "Compose DMA operations by DMA combination and loop subsumption."; diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIEDmaUtils.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIEDmaUtils.cpp index 94476b085..9c2e43c6e 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIEDmaUtils.cpp 
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIEDmaUtils.cpp @@ -483,55 +483,4 @@ LogicalResult moveNpuDmaSyncUsersAfterAncestorInSameBlock( return success(); } -// Move NPU DMA wait operations with async_source tokens as late as possible -// (after the target DMA wait operation which has async_target token) This is to -// help later optimizations such as DMA BD chaining. Example: -// -// %0 = dma_cpy_nd async_source -// dma_wait(%0 : !amdaie.async_source_token) -// %1 = dma_cpy_nd async_source -// dma_wait(%1 : !amdaie.async_source_token) -// %2 = dma_cpy_nd async_target -// dma_wait(%2 : !amdaie.async_target_token) -// ------------------------------->>>>>>>>>> -// %0 = dma_cpy_nd async_source -// %1 = dma_cpy_nd async_source -// %2 = dma_cpy_nd async_target -// dma_wait(%2 : !amdaie.async_target_token) -// dma_wait(%0 : !amdaie.async_source_token) -// dma_wait(%1 : !amdaie.async_source_token) - -LogicalResult moveNpuSourceDmaSyncAfterTargetDmaCpy(RewriterBase &rewriter, - Operation *parentOp) { - // Stores NPU source DMA wait operations to be moved later. - SmallVector npuSourceDmaWaitOps; - - WalkResult res = parentOp->walk([&](Operation *op) { - if (auto npuDmaWaitOp = dyn_cast(op)) { - // Check if the DMA wait operation contains an async target token. - bool hasAsyncTargetToken = - llvm::any_of(npuDmaWaitOp.getAsyncTokens(), [](Value token) { - return isa(token.getType()); - }); - if (!hasAsyncTargetToken) { - npuSourceDmaWaitOps.push_back(npuDmaWaitOp); - } else { - // Move all collected NPU source DMA wait ops after the current target - // DMA wait op, but only if they belong to the same block. - for (auto &npuSourceDmaWaitOp : npuSourceDmaWaitOps) { - if (npuSourceDmaWaitOp->getBlock() == npuDmaWaitOp->getBlock()) { - rewriter.moveOpAfter(npuSourceDmaWaitOp, npuDmaWaitOp); - } - } - // Clear the list after moving. 
- npuSourceDmaWaitOps.clear(); - } - } - return WalkResult::advance(); - }); - - if (res.wasInterrupted()) return failure(); - return success(); -} - } // namespace mlir::iree_compiler::AMDAIE diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIEDmaUtils.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIEDmaUtils.h index 8dc53b152..e4dbfd36b 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIEDmaUtils.h +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIEDmaUtils.h @@ -344,10 +344,6 @@ struct DmaDimConfig { LogicalResult moveNpuDmaSyncUsersAfterAncestorInSameBlock( RewriterBase &rewriter, Operation *parentOp); -/// Utility to move the source dma synchronization after the target dma copy. -LogicalResult moveNpuSourceDmaSyncAfterTargetDmaCpy(RewriterBase &rewriter, - Operation *parentOp); - } // namespace mlir::iree_compiler::AMDAIE #endif diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/controlcode_lowering.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/controlcode_lowering.mlir index 4e1346ae9..63b163034 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/controlcode_lowering.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/controlcode_lowering.mlir @@ -58,18 +58,18 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} // CHECK: amdaie.npu.write_bd {bd_id = 0 : ui32, buffer_length = 0 : ui32, buffer_offset = 0 : ui32, col = 0 : ui32, enable_packet = true, iteration_current = 0 : ui32, iteration_size = 0 : ui32, iteration_stride = 0 : ui32, lock_acq_enable = false, lock_acq_id = 0 : ui32, lock_acq_val = 0 : i32, lock_rel_id = 0 : ui32, lock_rel_val = 0 : i32, next_bd = 0 : ui32, out_of_order_id = 0 : ui32, packet_id = 0 : ui32, packet_type = 0 : ui32, paddings_after = array, paddings_before = array, row = 0 : ui32, sizes = array, strides = array, 
use_next_bd = false, valid_bd = true} // CHECK: amdaie.npu.address_patch {arg_idx = 0 : ui32, bd_id = 0 : ui32, col = 0 : ui32, offset = 0 : ui32} // CHECK: amdaie.npu.push_to_queue {bd_id = 0 : ui32, channel = 0 : ui32, col = 0 : ui32, direction = 1 : i32, repeat_count = 1 : ui32, row = 0 : ui32} - amdaie.npu.half_dma_cpy_nd %4(%5[] [] [] bd_id = %bd_id channel = %channel use_next_bd = false) : !amdaie.logicalobjectfifo> + amdaie.npu.half_dma_cpy_nd %4(%5[] [] [] bd_id = %bd_id channel = %channel) : !amdaie.logicalobjectfifo> // CHECK: amdaie.npu.write_bd {bd_id = 0 : ui32, buffer_length = 2048 : ui32, buffer_offset = 0 : ui32, col = 0 : ui32, enable_packet = true, iteration_current = 0 : ui32, iteration_size = 0 : ui32, iteration_stride = 0 : ui32, lock_acq_enable = false, lock_acq_id = 0 : ui32, lock_acq_val = 0 : i32, lock_rel_id = 0 : ui32, lock_rel_val = 0 : i32, next_bd = 0 : ui32, out_of_order_id = 0 : ui32, packet_id = 0 : ui32, packet_type = 0 : ui32, paddings_after = array, paddings_before = array, row = 0 : ui32, sizes = array, strides = array, use_next_bd = false, valid_bd = true} // CHECK: amdaie.npu.address_patch {arg_idx = 0 : ui32, bd_id = 0 : ui32, col = 0 : ui32, offset = 0 : ui32} // CHECK: %[[TOKEN_0:.+]] = amdaie.npu.push_to_queue async {bd_id = 0 : ui32, channel = 0 : ui32, col = 0 : ui32, direction = 1 : i32, repeat_count = 1 : ui32, row = 0 : ui32} // CHECK: amdaie.npu.dma_wait(%[[TOKEN_0]] : !amdaie.async_token) - %6 = amdaie.npu.half_dma_cpy_nd async %4(%5[0] [2048] [1] bd_id = %bd_id channel = %channel use_next_bd = false) : !amdaie.logicalobjectfifo> + %6 = amdaie.npu.half_dma_cpy_nd async %4(%5[0] [2048] [1] bd_id = %bd_id channel = %channel) : !amdaie.logicalobjectfifo> amdaie.npu.dma_wait(%6 : !amdaie.async_token) // CHECK: amdaie.npu.write_bd {bd_id = 0 : ui32, buffer_length = 1024 : ui32, buffer_offset = 0 : ui32, col = 0 : ui32, enable_packet = true, iteration_current = 0 : ui32, iteration_size = 0 : ui32, iteration_stride = 0 : 
ui32, lock_acq_enable = false, lock_acq_id = 0 : ui32, lock_acq_val = 0 : i32, lock_rel_id = 0 : ui32, lock_rel_val = 0 : i32, next_bd = 0 : ui32, out_of_order_id = 0 : ui32, packet_id = 0 : ui32, packet_type = 0 : ui32, paddings_after = array, paddings_before = array, row = 0 : ui32, sizes = array, strides = array, use_next_bd = false, valid_bd = true} // CHECK: amdaie.npu.address_patch {arg_idx = 0 : ui32, bd_id = 0 : ui32, col = 0 : ui32, offset = 0 : ui32} // CHECK: %[[TOKEN_1:.+]] = amdaie.npu.push_to_queue async {bd_id = 0 : ui32, channel = 0 : ui32, col = 0 : ui32, direction = 1 : i32, repeat_count = 2 : ui32, row = 0 : ui32} // CHECK: amdaie.npu.dma_wait(%[[TOKEN_1]] : !amdaie.async_token) - %7 = amdaie.npu.half_dma_cpy_nd async %4(%5[0, 0, 0, 0] [2, 4, 16, 16] [0, 64, 8, 1] bd_id = %bd_id channel = %channel use_next_bd = false) : !amdaie.logicalobjectfifo> + %7 = amdaie.npu.half_dma_cpy_nd async %4(%5[0, 0, 0, 0] [2, 4, 16, 16] [0, 64, 8, 1] bd_id = %bd_id channel = %channel) : !amdaie.logicalobjectfifo> amdaie.npu.dma_wait(%7 : !amdaie.async_token) amdaie.end } @@ -109,18 +109,18 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} // CHECK: amdaie.npu.write_bd {bd_id = 0 : ui32, buffer_length = 0 : ui32, buffer_offset = 0 : ui32, col = 0 : ui32, enable_packet = true, iteration_current = 0 : ui32, iteration_size = 0 : ui32, iteration_stride = 0 : ui32, lock_acq_enable = false, lock_acq_id = 0 : ui32, lock_acq_val = 0 : i32, lock_rel_id = 0 : ui32, lock_rel_val = 0 : i32, next_bd = 0 : ui32, out_of_order_id = 0 : ui32, packet_id = 0 : ui32, packet_type = 0 : ui32, paddings_after = array, paddings_before = array, row = 0 : ui32, sizes = array, strides = array, use_next_bd = false, valid_bd = true} // CHECK: amdaie.npu.address_patch {arg_idx = 0 : ui32, bd_id = 0 : ui32, col = 0 : ui32, offset = 0 : ui32} // CHECK: amdaie.npu.push_to_queue {bd_id = 0 : ui32, channel = 0 : ui32, col = 0 : ui32, direction = 1 : i32, repeat_count = 
1 : ui32, row = 0 : ui32} - amdaie.npu.half_dma_cpy_nd %4(%5[] [] [] bd_id = %bd_id channel = %channel use_next_bd = false) : !amdaie.logicalobjectfifo> + amdaie.npu.half_dma_cpy_nd %4(%5[] [] [] bd_id = %bd_id channel = %channel) : !amdaie.logicalobjectfifo> // CHECK: amdaie.npu.write_bd {bd_id = 0 : ui32, buffer_length = 1024 : ui32, buffer_offset = 0 : ui32, col = 0 : ui32, enable_packet = true, iteration_current = 0 : ui32, iteration_size = 0 : ui32, iteration_stride = 0 : ui32, lock_acq_enable = false, lock_acq_id = 0 : ui32, lock_acq_val = 0 : i32, lock_rel_id = 0 : ui32, lock_rel_val = 0 : i32, next_bd = 0 : ui32, out_of_order_id = 0 : ui32, packet_id = 0 : ui32, packet_type = 0 : ui32, paddings_after = array, paddings_before = array, row = 0 : ui32, sizes = array, strides = array, use_next_bd = false, valid_bd = true} // CHECK: amdaie.npu.address_patch {arg_idx = 0 : ui32, bd_id = 0 : ui32, col = 0 : ui32, offset = 0 : ui32} // CHECK: %[[TOKEN_0:.+]] = amdaie.npu.push_to_queue async {bd_id = 0 : ui32, channel = 0 : ui32, col = 0 : ui32, direction = 1 : i32, repeat_count = 1 : ui32, row = 0 : ui32} // CHECK: amdaie.npu.dma_wait(%[[TOKEN_0]] : !amdaie.async_token) - %6 = amdaie.npu.half_dma_cpy_nd async %4(%5[0] [2048] [1] bd_id = %bd_id channel = %channel use_next_bd = false) : !amdaie.logicalobjectfifo> + %6 = amdaie.npu.half_dma_cpy_nd async %4(%5[0] [2048] [1] bd_id = %bd_id channel = %channel) : !amdaie.logicalobjectfifo> amdaie.npu.dma_wait(%6 : !amdaie.async_token) // CHECK: amdaie.npu.write_bd {bd_id = 0 : ui32, buffer_length = 512 : ui32, buffer_offset = 0 : ui32, col = 0 : ui32, enable_packet = true, iteration_current = 0 : ui32, iteration_size = 0 : ui32, iteration_stride = 0 : ui32, lock_acq_enable = false, lock_acq_id = 0 : ui32, lock_acq_val = 0 : i32, lock_rel_id = 0 : ui32, lock_rel_val = 0 : i32, next_bd = 0 : ui32, out_of_order_id = 0 : ui32, packet_id = 0 : ui32, packet_type = 0 : ui32, paddings_after = array, paddings_before = array, row = 
0 : ui32, sizes = array, strides = array, use_next_bd = false, valid_bd = true} // CHECK: amdaie.npu.address_patch {arg_idx = 0 : ui32, bd_id = 0 : ui32, col = 0 : ui32, offset = 64 : ui32} // CHECK: %[[TOKEN_1:.+]] = amdaie.npu.push_to_queue async {bd_id = 0 : ui32, channel = 0 : ui32, col = 0 : ui32, direction = 1 : i32, repeat_count = 2 : ui32, row = 0 : ui32} // CHECK: amdaie.npu.dma_wait(%[[TOKEN_1]] : !amdaie.async_token) - %7 = amdaie.npu.half_dma_cpy_nd async %4(%5[0, 0, 0, 32] [2, 4, 16, 16] [0, 64, 8, 1] bd_id = %bd_id channel = %channel use_next_bd = false) : !amdaie.logicalobjectfifo> + %7 = amdaie.npu.half_dma_cpy_nd async %4(%5[0, 0, 0, 32] [2, 4, 16, 16] [0, 64, 8, 1] bd_id = %bd_id channel = %channel) : !amdaie.logicalobjectfifo> amdaie.npu.dma_wait(%7 : !amdaie.async_token) amdaie.end } @@ -128,3 +128,53 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} return } } + +// ----- + +// CHECK-LABEL: @half_npu_dma_cpy_nd_chain +// CHECK: amdaie.controlcode +#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> +#pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect> +module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { + func.func @half_npu_dma_cpy_nd_chain() { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c2 = arith.constant 2 : index + amdaie.workgroup { + %tile = amdaie.tile(%c0, %c1) + %tile_0 = amdaie.tile(%c0, %c0) + %buffer = amdaie.buffer(%tile) : memref<2048xi32, 1 : i32> + %buffer_1 = amdaie.buffer(%tile) : memref<2048xi32, 1 : i32> + %lock = amdaie.lock(%tile(4), 4) + %lock_2 = amdaie.lock(%tile(5), 0) + %0 = amdaie.logicalobjectfifo.from_buffers({%buffer, %buffer_1}, {%lock}, {%lock_2}) : memref<2048xi32, 1 : i32>, memref<2048xi32, 1 : i32> -> !amdaie.logicalobjectfifo, 2> + %1 = hal.interface.binding.subspan 
layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<64x32xi32> + %2 = amdaie.logicalobjectfifo.placeholder{%tile_0} : !amdaie.logicalobjectfifo> + %channel = amdaie.channel(%tile_0, 0, port_type = DMA, direction = MM2S) + %channel_3 = amdaie.channel(%tile, 0, port_type = DMA, direction = S2MM) + %3 = amdaie.flow({%channel} -> {%channel_3}) {is_packet_flow = false} + %4 = amdaie.connection(%0 {%channel_3}, %2 {%channel}, flow = %3) {connection_type = #amdaie} : (!amdaie.logicalobjectfifo, 2>, !amdaie.logicalobjectfifo>) + amdaie.controlcode { + %5 = amdaie.logicalobjectfifo.from_memref %1, {%tile_0} : memref<64x32xi32> -> !amdaie.logicalobjectfifo> + memref.assume_alignment %1, 64 : memref<64x32xi32> + %bd_id = amdaie.bd_id(%tile_0, %c0) + %bd_id_1 = amdaie.bd_id(%tile_0, %c1) + %bd_id_2 = amdaie.bd_id(%tile_0, %c2) +// CHECK: amdaie.npu.write_bd {bd_id = 0 : ui32, buffer_length = 0 : ui32, buffer_offset = 0 : ui32, col = 0 : ui32, enable_packet = false, iteration_current = 0 : ui32, iteration_size = 0 : ui32, iteration_stride = 0 : ui32, lock_acq_enable = false, lock_acq_id = 0 : ui32, lock_acq_val = 0 : i32, lock_rel_id = 0 : ui32, lock_rel_val = 0 : i32, next_bd = 1 : ui32, out_of_order_id = 0 : ui32, packet_id = 0 : ui32, packet_type = 0 : ui32, paddings_after = array, paddings_before = array, row = 0 : ui32, sizes = array, strides = array, use_next_bd = true, valid_bd = true} +// CHECK: amdaie.npu.address_patch {arg_idx = 0 : ui32, bd_id = 0 : ui32, col = 0 : ui32, offset = 0 : ui32} + amdaie.npu.half_dma_cpy_nd %4(%5[] [] [] bd_id = %bd_id channel = %channel use_next_bd = true next_bd = %bd_id_1 start_bd = %bd_id) : !amdaie.logicalobjectfifo> +// CHECK: amdaie.npu.write_bd {bd_id = 1 : ui32, buffer_length = 2048 : ui32, buffer_offset = 0 : ui32, col = 0 : ui32, enable_packet = false, iteration_current = 0 : ui32, iteration_size = 0 : ui32, iteration_stride = 0 : ui32, lock_acq_enable = false, lock_acq_id = 0 : 
ui32, lock_acq_val = 0 : i32, lock_rel_id = 0 : ui32, lock_rel_val = 0 : i32, next_bd = 2 : ui32, out_of_order_id = 0 : ui32, packet_id = 0 : ui32, packet_type = 0 : ui32, paddings_after = array, paddings_before = array, row = 0 : ui32, sizes = array, strides = array, use_next_bd = true, valid_bd = true} +// CHECK: amdaie.npu.address_patch {arg_idx = 0 : ui32, bd_id = 1 : ui32, col = 0 : ui32, offset = 0 : ui32} + amdaie.npu.half_dma_cpy_nd async %4(%5[0] [2048] [1] bd_id = %bd_id_1 channel = %channel use_next_bd = true next_bd = %bd_id_2 start_bd = %bd_id) : !amdaie.logicalobjectfifo> +// CHECK: amdaie.npu.write_bd {bd_id = 2 : ui32, buffer_length = 1024 : ui32, buffer_offset = 0 : ui32, col = 0 : ui32, enable_packet = false, iteration_current = 0 : ui32, iteration_size = 0 : ui32, iteration_stride = 0 : ui32, lock_acq_enable = false, lock_acq_id = 0 : ui32, lock_acq_val = 0 : i32, lock_rel_id = 0 : ui32, lock_rel_val = 0 : i32, next_bd = 0 : ui32, out_of_order_id = 0 : ui32, packet_id = 0 : ui32, packet_type = 0 : ui32, paddings_after = array, paddings_before = array, row = 0 : ui32, sizes = array, strides = array, use_next_bd = false, valid_bd = true} +// CHECK: amdaie.npu.address_patch {arg_idx = 0 : ui32, bd_id = 2 : ui32, col = 0 : ui32, offset = 0 : ui32} +// CHECK: %[[TOKEN_0:.+]] = amdaie.npu.push_to_queue async {bd_id = 0 : ui32, channel = 0 : ui32, col = 0 : ui32, direction = 1 : i32, repeat_count = 2 : ui32, row = 0 : ui32} +// CHECK: amdaie.npu.dma_wait(%[[TOKEN_0]] : !amdaie.async_token) + %6 = amdaie.npu.half_dma_cpy_nd async %4(%5[0, 0, 0, 0] [2, 4, 16, 16] [0, 64, 8, 1] bd_id = %bd_id_2 channel = %channel use_next_bd = false start_bd = %bd_id) : !amdaie.logicalobjectfifo> + amdaie.npu.dma_wait(%6 : !amdaie.async_token) + amdaie.end + } + } + return + } +} diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/insert_dma_bd_chain.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/insert_dma_bd_chain.mlir index 
6bf9c6161..4394718ad 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/insert_dma_bd_chain.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/insert_dma_bd_chain.mlir @@ -1,14 +1,15 @@ // RUN: iree-opt --pass-pipeline="builtin.module(iree-amdaie-insert-dma-bd-chain)" --split-input-file --verify-diagnostics %s | FileCheck %s // CHECK-LABEL: @single_bd_chain -// CHECK: %[[BD_ID_0:.+]] = amdaie.bd_id -// CHECK: %[[BD_ID_1:.+]] = amdaie.bd_id -// CHECK: %[[CHANNEL:.+]] = amdaie.channel -// CHECK: %[[CONNECTION:.+]] = amdaie.connection -// CHECK: %[[OBJECT_FIFO:.+]] = amdaie.logicalobjectfifo.from_memref -// CHECK: %[[TOKEN_0:.+]] = amdaie.npu.half_dma_cpy_nd async %[[CONNECTION]](%[[OBJECT_FIFO]] [0, 0, 0] [16, 32, 32] [32, 512, 1] bd_id = %[[BD_ID_0]] channel = %[[CHANNEL]] use_next_bd = true next_bd = %[[BD_ID_1]] start_bd = %[[BD_ID_0]]) -// CHECK: %[[TOKEN_1:.+]] = amdaie.npu.half_dma_cpy_nd async %[[CONNECTION]](%[[OBJECT_FIFO]] [0, 0, 0] [16, 32, 32] [32, 512, 1] bd_id = %[[BD_ID_1]] channel = %[[CHANNEL]] use_next_bd = false start_bd = %[[BD_ID_0]]) -// CHECK: amdaie.npu.dma_wait(%[[TOKEN_1]] : !amdaie.async_token) +// CHECK: %[[CHANNEL:.+]] = amdaie.channel +// CHECK: %[[CONNECTION:.+]] = amdaie.connection +// CHECK: amdaie.controlcode +// CHECK: %[[BD_ID_0:.+]] = amdaie.bd_id +// CHECK: %[[BD_ID_1:.+]] = amdaie.bd_id +// CHECK: %[[OBJECT_FIFO:.+]] = amdaie.logicalobjectfifo.from_memref +// CHECK: amdaie.npu.half_dma_cpy_nd %[[CONNECTION]](%[[OBJECT_FIFO]] [0, 0, 0] [16, 32, 32] [32, 512, 1] bd_id = %[[BD_ID_0]] channel = %[[CHANNEL]] use_next_bd = true next_bd = %[[BD_ID_1]] start_bd = %[[BD_ID_0]]) +// CHECK: %[[TOKEN_0:.+]] = amdaie.npu.half_dma_cpy_nd async %[[CONNECTION]](%[[OBJECT_FIFO]] [0, 0, 0] [16, 32, 32] [32, 512, 1] bd_id = %[[BD_ID_1]] channel = %[[CHANNEL]] use_next_bd = false start_bd = %[[BD_ID_0]]) +// CHECK: amdaie.npu.dma_wait(%[[TOKEN_0]] : !amdaie.async_token) #executable_target_amdaie_xclbin_fb 
= #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> #pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect> @@ -19,8 +20,6 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} amdaie.workgroup { %tile = amdaie.tile(%c0, %c0) %tile_0 = amdaie.tile(%c0, %c1) - %bd_id = amdaie.bd_id(%tile, 0) - %bd_id_1 = amdaie.bd_id(%tile, 1) %buffer = amdaie.buffer(%tile_0) : memref<1024xbf16, 1 : i32> %buffer_2 = amdaie.buffer(%tile_0) : memref<1024xbf16, 1 : i32> %lock = amdaie.lock(%tile_0(0), 0) @@ -35,8 +34,10 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} amdaie.controlcode { memref.assume_alignment %0, 64 : memref<512x512xbf16> %5 = amdaie.logicalobjectfifo.from_memref %0, {%tile} : memref<512x512xbf16> -> !amdaie.logicalobjectfifo> + %bd_id = amdaie.bd_id(%tile, %c0) %6 = amdaie.npu.half_dma_cpy_nd async %4(%5 [0, 0, 0] [16, 32, 32] [32, 512, 1] bd_id = %bd_id channel = %channel use_next_bd = false start_bd = %bd_id) : !amdaie.logicalobjectfifo> amdaie.npu.dma_wait(%6 : !amdaie.async_token) + %bd_id_1 = amdaie.bd_id(%tile, %c1) %7 = amdaie.npu.half_dma_cpy_nd async %4(%5 [0, 0, 0] [16, 32, 32] [32, 512, 1] bd_id = %bd_id_1 channel = %channel use_next_bd = false start_bd = %bd_id_1) : !amdaie.logicalobjectfifo> amdaie.npu.dma_wait(%7 : !amdaie.async_token) amdaie.end @@ -49,37 +50,36 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} // ----- // CHECK-LABEL: @two_bd_chain -// CHECK: %[[BD_ID_0:.+]] = amdaie.bd_id -// CHECK: %[[BD_ID_1:.+]] = amdaie.bd_id -// CHECK: %[[BD_ID_2:.+]] = amdaie.bd_id -// CHECK: %[[BD_ID_3:.+]] = amdaie.bd_id -// CHECK: %[[CHANNEL_0:.+]] = amdaie.channel -// CHECK: %[[CHANNEL_1:.+]] = amdaie.channel -// CHECK: %[[CHANNEL_2:.+]] = amdaie.channel -// CHECK: %[[CHANNEL_3:.+]] = amdaie.channel -// CHECK: %[[CONNECTION_0:.+]] = amdaie.connection -// 
CHECK: %[[CONNECTION_1:.+]] = amdaie.connection -// CHECK: %[[OBJECT_FIFO_0:.+]] = amdaie.logicalobjectfifo.from_memref -// CHECK: %[[OBJECT_FIFO_1:.+]] = amdaie.logicalobjectfifo.from_memref -// CHECK: %[[TOKEN_0:.+]] = amdaie.npu.half_dma_cpy_nd async %[[CONNECTION_0]](%[[OBJECT_FIFO_0]] [0, 0, 0] [16, 32, 32] [32, 512, 1] bd_id = %[[BD_ID_0]] channel = %[[CHANNEL_0]] use_next_bd = true next_bd = %[[BD_ID_2]] start_bd = %[[BD_ID_0]]) -// CHECK: %[[TOKEN_1:.+]] = amdaie.npu.half_dma_cpy_nd async %[[CONNECTION_1]](%[[OBJECT_FIFO_1]] [0, 0, 0] [16, 32, 32] [32, 512, 1] bd_id = %[[BD_ID_1]] channel = %[[CHANNEL_2]] use_next_bd = true next_bd = %[[BD_ID_3]] start_bd = %[[BD_ID_1]]) -// CHECK: %[[TOKEN_2:.+]] = amdaie.npu.half_dma_cpy_nd async %[[CONNECTION_0]](%[[OBJECT_FIFO_0]] [0, 0, 0] [16, 32, 32] [32, 512, 1] bd_id = %[[BD_ID_2]] channel = %[[CHANNEL_0]] use_next_bd = false start_bd = %[[BD_ID_0]]) -// CHECK: %[[TOKEN_3:.+]] = amdaie.npu.half_dma_cpy_nd async %[[CONNECTION_1]](%[[OBJECT_FIFO_1]] [0, 0, 0] [16, 32, 32] [32, 512, 1] bd_id = %[[BD_ID_3]] channel = %[[CHANNEL_2]] use_next_bd = false start_bd = %[[BD_ID_1]]) -// CHECK: amdaie.npu.dma_wait(%[[TOKEN_2]] : !amdaie.async_token) -// CHECK: amdaie.npu.dma_wait(%[[TOKEN_3]] : !amdaie.async_token) +// CHECK: %[[CHANNEL_0:.+]] = amdaie.channel +// CHECK: %[[CHANNEL_1:.+]] = amdaie.channel +// CHECK: %[[CHANNEL_2:.+]] = amdaie.channel +// CHECK: %[[CHANNEL_3:.+]] = amdaie.channel +// CHECK: %[[CONNECTION_0:.+]] = amdaie.connection +// CHECK: %[[CONNECTION_1:.+]] = amdaie.connection +// CHECK: amdaie.controlcode +// CHECK: %[[BD_ID_0:.+]] = amdaie.bd_id +// CHECK: %[[BD_ID_1:.+]] = amdaie.bd_id +// CHECK: %[[BD_ID_2:.+]] = amdaie.bd_id +// CHECK: %[[BD_ID_3:.+]] = amdaie.bd_id +// CHECK: %[[OBJECT_FIFO_0:.+]] = amdaie.logicalobjectfifo.from_memref +// CHECK: %[[OBJECT_FIFO_1:.+]] = amdaie.logicalobjectfifo.from_memref +// CHECK: amdaie.npu.half_dma_cpy_nd %[[CONNECTION_0]](%[[OBJECT_FIFO_0]] [0, 0, 0] [16, 32, 
32] [32, 512, 1] bd_id = %[[BD_ID_0]] channel = %[[CHANNEL_0]] use_next_bd = true next_bd = %[[BD_ID_2]] start_bd = %[[BD_ID_0]]) +// CHECK: amdaie.npu.half_dma_cpy_nd %[[CONNECTION_1]](%[[OBJECT_FIFO_1]] [0, 0, 0] [16, 32, 32] [32, 512, 1] bd_id = %[[BD_ID_1]] channel = %[[CHANNEL_2]] use_next_bd = true next_bd = %[[BD_ID_3]] start_bd = %[[BD_ID_1]]) +// CHECK: %[[TOKEN_0:.+]] = amdaie.npu.half_dma_cpy_nd async %[[CONNECTION_0]](%[[OBJECT_FIFO_0]] [0, 0, 0] [16, 32, 32] [32, 512, 1] bd_id = %[[BD_ID_2]] channel = %[[CHANNEL_0]] use_next_bd = false start_bd = %[[BD_ID_0]]) +// CHECK: %[[TOKEN_1:.+]] = amdaie.npu.half_dma_cpy_nd async %[[CONNECTION_1]](%[[OBJECT_FIFO_1]] [0, 0, 0] [16, 32, 32] [32, 512, 1] bd_id = %[[BD_ID_3]] channel = %[[CHANNEL_2]] use_next_bd = false start_bd = %[[BD_ID_1]]) +// CHECK: amdaie.npu.dma_wait(%[[TOKEN_0]] : !amdaie.async_token) +// CHECK: amdaie.npu.dma_wait(%[[TOKEN_1]] : !amdaie.async_token) #executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> #pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect> module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { func.func @two_bd_chain() { %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index + %c2 = arith.constant 2 : index + %c3 = arith.constant 3 : index amdaie.workgroup { %tile = amdaie.tile(%c0, %c0) %tile_0 = amdaie.tile(%c0, %c1) - %bd_id = amdaie.bd_id(%tile, 0) - %bd_id_1 = amdaie.bd_id(%tile, 1) - %bd_id_2 = amdaie.bd_id(%tile, 2) - %bd_id_3 = amdaie.bd_id(%tile, 3) %buffer = amdaie.buffer(%tile_0) : memref<1024xbf16, 1 : i32> %buffer_4 = amdaie.buffer(%tile_0) : memref<1024xbf16, 1 : i32> %buffer_5 = amdaie.buffer(%tile_0) : memref<1024xbf16, 1 : i32> @@ -106,11 +106,15 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} memref.assume_alignment %0, 64 : memref<512x512xbf16> %10 
= amdaie.logicalobjectfifo.from_memref %0, {%tile} : memref<512x512xbf16> -> !amdaie.logicalobjectfifo> %11 = amdaie.logicalobjectfifo.from_memref %1, {%tile} : memref<512x512xbf16> -> !amdaie.logicalobjectfifo> + %bd_id = amdaie.bd_id(%tile, %c0) %12 = amdaie.npu.half_dma_cpy_nd async %5(%10 [0, 0, 0] [16, 32, 32] [32, 512, 1] bd_id = %bd_id channel = %channel use_next_bd = false start_bd = %bd_id) : !amdaie.logicalobjectfifo> + %bd_id_1 = amdaie.bd_id(%tile, %c1) %13 = amdaie.npu.half_dma_cpy_nd async %9(%11 [0, 0, 0] [16, 32, 32] [32, 512, 1] bd_id = %bd_id_1 channel = %channel_11 use_next_bd = false start_bd = %bd_id_1) : !amdaie.logicalobjectfifo> amdaie.npu.dma_wait(%12 : !amdaie.async_token) amdaie.npu.dma_wait(%13 : !amdaie.async_token) + %bd_id_2 = amdaie.bd_id(%tile, %c2) %14 = amdaie.npu.half_dma_cpy_nd async %5(%10 [0, 0, 0] [16, 32, 32] [32, 512, 1] bd_id = %bd_id_2 channel = %channel use_next_bd = false start_bd = %bd_id_2) : !amdaie.logicalobjectfifo> + %bd_id_3 = amdaie.bd_id(%tile, %c3) %15 = amdaie.npu.half_dma_cpy_nd async %9(%11 [0, 0, 0] [16, 32, 32] [32, 512, 1] bd_id = %bd_id_3 channel = %channel_11 use_next_bd = false start_bd = %bd_id_3) : !amdaie.logicalobjectfifo> amdaie.npu.dma_wait(%14 : !amdaie.async_token) amdaie.npu.dma_wait(%15 : !amdaie.async_token) @@ -120,4 +124,3 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} return } } -