From 0a8b7dc25db3da428604f86d3ddb7b444fcd99c9 Mon Sep 17 00:00:00 2001 From: James Newling Date: Thu, 29 Aug 2024 11:44:21 -0700 Subject: [PATCH] [LowerToAIE] Refactor pass (#705) It would be good to lower directly from amdaie objectfifo operations to aie lock/bd operations. This PR moves in that direction by moving logic from the aie dialect to the amdaie dialect. Specifically, the lower-to-aie pass gets split up, with 2 small passes being extracted (see Passes.td for descriptions): 1) iree-amdaie-canonicalize-npu-dma-cpy-nd: This pass moves the zero-stride dimension (if any) to the outermost dimension, and now acts on the amdaie dialect. 2) iree-amdaie-sink-into-core: This pass sinks dependencies into amdaie.core operations. --- .../Transforms/AMDAIECanonicalizeDma.cpp | 2 +- .../AMDAIECanonicalizeDoublyStridedOp.cpp | 1 - .../AMDAIECanonicalizeNpuDmaCpyNd.cpp | 184 ++++++++ .../Transforms/AMDAIELowerToAIE.cpp | 410 +++++----------- .../Transforms/AMDAIESinkIntoCore.cpp | 105 +++++ .../iree-amd-aie/Transforms/CMakeLists.txt | 4 +- .../iree-amd-aie/Transforms/PassDetail.h | 4 +- .../iree-amd-aie/Transforms/Passes.cpp | 28 +- .../AMD-AIE/iree-amd-aie/Transforms/Passes.h | 7 + .../AMD-AIE/iree-amd-aie/Transforms/Passes.td | 34 +- .../Transforms/test/CMakeLists.txt | 2 + .../test/canonicalize_npu_dma_cpy_nd.mlir | 126 +++++ .../Transforms/test/lower_to_aie.mlir | 436 ++++-------------- .../Transforms/test/sink_into_core.mlir | 108 +++++ tests/samples/matmul_peeled_objectfifo.mlir | 2 +- .../samples/matmul_peeled_objectfifo_e2e.mlir | 2 +- 16 files changed, 816 insertions(+), 639 deletions(-) create mode 100644 compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIECanonicalizeNpuDmaCpyNd.cpp create mode 100644 compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESinkIntoCore.cpp create mode 100644 compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/canonicalize_npu_dma_cpy_nd.mlir create mode 100644 
compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/sink_into_core.mlir diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIECanonicalizeDma.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIECanonicalizeDma.cpp index d12454afa..c5e9efc30 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIECanonicalizeDma.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIECanonicalizeDma.cpp @@ -13,7 +13,7 @@ #include "mlir/Pass/Pass.h" #include "mlir/Transforms/GreedyPatternRewriteDriver.h" -#define DEBUG_TYPE "iree-amdaie-pack-to-dma" +#define DEBUG_TYPE "iree-amdaie-canonicalize-dma" namespace mlir::iree_compiler::AMDAIE { diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIECanonicalizeDoublyStridedOp.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIECanonicalizeDoublyStridedOp.cpp index b693e4b0d..e6b919eb0 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIECanonicalizeDoublyStridedOp.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIECanonicalizeDoublyStridedOp.cpp @@ -5,7 +5,6 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception #include "iree-amd-aie/IR/AMDAIEDialect.h" -#include "iree-amd-aie/IR/AMDAIEOps.h" #include "iree-amd-aie/Transforms/AMDAIEDmaUtils.h" #include "iree-amd-aie/Transforms/Passes.h" #include "mlir/Pass/Pass.h" diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIECanonicalizeNpuDmaCpyNd.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIECanonicalizeNpuDmaCpyNd.cpp new file mode 100644 index 000000000..4b6a401d0 --- /dev/null +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIECanonicalizeNpuDmaCpyNd.cpp @@ -0,0 +1,184 @@ +// Copyright 2024 The IREE Authors +// +// Licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include + +#include "iree-amd-aie/IR/AMDAIEDialect.h" +#include "iree-amd-aie/IR/AMDAIEOps.h" +#include "iree-amd-aie/Transforms/Passes.h" +#include "mlir/Dialect/Utils/StaticValueUtils.h" +#include "mlir/Pass/Pass.h" +#include "mlir/Pass/PassManager.h" + +#define DEBUG_TYPE "iree-amdaie-canonicalize-npu-dma-cpy-nd" + +namespace mlir::iree_compiler::AMDAIE { + +class AMDAIECanonicalizeNpuDmaCpyNdPass + : public impl::AMDAIECanonicalizeNpuDmaCpyNdBase< + AMDAIECanonicalizeNpuDmaCpyNdPass> { + public: + void getDependentDialects(DialectRegistry ®istry) const override { + registry.insert(); + } + + void runOnOperation() override { + MLIRContext *context = &getContext(); + ModuleOp moduleOp = getOperation(); + IRRewriter rewriter(context); + Attribute zero = rewriter.getIndexAttr(0); + Attribute one = rewriter.getIndexAttr(1); + + WalkResult walkResult = moduleOp->walk([&](NpuDmaCpyNdOp dmaOp) { + SmallVector srcOffsets = dmaOp.getSourceMixedOffsets(); + SmallVector srcSizes = dmaOp.getSourceMixedSizes(); + SmallVector srcStrides = dmaOp.getSourceMixedStrides(); + + SmallVector tgtOffsets = dmaOp.getTargetMixedOffsets(); + SmallVector tgtSizes = dmaOp.getTargetMixedSizes(); + SmallVector tgtStrides = dmaOp.getTargetMixedStrides(); + + // The first step in canonicalization is padding the offsets/sizes/strides + // vectors to be of rank `nbDimensions`. If the rank of any of these + // vectors is greater than `nbDimensions`, then this is impossible. 
+ bool allValidRanks = srcOffsets.size() <= nbDimensions && + srcSizes.size() <= nbDimensions && + srcStrides.size() <= nbDimensions && + tgtOffsets.size() <= nbDimensions && + tgtSizes.size() <= nbDimensions && + tgtStrides.size() <= nbDimensions; + if (!allValidRanks) { + dmaOp.emitOpError() + << " has offsets/sizes/strides attributes that are " + "larger than the target canonicalization dimension of " + << nbDimensions << "."; + return WalkResult::interrupt(); + } + + // If the source is in L3, then canonicalize the source addressing. + // 1) Pad to the correct rank + // 2) Move the zero stride (if any) to the outer-most (slowest) dim. + if (dmaOp.getSourceMemorySpaceAsUInt() == 0) { + if (!dmaOp.hasSourceAddressing()) { + dmaOp.emitOpError() + << "has source in L3, but does not have source addressing. " + "Source addressing is required to canonicalize here."; + return WalkResult::interrupt(); + } + srcOffsets = getPrepended(srcOffsets, zero); + srcSizes = getPrepended(srcSizes, one); + srcStrides = getPrepended(srcStrides, zero); + std::optional maybeSwapIndex = + verifyAndGetZeroStrideIndex(srcSizes, srcStrides, dmaOp); + if (!maybeSwapIndex.has_value()) return WalkResult::interrupt(); + uint32_t swapIndex = maybeSwapIndex.value(); + bubble(srcOffsets, swapIndex); + bubble(srcSizes, swapIndex); + bubble(srcStrides, swapIndex); + } + + if (dmaOp.getTargetMemorySpaceAsUInt() == 0) { + if (!dmaOp.hasTargetAddressing()) { + dmaOp.emitOpError() + << "has target in L3, but does not have target addressing. 
" + "Target addressing is required to canonicalize here."; + return WalkResult::interrupt(); + } + tgtOffsets = getPrepended(tgtOffsets, zero); + tgtSizes = getPrepended(tgtSizes, one); + tgtStrides = getPrepended(tgtStrides, zero); + std::optional maybeSwapIndex = + verifyAndGetZeroStrideIndex(tgtSizes, tgtStrides, dmaOp); + if (!maybeSwapIndex.has_value()) return WalkResult::interrupt(); + uint32_t swapIndex = maybeSwapIndex.value(); + bubble(tgtOffsets, swapIndex); + bubble(tgtSizes, swapIndex); + bubble(tgtStrides, swapIndex); + } + + rewriter.setInsertionPoint(dmaOp); + + // Replace the npu.dma_cpy_nd with the canonicalized version. + dmaOp = rewriter.replaceOpWithNewOp( + dmaOp, dmaOp.getDma(), dmaOp.getTarget(), tgtOffsets, tgtSizes, + tgtStrides, dmaOp.getTargetBdId(), dmaOp.getSource(), srcOffsets, + srcSizes, srcStrides, dmaOp.getSourceBdId()); + + return WalkResult::advance(); + }); + + if (walkResult.wasInterrupted()) return signalPassFailure(); + } + + private: + /// Repeat prepend 'def' to 'tail' to make 'tail' have nbDimensions elements. + SmallVector getPrepended(ArrayRef tail, + Attribute def) { + assert(tail.size() <= nbDimensions); + SmallVector res(nbDimensions, def); + std::copy(tail.begin(), tail.end(), + res.begin() + nbDimensions - tail.size()); + return res; + } + + static size_t getLowestIndexMaybeAboveOne(ArrayRef v) { + for (size_t i = 0; i < v.size(); i++) { + std::optional maybe = getConstantIntValue(v[i]); + if (!maybe.has_value() || maybe.value() > 1) return i; + } + return v.size(); + } + + static size_t getHighestIndexMaybeZero(ArrayRef v) { + for (size_t i = v.size(); i > 0; i--) { + std::optional maybe = getConstantIntValue(v[i - 1]); + if (!maybe.has_value() || maybe.value() == 0) return i - 1; + } + return 0; + } + + /// Get the highest index where the stride is 0. If this index is greater + /// than the lowest index where the size is greater than 1, then fail. 
+ std::optional verifyAndGetZeroStrideIndex( + ArrayRef sizes, ArrayRef strides, + NpuDmaCpyNdOp dmaOp) { + assert(strides.size() == sizes.size() && strides.size() == nbDimensions); + + size_t firstNonUnitDim = getLowestIndexMaybeAboveOne(sizes); + size_t lastZeroStrideDim = getHighestIndexMaybeZero(strides); + + if (firstNonUnitDim < lastZeroStrideDim) { + // HW limitation. + dmaOp.emitOpError("might have stride=0 in dimension ") + << lastZeroStrideDim << ", and size>1 in dimension " + << firstNonUnitDim << ". As " << firstNonUnitDim << " < " + << lastZeroStrideDim + << ", this cannot be supported -- the zero stride cannot be moved " + "to the outer-most (slowest) dimension, as required by current " + "AIE architecture."; + return {}; + } + return lastZeroStrideDim; + } + + /// Example, for swapIndex = 2. + /// Input + /// [0 1 7 13] + /// is mutated to + /// [7 0 1 13] + static void bubble(MutableArrayRef arr, size_t swapIndex) { + if (swapIndex > 0) { + std::rotate(arr.begin(), arr.begin() + swapIndex, + arr.begin() + swapIndex + 1); + } + } +}; + +std::unique_ptr createAMDAIECanonicalizeNpuDmaCpyNdPass() { + return std::make_unique(); +} + +} // namespace mlir::iree_compiler::AMDAIE diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELowerToAIE.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELowerToAIE.cpp index 284b297c9..663a4a552 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELowerToAIE.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELowerToAIE.cpp @@ -11,6 +11,7 @@ // //===----------------------------------------------------------------------===// +#include #include #include "aie/AIEDialect.h" @@ -19,10 +20,12 @@ #include "iree-amd-aie/IR/AMDAIEOps.h" #include "iree-amd-aie/Transforms/AMDAIEUtils.h" #include "iree-amd-aie/Transforms/Passes.h" -#include "mlir/Dialect/Linalg/IR/Linalg.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/Support/Debug.h" +#include 
"mlir/Dialect/Utils/StaticValueUtils.h" #include "mlir/IR/IRMapping.h" #include "mlir/IR/Iterators.h" -#include "mlir/Transforms/GreedyPatternRewriteDriver.h" +#include "mlir/Pass/PassManager.h" #define DEBUG_TYPE "iree-amdaie-lower-to-aie" @@ -59,9 +62,6 @@ void eraseOp(IRRewriter &rewriter, IRMapping &mapper, Operation *op) { // Convert amdaie.core operation to aie.core //===----------------------------------------------------------------------===// -namespace { - - /// Utility to convert vectors of `size` and `stride` into an /// `AIE::BDDimLayoutArrayAttr`. AIE::BDDimLayoutArrayAttr convertSizeStrideToBDDimLayoutArrayAttr( @@ -190,22 +190,22 @@ LogicalResult accessOpToAIE(IRRewriter &rewriter, "`aie.objectfifo.acquire` + subview operation"; } - memref::ReinterpretCastOp oldReinterpretOp; + SmallVector oldReinterpretOps; for (Operation *user : accessOp->getUsers()) { if (isa(user)) { - oldReinterpretOp = cast(user); - break; + oldReinterpretOps.push_back(cast(user)); } } - if (!oldReinterpretOp) { + if (oldReinterpretOps.empty()) { return accessOp.emitError() << "reinterpret-cast op has not been generated"; } + assert(oldReinterpretOps.size() == 1 && + "expected a single reinterpret-cast op"); + auto oldReinterpretOp = oldReinterpretOps[0]; auto type = cast(oldReinterpretOp.getResult().getType()); - MemRefType newType = MemRefType::Builder(type); - - llvm::ArrayRef sizes = newType.getShape(); + ArrayRef sizes = newType.getShape(); auto [strides, baseOffset] = getStridesAndOffset(newType); auto reinterpretOp = rewriter.create( rewriter.getUnknownLoc(), newType, subviewOp.getOutput(), baseOffset, @@ -259,7 +259,7 @@ LogicalResult acquireOpToAIE(IRRewriter &rewriter, auto subviewOp = rewriter.create( rewriter.getUnknownLoc(), elementType, objFifoAquireOp.getSubview(), - rewriter.getIntegerAttr(rewriter.getI32Type(), 0)); + /* index = */ rewriter.getIntegerAttr(rewriter.getI32Type(), 0)); // Map acquire op to new acquire + subview op. 
mapper.map(acquireOp.getOperation(), subviewOp.getOperation()); @@ -268,17 +268,6 @@ LogicalResult acquireOpToAIE(IRRewriter &rewriter, return success(); } -LogicalResult coreLinalgOpToAIE(IRRewriter &rewriter, linalg::LinalgOp linalgOp, - IRMapping &mapper, - SmallVector &toBeErased) { - LLVM_DEBUG(llvm::dbgs() << "Convert [linalg.LinalgOp]\n"); - OpBuilder::InsertionGuard guard(rewriter); - rewriter.setInsertionPoint(linalgOp); - rewriter.clone(*(linalgOp.getOperation()), mapper); - eraseOp(rewriter, mapper, linalgOp); - return success(); -} - LogicalResult coreMemrefExtractStridedMetadataToAIE( IRRewriter &rewriter, memref::ExtractStridedMetadataOp extractStridedMetadataOp, @@ -387,7 +376,7 @@ LogicalResult coreToAIE(IRRewriter &rewriter, AMDAIE::CoreOp coreOp, auto aieCoreOp = rewriter.create(rewriter.getUnknownLoc(), tileOp); Region &aieCoreRegion = aieCoreOp.getBody(); - auto aieCoreBlock = rewriter.createBlock(&aieCoreRegion); + Block *aieCoreBlock = rewriter.createBlock(&aieCoreRegion); auto insertIt = aieCoreBlock->begin(); auto coreBlockBegin = coreBlock->begin(); auto coreBlockEnd = coreBlock->getTerminator()->getIterator(); @@ -399,7 +388,7 @@ LogicalResult coreToAIE(IRRewriter &rewriter, AMDAIE::CoreOp coreOp, rewriter.create(rewriter.getUnknownLoc()); SmallVector toBeErased; - auto walkResult = aieCoreOp.walk([&](Operation *op) { + WalkResult walkResult = aieCoreOp.walk([&](Operation *op) { rewriter.setInsertionPoint(op); if (TypeSwitch(op) .Case([&](auto accessOp) { @@ -412,9 +401,6 @@ LogicalResult coreToAIE(IRRewriter &rewriter, AMDAIE::CoreOp coreOp, return coreReleaseOpToAIE(rewriter, releaseOp, mapper, toBeErased); }) - .Case([&](auto linalgOp) { - return coreLinalgOpToAIE(rewriter, linalgOp, mapper, toBeErased); - }) .Case( [&](auto extractStridedMetadataOp) { return coreMemrefExtractStridedMetadataToAIE( @@ -437,9 +423,7 @@ LogicalResult coreToAIE(IRRewriter &rewriter, AMDAIE::CoreOp coreOp, coreOp.emitError("could not convert to AIEDialect 
ops"); return failure(); } - for (auto *op : toBeErased) { - eraseOp(rewriter, mapper, op); - } + for (Operation *op : toBeErased) eraseOp(rewriter, mapper, op); mapper.map(coreOp.getResult(), aieCoreOp.getResult()); mapper.map(coreOp.getOperation(), aieCoreOp.getOperation()); @@ -460,11 +444,13 @@ LogicalResult circularDmaToAIE(IRRewriter &rewriter, int &dmaId) { LLVM_DEBUG(llvm::dbgs() << "Convert [AMDAIE::CircularDmaCpyNdOp]\n"); rewriter.setInsertionPointToEnd(deviceBlock); + if (!dmaOp.getSource()) return dmaOp.emitOpError() << "expected a source"; auto sourceLogicalObjFifo = dyn_cast( dmaOp.getSource().getDefiningOp()); if (!sourceLogicalObjFifo) return dmaOp.emitOpError() << "expected a logical objectFifo source"; + SmallVector newSourceTiles = llvm::map_to_vector(sourceLogicalObjFifo.getTiles(), [&](Value tile) { return mapper.lookup(tile); }); @@ -480,12 +466,13 @@ LogicalResult circularDmaToAIE(IRRewriter &rewriter, dmaOp.getTarget().getDefiningOp()); if (!targetLogicalObjFifo) return dmaOp.emitOpError() << "expected a logical objectFifo source"; + SmallVector newTargetTiles = llvm::map_to_vector(targetLogicalObjFifo.getTiles(), [&](Value tile) { return mapper.lookup(tile); }); auto symName = "obj" + std::to_string(dmaId++); - auto symAttr = rewriter.getStringAttr(symName); + StringAttr symAttr = rewriter.getStringAttr(symName); FailureOr objFifo = createObjectFifo(rewriter, dmaOp, newSourceTile, newTargetTiles, symAttr); if (failed(objFifo)) return failure(); @@ -497,175 +484,94 @@ LogicalResult circularDmaToAIE(IRRewriter &rewriter, // Convert amdaie.controlcode operation to NPU instruction func //===----------------------------------------------------------------------===// -namespace { - -/// Utility to get the static offsets, sizes and strides for -/// `AIEX::NpuDmaMemcpyNdOp` with explicit addressing. 
-LogicalResult getStaticDimsForExplicitAddressing( - Operation *op, const SmallVector &offsets, - const SmallVector &sizes, - const SmallVector &strides, - SmallVectorImpl &staticOffsets, - SmallVectorImpl &staticSizes, - SmallVectorImpl &staticStrides) { - if (offsets.size() > staticOffsets.size()) { - return op->emitError() << "size of `offsets` should be smaller or equal to " - "size of `staticOffsets`"; - } - if (sizes.size() > staticSizes.size()) { - return op->emitError() << "size of `sizes` should be smaller or equal to " - "size of `staticSizes`"; - } - if (strides.size() > staticStrides.size()) { - return op->emitError() << "size of `strides` should be smaller or equal to " - "size of `staticStrides`"; - } - if (getConstantIntValue(strides[strides.size() - 1]).value() != 1) { - return op->emitError() << "invalid last stride, should be 1"; - } - for (int i = 0; i < offsets.size(); ++i) - staticOffsets[staticOffsets.size() - offsets.size() + i] = - getConstantIntValue(offsets[i]).value(); - for (int i = 0; i < sizes.size(); ++i) - staticSizes[staticSizes.size() - sizes.size() + i] = - getConstantIntValue(sizes[i]).value(); - for (int i = 0; i < strides.size(); ++i) - staticStrides[staticStrides.size() - strides.size() + i] = - getConstantIntValue(strides[i]).value(); - return success(); -} - -/// Utility to move 'repeat dimension' with stride 0 and size > 1 to outermost -/// dimension as only that one can support a stride with value 0 in AIE2(+) -/// hardware. But first check that such a dimension is actually the first 'real -/// dimension' in the access pattern. 
-LogicalResult canonicalizeNpuStridedPatternForAIE( - SmallVectorImpl &offsets, SmallVectorImpl &sizes, - SmallVectorImpl &strides) { - bool foundNonUnitDim{false}; - for (size_t i = 0; i < offsets.size(); i++) { - if (strides[i] == 0 && sizes[i] == 1) { - continue; - } else if (strides[i] == 0) { - assert(sizes[i] > 0 && "size should be positive"); - if (foundNonUnitDim) return failure(); - foundNonUnitDim = true; - } else { - foundNonUnitDim = true; - } - } - // Either dim 0 is a 'repeat dimension' or if the repeat is on a different - // dimension, it guaranteed to be preceded by unit dimensions based on the - // former check. - for (size_t i = 1; i < offsets.size(); i++) { - if (strides[i] == 0 && sizes[i] > 1) { - strides[0] = 0; - sizes[0] = sizes[i]; - sizes[i] = 1; - } - } - return success(); -} - /// Convert the `amdaie.npu.dma_cpy_nd` operation to `aiex.npu.dma_memcpy_nd`. LogicalResult npuDmaCpyNdOpToAIE(IRRewriter &rewriter, AMDAIE::NpuDmaCpyNdOp dmaOp, SmallVector &toBeErased, IRMapping &mapper, IRMapping &bindingsMapper) { - rewriter.setInsertionPoint(dmaOp); + AMDAIE::CircularDmaCpyNdOp dmaCpyNd = dmaOp.getDmaCpyNdOp(); + + SmallVector offsets, sizes, strides; + ArrayRef staticOffsets, staticSizes, staticStrides; + AMDAIE::BdIdOp bdIdOp; + LogicalObjectFifoFromMemrefOp logicalObjFifo; + // Convert bidirectional `amdaie.npu.dma_cpy_nd` op into two halves. 
if (dmaOp.getSource()) { - auto sourceLogicalObjFifo = dyn_cast( + offsets = dmaOp.getSourceOffsets(); + sizes = dmaOp.getSourceSizes(); + strides = dmaOp.getSourceStrides(); + staticOffsets = dmaOp.getSourceStaticOffsets(); + staticSizes = dmaOp.getSourceStaticSizes(); + staticStrides = dmaOp.getSourceStaticStrides(); + bdIdOp = dmaOp.getSourceBdIdOp(); + if (!bdIdOp) { + return dmaOp.emitOpError() + << "must have a source BD ID op to lower to the AIE dialect."; + } + logicalObjFifo = dyn_cast( dmaOp.getSource().getDefiningOp()); - if (!sourceLogicalObjFifo) { + if (!logicalObjFifo) { return dmaOp.emitOpError() << "expected source to be an " "`amdaie.logicalobjectfifo.from_memref`"; } - if (!dmaOp.hasSourceAddressing()) { - return dmaOp.emitOpError() - << "expected source addressing for DMA with source on L3"; - } - AMDAIE::BdIdOp bdIdOp = dmaOp.getSourceBdIdOp(); - if (!bdIdOp) - return dmaOp.emitOpError() << "expected to have a source BD ID op"; - - // DmaOp either has explicit source addressing OR the defining op of its - // source has its source on L3. 
- SmallVector empty; - SmallVector staticOffsets(4, 0); - SmallVector staticSizes(4, 1); - SmallVector staticStrides(4, 0); - if (failed(getStaticDimsForExplicitAddressing( - dmaOp, dmaOp.getSourceMixedOffsets(), dmaOp.getSourceMixedSizes(), - dmaOp.getSourceMixedStrides(), staticOffsets, staticSizes, - staticStrides))) { - return failure(); - } - if (failed(canonicalizeNpuStridedPatternForAIE(staticOffsets, staticSizes, - staticStrides))) { - return dmaOp.emitError() << "could not canonicalize for AIE"; - } + } - AMDAIE::CircularDmaCpyNdOp dmaCpyNd = dmaOp.getDmaCpyNdOp(); - Value memref = bindingsMapper.lookup(sourceLogicalObjFifo.getMemref()); - auto objFifo = dyn_cast( - mapper.lookup(dmaCpyNd.getOperation())); - if (!objFifo) { - return dmaOp.emitError() - << "input isn't mapped to an `aie.objectifo` operation"; + else if (dmaOp.getTarget()) { + offsets = dmaOp.getTargetOffsets(); + sizes = dmaOp.getTargetSizes(); + strides = dmaOp.getTargetStrides(); + staticOffsets = dmaOp.getTargetStaticOffsets(); + staticSizes = dmaOp.getTargetStaticSizes(); + staticStrides = dmaOp.getTargetStaticStrides(); + bdIdOp = dmaOp.getTargetBdIdOp(); + if (!bdIdOp) { + return dmaOp.emitOpError() + << "must have a target BD ID op to lower to the AIE dialect."; } - bool issueToken = dmaOp.hasDmaWaitOpUser(); - rewriter.create( - rewriter.getUnknownLoc(), SmallVector{}, 0, 0, memref, empty, - empty, empty, staticOffsets, staticSizes, staticStrides, - objFifo.getName(), bdIdOp.getValue(), issueToken); - } - if (dmaOp.getTarget()) { - auto targetLogicalObjFifo = dyn_cast( + logicalObjFifo = dyn_cast( dmaOp.getTarget().getDefiningOp()); - if (!targetLogicalObjFifo) { + if (!logicalObjFifo) { return dmaOp.emitOpError() << "expected target to be an " "`amdaie.logicalobjectfifo.from_memref`"; } - if (!dmaOp.hasTargetAddressing()) { - return dmaOp.emitOpError() - << "expected target addressing for DMA with target on L3"; - } - AMDAIE::BdIdOp bdIdOp = dmaOp.getTargetBdIdOp(); - if (!bdIdOp) 
- return dmaOp.emitOpError() << "expected to have a target BD ID op"; - - // DmaOp either has explicit target addressing OR the defining op of its - // source has its target on L3. - SmallVector empty; - SmallVector staticOffsets(4, 0); - SmallVector staticSizes(4, 1); - SmallVector staticStrides(4, 0); - if (failed(getStaticDimsForExplicitAddressing( - dmaOp, dmaOp.getTargetMixedOffsets(), dmaOp.getTargetMixedSizes(), - dmaOp.getTargetMixedStrides(), staticOffsets, staticSizes, - staticStrides))) { - return failure(); - } - if (failed(canonicalizeNpuStridedPatternForAIE(staticOffsets, staticSizes, - staticStrides))) { - return dmaOp.emitError() << "could not canonicalize for AIE"; - } + } - AMDAIE::CircularDmaCpyNdOp dmaCpyNd = dmaOp.getDmaCpyNdOp(); - Value memref = bindingsMapper.lookup(targetLogicalObjFifo.getMemref()); - auto objFifo = dyn_cast( - mapper.lookup(dmaCpyNd.getOperation())); - if (!objFifo) { - return dmaOp.emitError() - << "input isn't mapped to an `aie.objectifo` operation"; - } - bool issueToken = dmaOp.hasDmaWaitOpUser(); - rewriter.create( - rewriter.getUnknownLoc(), SmallVector{}, 0, 0, memref, empty, - empty, empty, staticOffsets, staticSizes, staticStrides, - objFifo.getName(), bdIdOp.getValue(), issueToken); + else { + return dmaOp.emitOpError() + << "has neither source not target memory space as L3."; } + + Value memref = bindingsMapper.lookup(logicalObjFifo.getMemref()); + + auto objFifo = + dyn_cast(mapper.lookup(dmaCpyNd.getOperation())); + + uint32_t bdId = bdIdOp.getValue(); + + if (!objFifo) { + return dmaOp.emitError() + << "input isn't mapped to an `aie.objectifo` operation"; + } + + if (!offsets.empty() || !sizes.empty() || !strides.empty()) { + // Not doing now as better to just eliminate use of aiex dialect + // altogether. + return dmaOp.emitError() + << "Expect all source offsets, sizes, and strides to be static at " + "this point. 
Dynamic values can be supported, just need to " + "cast from 'index' to 64-bit signless integer for " + "aiex.npu.dma_memcpy_nd."; + } + + bool issueToken = dmaOp.hasDmaWaitOpUser(); + + rewriter.setInsertionPoint(dmaOp); + rewriter.create( + dmaOp.getLoc(), SmallVector{}, 0, 0, memref, offsets, sizes, + strides, staticOffsets, staticSizes, staticStrides, objFifo.getName(), + bdId, issueToken); + toBeErased.push_back(dmaOp); return success(); } @@ -690,8 +596,8 @@ LogicalResult npuDmaWaitToAIE(IRRewriter &rewriter, AMDAIE::NpuDmaWaitOp waitOp, /// Insert the control code operations into the NPU instruction function. LogicalResult controlCodeToAie(IRRewriter &rewriter, - AMDAIE::ControlCodeOp &controlCodeOp, - xilinx::AIEX::RuntimeSequenceOp &funcOp, + AMDAIE::ControlCodeOp controlCodeOp, + xilinx::AIEX::RuntimeSequenceOp funcOp, IRMapping &mapper, IRMapping &bindingsMapper) { LLVM_DEBUG(llvm::dbgs() << "Convert [AMDAIE::ControlCodeOp]\n"); Block *funcBlock = &funcOp.getBody().front(); @@ -736,14 +642,10 @@ LogicalResult controlCodeToAie(IRRewriter &rewriter, return WalkResult::advance(); }); if (res.wasInterrupted()) return failure(); - for (auto *op : toBeErased) { - eraseOp(rewriter, mapper, op); - } + for (Operation *op : toBeErased) eraseOp(rewriter, mapper, op); return success(); } -} // namespace - //===----------------------------------------------------------------------===// // Convert amdaie.logicalobjectfifo.link operation to `aie.objectfifo.link` //===----------------------------------------------------------------------===// @@ -898,16 +800,19 @@ LogicalResult lowerToAIE(ModuleOp moduleOp) { if (funcOp.isPrivate()) { return WalkResult::advance(); } - // Insert AIE DeviceOp + + // Create aie.device. 
rewriter.setInsertionPoint(moduleBlock, moduleBlock->begin()); auto deviceOp = rewriter.create( rewriter.getUnknownLoc(), xilinx::AIE::AIEDeviceAttr::get(rewriter.getContext(), aieDevice)); - deviceOp.getRegion().emplaceBlock(); - Block *deviceBlock = &deviceOp.getRegion().front(); + Block *deviceBlock = &deviceOp.getRegion().emplaceBlock(); - // Create the signature of the NPU instruction sequence function. The HAL - // interface bindings are used to order the function parameters correctly. + // The amdaie.controlcode operation has no operands, but the + // aiex.runtime_sequence that it lowers to, does. Create the signature + // of the aiex.runtime_sequence operation that replaces the + // amdaie.controlcode. The HAL interface bindings are used to + // order the function parameters correctly. IRMapping bindingsMapper; SmallVector subspanOps; funcOp->walk([&](IREE::HAL::InterfaceBindingSubspanOp subspanOp) { @@ -918,13 +823,16 @@ LogicalResult lowerToAIE(ModuleOp moduleOp) { return a.getBinding().getZExtValue() < b.getBinding().getZExtValue(); }); rewriter.setInsertionPoint(deviceBlock, deviceBlock->begin()); + + // Create aiex.runtime_sequence inside aie.device auto npuFuncOp = rewriter.create( rewriter.getUnknownLoc(), rewriter.getStringAttr(funcOp.getSymName())); - npuFuncOp.getBody().push_back(new Block); - for (int i = 0, e = subspanOps.size(); i < e; i++) { - auto a = subspanOps[i].getResult(); - npuFuncOp.getBody().addArgument(a.getType(), a.getLoc()); - bindingsMapper.map(a, npuFuncOp.getBody().getArgument(i)); + Region &body = npuFuncOp.getBody(); + body.emplaceBlock(); + + for (auto &&a : llvm::enumerate(subspanOps)) { + body.addArgument(a.value().getType(), a.value().getLoc()); + bindingsMapper.map(a.value(), body.getArgument(a.index())); } // Walk the AIE regions ops and convert ops into pure AIEDialect ops. 
@@ -953,16 +861,19 @@ LogicalResult lowerToAIE(ModuleOp moduleOp) { // After walking the FuncOp, it has been converted into a DeviceOp and can // safely be erased. eraseOp(rewriter, mapper, funcOp); - return WalkResult::advance(); }); if (funcRes.wasInterrupted()) return failure(); - return success(); -} -/// Utility to erase all HAL bindings and dependent operations. -LogicalResult eraseHALBindings(ModuleOp moduleOp) { - IRRewriter rewriter(moduleOp.getContext()); + // All Ukernel related function declarations will be within aie.device, so + // delete the ones outside from the SymbolTable. + SymbolTable symbolTable(moduleOp); + moduleOp->walk([&](func::FuncOp funcOp) { + if (funcOp.isPrivate() && !funcOp->getParentOfType()) { + symbolTable.erase(funcOp); + } + }); + SmallVector opsToBeErased; moduleOp.walk([&](IREE::HAL::InterfaceBindingSubspanOp subspanOp) { opsToBeErased.push_back(subspanOp.getOperation()); @@ -980,49 +891,6 @@ LogicalResult eraseHALBindings(ModuleOp moduleOp) { return success(); } -/// Utility to move dependencies outside an operation into that operation. This -/// is for example needed for `aie.core` operations as MLIR-AIE expects all -/// dependencies, like constants, inside those core operations. -template -class MoveAllDependenciesIntoOp : public OpRewritePattern { - using OpRewritePattern::OpRewritePattern; - - LogicalResult matchAndRewrite(OpTy parentOp, - PatternRewriter &rewriter) const override { - bool addedDependency = false; - parentOp->walk([&](Operation *op) { - // Skip operations of type 'OpTy'. - if (isa(op)) { - return WalkResult::advance(); - } - // Check all operands and whether their defining operations are located - // outside the parentOp. - for (Value operand : op->getOperands()) { - if (!operand || !operand.getDefiningOp()) { - continue; - } - Operation *dependencyOp = operand.getDefiningOp(); - if (isa_and_nonnull( - op->getDialect())) { - // Skip AIE dialect operations. 
- continue; - } else if (!dependencyOp->getParentOfType()) { - // Clone the dependency operation into the parent operation's block - // and replace all uses. - rewriter.setInsertionPointToStart(&parentOp->getRegion(0).front()); - Operation *newOp = rewriter.clone(*dependencyOp); - dependencyOp->replaceUsesWithIf(newOp, [&](OpOperand &use) { - return use.getOwner()->getParentOfType() == parentOp; - }); - addedDependency = true; - } - } - return WalkResult::advance(); - }); - return success(addedDependency); - } -}; - class AMDAIELowerToAIEPass : public impl::AMDAIELowerToAIEBase { public: @@ -1031,46 +899,12 @@ class AMDAIELowerToAIEPass xilinx::AIEX::AIEXDialect>(); } - AMDAIELowerToAIEPass() = default; - AMDAIELowerToAIEPass(const AMDAIELowerToAIEPass &pass){}; - void runOnOperation() override; -}; - -void AMDAIELowerToAIEPass::runOnOperation() { - // Main function call to convert all operations into AIE dialect operations - // inside an AIE device. - if (failed(lowerToAIE(getOperation()))) { - return signalPassFailure(); - } - LLVM_DEBUG(llvm::dbgs() << "Module after lowerToAIE: " << getOperation()); - - // Clean up the HAL bindings and it's uses as they are not needed anymore. - if (failed(eraseHALBindings(getOperation()))) { - return signalPassFailure(); - } - - // Move all dependencies, like for example constants, that are residing - // outside core operations into those core operations. This is required by - // the AIE dialect. - MLIRContext *context = &getContext(); - RewritePatternSet patterns(context); - patterns.insert>(context); - if (failed( - applyPatternsAndFoldGreedily(getOperation(), std::move(patterns)))) { - return signalPassFailure(); + void runOnOperation() override { + // Main function call to convert all operations into AIE dialect + // operations inside an AIE device. 
+ if (failed(lowerToAIE(getOperation()))) return signalPassFailure(); } - - // All Ukernel related function declarations will be within aie.device, so - // delete the ones outside from the SymbolTable. - SymbolTable symbolTable(getOperation()); - getOperation()->walk([&](func::FuncOp funcOp) { - if (funcOp.isPrivate() && !funcOp->getParentOfType()) { - symbolTable.erase(funcOp); - } - }); -} - -} // namespace +}; std::unique_ptr createAMDAIELowerToAIEPass() { return std::make_unique(); diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESinkIntoCore.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESinkIntoCore.cpp new file mode 100644 index 000000000..f6116a144 --- /dev/null +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESinkIntoCore.cpp @@ -0,0 +1,105 @@ +// Copyright 2024 The IREE Authors +// +// Licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "aie/AIEDialect.h" +#include "iree-amd-aie/IR/AMDAIEDialect.h" +#include "iree-amd-aie/IR/AMDAIEOps.h" +#include "iree-amd-aie/Transforms/Passes.h" +#include "mlir/Dialect/Linalg/IR/Linalg.h" +#include "mlir/Dialect/SCF/Transforms/Transforms.h" +#include "mlir/Dialect/Tensor/IR/Tensor.h" +#include "mlir/Dialect/Vector/IR/VectorOps.h" +#include "mlir/IR/Verifier.h" +#include "mlir/Pass/Pass.h" +#include "mlir/Transforms/GreedyPatternRewriteDriver.h" +#include "mlir/Transforms/LoopInvariantCodeMotionUtils.h" + +#define DEBUG_TYPE "iree-amdaie-sink-into-core" + +namespace mlir::iree_compiler::AMDAIE { + +namespace { + +bool sinkInto(AMDAIE::CoreOp coreOp, PatternRewriter &rewriter) { + // Record if any ops are sunk into the core during this iteration. 
+ bool changed = false; + + // Collect all ops in the amdaie.core op + SmallVector opsInCore; + coreOp->walk([&](Operation *op) { + if (op == coreOp) return WalkResult::advance(); + opsInCore.push_back(op); + return WalkResult::advance(); + }); + + for (auto opInCore : opsInCore) { + for (Value operand : opInCore->getOperands()) { + if (!operand || !operand.getDefiningOp()) continue; + Operation *dependencyOp = operand.getDefiningOp(); + + // Skip if the dependency is already in the core. + if (coreOp->isAncestor(dependencyOp)) { + continue; + } + + // Ops in the amdaie dialect are probably related to data movement + // and should not be sunk into the core. This might need adjustment + // later. + if (dependencyOp->getDialect()->getNamespace() == + AMDAIE::AMDAIEDialect::getDialectNamespace()) { + continue; + } + + // Create a clone of the dependency op in the core region. + Region &r = coreOp->getRegion(0); + assert(r.getBlocks().size() == 1 && "expected single block region"); + rewriter.setInsertionPointToStart(&r.front()); + Operation *sunkOp = rewriter.clone(*dependencyOp); + + // Replace uses of the dependency op inside the core. 
+ dependencyOp->replaceUsesWithIf(sunkOp, [&](OpOperand &use) { + return coreOp->isAncestor(use.getOwner()); + }); + changed = true; + } + } + return changed; +} + +class SinkingPattern : public OpRewritePattern { + public: + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(AMDAIE::CoreOp coreOp, + PatternRewriter &rewriter) const override { + return success(sinkInto(coreOp, rewriter)); + } +}; + +class AMDAIESinkIntoCorePass + : public impl::AMDAIESinkIntoCoreBase { + public: + void getDependentDialects(DialectRegistry ®istry) const override { + registry.insert(); + } + void runOnOperation() override { + RewritePatternSet patterns(&getContext()); + patterns.insert(&getContext()); + if (failed(applyPatternsAndFoldGreedily(getOperation(), + std::move(patterns)))) { + signalPassFailure(); + } + } +}; + +} // namespace + +std::unique_ptr createAMDAIESinkIntoCorePass() { + return std::make_unique(); +} + +} // namespace mlir::iree_compiler::AMDAIE diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/CMakeLists.txt b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/CMakeLists.txt index 938171a48..05338ea55 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/CMakeLists.txt +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/CMakeLists.txt @@ -50,6 +50,7 @@ iree_cc_library( "AMDAIEAssignNpuDmaBdIds.cpp" "AMDAIEBufferizeToAllocation.cpp" "AMDAIECanonicalizeDma.cpp" + "AMDAIECanonicalizeNpuDmaCpyNd.cpp" "AMDAIECanonicalizeDoublyStridedOp.cpp" "AMDAIECombineStridedOps.cpp" "AMDAIEControlCodeLoopUnroll.cpp" @@ -70,7 +71,7 @@ iree_cc_library( "AMDAIEHoistLogicalObjFifo.cpp" "AMDAIEInsertCores.cpp" "AMDAIEInsertLoopsForVectorization.cpp" - "AMDAIELinkExecutables.cpp" + "AMDAIELinkExecutables.cpp" "AMDAIELocalizeLogicalObjectFifo.cpp" "AMDAIELowerExecutableTarget.cpp" "AMDAIELowerFuncArgs.cpp" @@ -85,6 +86,7 @@ iree_cc_library( "AMDAIEPad.cpp" "AMDAIEPeelForLoop.cpp" "AMDAIEPropagateDataLayout.cpp" + 
"AMDAIESinkIntoCore.cpp" "AMDAIETile.cpp" "AMDAIETileAndFuse.cpp" "AMDAIEUtils.cpp" diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h index 5bd44c2e7..abc75e0f4 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h @@ -28,6 +28,7 @@ namespace mlir::iree_compiler::AMDAIE { #define GEN_PASS_DEF_AMDAIEBUFFERIZETOALLOCATION #define GEN_PASS_DEF_AMDAIECANONICALIZEDMA #define GEN_PASS_DEF_AMDAIECANONICALIZEDOUBLYSTRIDEDOP +#define GEN_PASS_DEF_AMDAIECANONICALIZENPUDMACPYND #define GEN_PASS_DEF_AMDAIECLEANUP #define GEN_PASS_DEF_AMDAIECOMBINESTRIDEDOPS #define GEN_PASS_DEF_AMDAIECONTROLCODELOOPUNROLL @@ -63,11 +64,12 @@ namespace mlir::iree_compiler::AMDAIE { #define GEN_PASS_DEF_AMDAIEPACKANDTRANSPOSE #define GEN_PASS_DEF_AMDAIEPACKTODMA #define GEN_PASS_DEF_AMDAIEPAD -#define GEN_PASS_DEF_AMDAIEVECTORIZATION #define GEN_PASS_DEF_AMDAIEPEELFORLOOP #define GEN_PASS_DEF_AMDAIEPROPAGATEDATALAYOUT +#define GEN_PASS_DEF_AMDAIESINKINTOCORE #define GEN_PASS_DEF_AMDAIETILE #define GEN_PASS_DEF_AMDAIETILEANDFUSE +#define GEN_PASS_DEF_AMDAIEVECTORIZATION #include "iree-amd-aie/Transforms/Passes.h.inc" } // namespace mlir::iree_compiler::AMDAIE diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp index 81c54b413..cb3d87425 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp @@ -8,12 +8,20 @@ #include "aie/Passes.h" #include "aievec/Passes.h" -#include "air/Conversion/Passes.h" -#include "air/Transform/Passes.h" +#include "air/Conversion/AIRLoweringPass.h" +#include "air/Conversion/AIRRtToNpuPass.h" +#include "air/Conversion/AIRToAIEPass.h" +#include "air/Conversion/ConvertToAIRPass.h" 
+#include "air/Transform/AIRDependency.h" +#include "air/Transform/AIRDependencyCanonicalize.h" +#include "air/Transform/AIRDependencyScheduleOpt.h" +#include "air/Transform/AIRDmaToChannel.h" +#include "air/Transform/AIRHerdPlacementPass.h" +#include "air/Transform/AIRMiscPasses.h" +#include "air/Transform/AffineLoopOptPass.h" #include "iree-amd-aie/IR/AMDAIEAttrs.h" #include "iree-dialects/Dialect/LinalgTransform/Passes.h" #include "iree/compiler/Codegen/Common/Passes.h" -#include "iree/compiler/Utils/PassUtils.h" #include "iree/compiler/Utils/ToolUtils.h" #include "mlir/Conversion/AffineToStandard/AffineToStandard.h" #include "mlir/Conversion/ArithToLLVM/ArithToLLVM.h" @@ -137,6 +145,15 @@ static void addAMDAIEBufferizePasses(OpPassManager &pm) { addIREEComprehensiveBufferizePasses(pm, allocationFn, memCpyFn); } +void addAMDAIEToAIEPasses(OpPassManager &passManager) { + passManager.addPass(createAMDAIECanonicalizeNpuDmaCpyNdPass()); + passManager.addPass(createCanonicalizerPass()); + passManager.addPass(createAMDAIESinkIntoCorePass()); + passManager.addPass(createCanonicalizerPass()); + passManager.addPass(createAMDAIELowerToAIEPass()); + passManager.addPass(createCanonicalizerPass()); +} + void addPackPeelBasedPassPipeline(OpPassManager &funcPassManager, TilingConfig &tilingConfig) { // First level tiling using scf.forall @@ -619,7 +636,9 @@ void addAMDAIEObjectFifoLoweringPasses(OpPassManager &passManager) { passManager.addPass(createAMDAIEConvertCoreForallToForPass()); passManager.addPass(createCanonicalizerPass()); passManager.addPass(createAMDAIECoreLoopUnrollPass()); - passManager.addPass(createAMDAIELowerToAIEPass()); + + addAMDAIEToAIEPasses(passManager); + passManager.addPass(createCanonicalizerPass()); // Now lower using the AIE passes from MLIR-AIE. 
@@ -634,6 +653,7 @@ void addMLIRAIRLoweringPasses(OpPassManager &passManager, AMDAIEDevice device) { passManager.addPass(createEraseHALDescriptorTypeFromMemRefPass()); passManager.addPass(memref::createFoldMemRefAliasOpsPass()); passManager.addPass(createAMDAIEBridgeToAIRPass()); + // TODO (Erwei): Figure out a way to work with AMDAIEPackToDmaPass. if (clUseTilePipeline == TilePassPipeline::PackPeelPipeline) passManager.addPass(createAMDAIEDecomposeLinalgExtPackUnPackToAIRPass()); diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h index e01890f90..59a875330 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h @@ -77,6 +77,9 @@ std::unique_ptr createAMDAIEBufferizeToAllocationPass( /// Create pass to apply canonicalization to air.dma_memcpy_nd op's. std::unique_ptr createAMDAIECanonicalizeDmaPass(); +/// Create pass to canonicalize `amdaie.npu.dma_cpy_nd` operations. +std::unique_ptr createAMDAIECanonicalizeNpuDmaCpyNdPass(); + /// Create pass to canonicalize doubly strided operations. std::unique_ptr createAMDAIECanonicalizeDoublyStridedOpPass( AMDAIECanonicalizeDoublyStridedOpOptions options = {}); @@ -176,6 +179,7 @@ std::unique_ptr> createAMDAIELoweringStrategyPass( std::unique_ptr createAMDAIELowerFuncArgsPass(); /// Create pass to lower from the AMDAIE dialect to the AIE/AIEX dialects. +void addAMDAIEToAIEPasses(OpPassManager &); std::unique_ptr createAMDAIELowerToAIEPass(); /// Create pass to lower a sequence of operation(s) to a iree_codegen.ukernel.* @@ -211,6 +215,9 @@ std::unique_ptr createAMDAIEPadPass(AMDAIEPadOptions options = {}); std::unique_ptr createAMDAIEPeelForLoopPass( AMDAIEPeelForLoopOptions options = {}); +/// Create a pass to sink all dependencies into `amdaie.core` operations. 
+std::unique_ptr createAMDAIESinkIntoCorePass(); + /// Create pass to tile TilingInterface operations. std::unique_ptr createAMDAIETilePass(AMDAIETileOptions options = {}); diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td index 2dec5f951..9f6560870 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td @@ -26,7 +26,7 @@ def AMDAIEAIRDmaToAMDAIEDma : def AMDAIEAssignLogicalObjectFifoDepth : Pass<"iree-amdaie-assign-logical-objectfifo-depth", ""> { let summary = "Assign a buffer depth of the logical objectfifos."; - let constructor = + let constructor = "mlir::iree_compiler::AMDAIE::createAMDAIEAssignLogicalObjectFifoDepthPass()"; let options = [ Option<"l3BufferDepth", "l3-buffer-depth", "int64_t", /*default=*/"1", @@ -92,6 +92,24 @@ def AMDAIECanonicalizeDoublyStridedOp : ]; } + +def AMDAIECanonicalizeNpuDmaCpyNd : + Pass<"iree-amdaie-canonicalize-npu-dma-cpy-nd", "ModuleOp"> { + let summary = "Canonicalize npu.dma_cpy_nd operations."; +let description = [{ + Canonicalize the offsets/sizes/strides of npu.dma_cpy_nd operations on the L3 + side of the data movement, to make them more representative of the DMA in hardware. + This pass ensures the offsets/sizes/strides are of size `nbDimensions`, and that no + dimensions with size>1 have stride=0 except for dimension zero (outer dimension). + This is a HW constraint. 
+}]; + let constructor = "mlir::iree_compiler::AMDAIE::createAMDAIECanonicalizeNpuDmaCpyNdPass()"; + let options = [ + Option<"nbDimensions", "nb-dimensions", "uint64_t", /*default=*/"4", + "The number of dimensions the canonicalized offsets/sizes/strides must have."> + ]; +} + def AMDAIECleanup : InterfacePass<"iree-amdaie-cleanup", "mlir::FunctionOpInterface"> { let summary = "Pass to invoke several cleanup and canonicalization patterns."; @@ -430,6 +448,20 @@ def AMDAIEPropagateDataLayout : let constructor = "mlir::iree_compiler::AMDAIE::createAMDAIEPropagateDataLayoutPass()"; } +def AMDAIESinkIntoCore : + Pass<"iree-amdaie-sink-into-core", "ModuleOp"> { + let summary = "Clone constants and other ops into amdaie.cores"; + let description = [{ + The amdaie.core operation should be isolated from above for code generation. + This pass finds operations outside of cores, whose values are used inside of + cores, and creates clones of them inside of cores. Operations in the amdaie + dialect are not sunk into cores, as they are assumed to be data movement + related ops which should be kept outside of cores. 
+ }]; + let constructor = "mlir::iree_compiler::AMDAIE::createAMDAIESinkIntoCorePass()"; +} + + def AMDAIETile : InterfacePass<"iree-amdaie-tile", "mlir::FunctionOpInterface"> { let summary = "Pass to tile TilingInterface operations."; diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt index cf96a5383..affb368c6 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt @@ -17,6 +17,7 @@ iree_lit_test_suite( "bufferize_to_allocation.mlir" "canonicalize_dma.mlir" "canonicalize_doubly_strided_op.mlir" + "canonicalize_npu_dma_cpy_nd.mlir" "combine_strided_ops.mlir" "controlcode_loop_unrolling.mlir" "convert_core_forall_to_for.mlir" @@ -57,6 +58,7 @@ iree_lit_test_suite( "pad.mlir" "peel_for_loop.mlir" "propagate_data_layout.mlir" + "sink_into_core.mlir" "tile_and_fuse_using_scf_for.mlir" "tile_and_fuse_using_scf_forall.mlir" "tile_copy_using_scf_for.mlir" diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/canonicalize_npu_dma_cpy_nd.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/canonicalize_npu_dma_cpy_nd.mlir new file mode 100644 index 000000000..1c5ffc27a --- /dev/null +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/canonicalize_npu_dma_cpy_nd.mlir @@ -0,0 +1,126 @@ +// RUN: iree-opt --split-input-file --pass-pipeline="builtin.module(iree-amdaie-canonicalize-npu-dma-cpy-nd)" --verify-diagnostics %s | FileCheck %s + +module { + func.func @npu_dma_cpy_nd_with_invalid_repeat( + %arg0: index, + %arg1: !amdaie.logicalobjectfifo>, + %arg2: !amdaie.logicalobjectfifo>) { + amdaie.workgroup { + %0 = amdaie.circular_dma_cpy_nd(%arg1[] [] [], %arg2[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + amdaie.controlcode { + // expected-error @+1 {{'amdaie.npu.dma_cpy_nd' op 
might have stride=0 in dimension 2, and size>1 in dimension 1. As 1 < 2, this cannot be supported -- the zero stride cannot be moved to the outer-most (slowest) dimension, as required by current AIE architecture.}} + %1 = amdaie.npu.dma_cpy_nd %0([0, 0, 0, 32] [1, 32, 2, 32] [128, 64, 0, 1] bd_id = %arg0, [] [] []) + amdaie.end + } + } + return + } +} + +// ----- + +module { + func.func @npu_dma_cpy_nd_with_multiple_repeats( + %arg0: index, + %arg1: !amdaie.logicalobjectfifo>, + %arg2: !amdaie.logicalobjectfifo>) { + amdaie.workgroup { + %0 = amdaie.circular_dma_cpy_nd(%arg1[] [] [], %arg2[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + amdaie.controlcode { + // expected-error @+1 {{'amdaie.npu.dma_cpy_nd' op might have stride=0 in dimension 1, and size>1 in dimension 0. As 0 < 1, this cannot be supported -- the zero stride cannot be moved to the outer-most (slowest) dimension, as required by current AIE architecture.}} + %1 = amdaie.npu.dma_cpy_nd %0([0, 0, 0, 32] [2, 8, 2, 32] [0, 0, 64, 1] bd_id = %arg0, [] [] []) + amdaie.end + } + } + return + } +} + +// ----- + +module { + func.func @controlcode_invalid_implicit_l3_memref( + %arg0: index, + %arg1: !amdaie.logicalobjectfifo>, + %arg2: !amdaie.logicalobjectfifo>) { + amdaie.workgroup { + %0 = amdaie.circular_dma_cpy_nd(%arg1[] [] [], %arg2[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + amdaie.controlcode { + // expected-error @+1 {{'amdaie.npu.dma_cpy_nd' op has target in L3, but does not have target addressing. 
Target addressing is required to canonicalize}} + %1 = amdaie.npu.dma_cpy_nd %0([] [] [] bd_id = %arg0, [] [] []) + amdaie.end + } + } + return + } +} + +// ----- + +module { + // CHECK-LABEL: func @controlcode_rank_4_destination + func.func @controlcode_rank_4_destination( + %arg0: index, + %arg1: !amdaie.logicalobjectfifo>, + %arg2: !amdaie.logicalobjectfifo>) { + amdaie.workgroup { + %0 = amdaie.circular_dma_cpy_nd(%arg1[] [] [], %arg2[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + // CHECK: controlcode + amdaie.controlcode { + // CHECK: amdaie.npu.dma_cpy_nd + // CHECK-SAME: [0, 0, 0, 0] [1, 1, 1, 10] [0, 0, 0, 1] + %1 = amdaie.npu.dma_cpy_nd %0([0] [10] [1] bd_id = %arg0, [] [] []) + amdaie.end + } + } + return + } +} + +// ----- + +module { + // CHECK-LABEL: func @controlcode_rank_4_source + func.func @controlcode_rank_4_source( + %arg0: index, + %arg1: !amdaie.logicalobjectfifo>, + %arg2: !amdaie.logicalobjectfifo>) { + amdaie.workgroup { + %0 = amdaie.circular_dma_cpy_nd(%arg2[] [] [], %arg1[] [] []) : ( + !amdaie.logicalobjectfifo>, + !amdaie.logicalobjectfifo>) + // CHECK: controlcode + amdaie.controlcode { + // CHECK: amdaie.npu.dma_cpy_nd + // CHECK-SAME: [0, 0, 0, 0] [1, 1, 1, 10] [0, 0, 0, 1] + %1 = amdaie.npu.dma_cpy_nd %0([] [] [] bd_id = %arg0, [0] [10] [1]) + amdaie.end + } + } + return + } +} + +// ----- + +module { + // CHECK-LABEL: func @stride_zero_front + func.func @stride_zero_front( + %arg0: index, + %arg1: !amdaie.logicalobjectfifo>, + %arg2: !amdaie.logicalobjectfifo>) { + amdaie.workgroup { + %0 = amdaie.circular_dma_cpy_nd(%arg2[] [] [], %arg1[] [] []) : ( + !amdaie.logicalobjectfifo>, + !amdaie.logicalobjectfifo>) + // CHECK: controlcode + amdaie.controlcode { + // CHECK: amdaie.npu.dma_cpy_nd + // CHECK-SAME: [3, 1, 2, 4] [10, 1, 1, 12] [0, 100, 200, 300] + %1 = amdaie.npu.dma_cpy_nd %0([] [] [] bd_id = %arg0, [1, 2, 3, 4] [1, 1, 10, 12] [100, 200, 0, 300]) + amdaie.end + } + } + return + } +} diff --git 
a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/lower_to_aie.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/lower_to_aie.mlir index 9f5b2f6aa..83c624e29 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/lower_to_aie.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/lower_to_aie.mlir @@ -223,8 +223,8 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} // CHECK-DAG: func.func private @ukernel_A(memref, index) attributes {llvm.bareptr = true} // CHECK-DAG: func.func private @ukernel_B(memref, index, memref, index) attributes {llvm.bareptr = true} // CHECK-DAG: %[[TILE_0_2:.+]] = aie.tile(0, 2) +// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index // CHECK: aie.core(%[[TILE_0_2]]) -// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index // CHECK: %[[ACQUIRE:.+]] = aie.objectfifo.acquire // CHECK-SAME: Produce // CHECK: %[[ACCESS:.+]] = aie.objectfifo.subview.access %[[ACQUIRE]] @@ -295,6 +295,7 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} // ----- + // NOTE: Due to an AIE check that verifies whether AIE operations exist inside a // core, it's hard to create a very small minimal test. 
// @@ -431,7 +432,7 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} // expected-error @+1 {{could not convert to AIEDialect ops}} amdaie.controlcode { %obj0 = amdaie.logicalobjectfifo.from_memref %2, {%tile_0_0} : memref<32x64xi32> -> !amdaie.logicalobjectfifo> - // expected-error @+1 {{op expected to have a target BD ID op}} + // expected-error @+1 {{'amdaie.npu.dma_cpy_nd' op must have a target BD ID op to lower to the AIE dialect}} %npu_dma_0 = amdaie.npu.dma_cpy_nd %dma_target_l3(%obj0[%c0, %c32] [%c32, %c32] [%c64, %c1], [] [] []) : target_type = !amdaie.logicalobjectfifo> amdaie.npu.dma_wait(%npu_dma_0, S2MM) amdaie.end @@ -443,178 +444,6 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} // ----- -#pipeline_layout = #hal.pipeline.layout - ]> -]> -#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> -module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { - func.func @npu_dma_cpy_nd_invalid_addressing() { - amdaie.workgroup { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c2 = arith.constant 2 : index - %c32 = arith.constant 32 : index - %c64 = arith.constant 64 : index - %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : memref<32x64xi32> - memref.assume_alignment %2, 64 : memref<32x64xi32> - %tile_0_0 = amdaie.tile(%c0, %c0) - %tile_0_1 = amdaie.tile(%c0, %c1) - %tile_0_2 = amdaie.tile(%c0, %c2) - %bd_id_0 = amdaie.bd_id(%tile_0_0, 0) - %alloc_1 = memref.alloc() : memref<32x32xi32, 1> - %alloc_2 = memref.alloc() : memref<4x8x4x8xi32, 2> - %placeholder = amdaie.logicalobjectfifo.placeholder{%tile_0_0} : !amdaie.logicalobjectfifo> - %obj1 = amdaie.logicalobjectfifo.from_memref %alloc_1, {%tile_0_1} : memref<32x32xi32, 1> -> !amdaie.logicalobjectfifo> - %obj2 = amdaie.logicalobjectfifo.from_memref 
%alloc_2, {%tile_0_2} : memref<4x8x4x8xi32, 2> -> !amdaie.logicalobjectfifo> - %dma0 = amdaie.circular_dma_cpy_nd(%obj1[] [] [], %obj2[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - %dma_target_l3 = amdaie.circular_dma_cpy_nd(%placeholder[] [] [], %obj1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - amdaie.logicalobjectfifo.link[%dma0] -> [%dma_target_l3] () - memref.dealloc %alloc_2 : memref<4x8x4x8xi32, 2> - memref.dealloc %alloc_1 : memref<32x32xi32, 1> - // expected-error @+1 {{could not convert to AIEDialect ops}} - amdaie.controlcode { - %obj0 = amdaie.logicalobjectfifo.from_memref %2, {%tile_0_0} : memref<32x64xi32> -> !amdaie.logicalobjectfifo> - // expected-error @+1 {{op expected target addressing for DMA with target on L3}} - %npu_dma_0 = amdaie.npu.dma_cpy_nd %dma_target_l3(%obj0[] [] [] bd_id = %bd_id_0, [] [] []) : target_type = !amdaie.logicalobjectfifo> - amdaie.npu.dma_wait(%npu_dma_0, S2MM) - amdaie.end - } - } - return - } -} - -// ----- - -#pipeline_layout = #hal.pipeline.layout - ]> -]> -#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> -module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { - func.func @npu_dma_cpy_nd_with_invalid_repeat() { - amdaie.workgroup { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c2 = arith.constant 2 : index - %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : memref<32x64xi32> - memref.assume_alignment %2, 64 : memref<32x64xi32> - %tile_0_0 = amdaie.tile(%c0, %c0) - %tile_0_1 = amdaie.tile(%c0, %c1) - %tile_0_2 = amdaie.tile(%c0, %c2) - %bd_id_0 = amdaie.bd_id(%tile_0_0, 0) - %alloc_1 = memref.alloc() : memref<32x32xi32, 1> - %alloc_2 = memref.alloc() : memref<4x8x4x8xi32, 2> - %placeholder = amdaie.logicalobjectfifo.placeholder{%tile_0_0} : !amdaie.logicalobjectfifo> - %obj1 
= amdaie.logicalobjectfifo.from_memref %alloc_1, {%tile_0_1} : memref<32x32xi32, 1> -> !amdaie.logicalobjectfifo> - %obj2 = amdaie.logicalobjectfifo.from_memref %alloc_2, {%tile_0_2} : memref<4x8x4x8xi32, 2> -> !amdaie.logicalobjectfifo> - %dma0 = amdaie.circular_dma_cpy_nd(%obj1[] [] [], %obj2[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - %dma_target_l3 = amdaie.circular_dma_cpy_nd(%placeholder[] [] [], %obj1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - amdaie.logicalobjectfifo.link[%dma0] -> [%dma_target_l3] () - memref.dealloc %alloc_2 : memref<4x8x4x8xi32, 2> - memref.dealloc %alloc_1 : memref<32x32xi32, 1> - // expected-error @+1 {{could not convert to AIEDialect ops}} - amdaie.controlcode { - %obj0 = amdaie.logicalobjectfifo.from_memref %2, {%tile_0_0} : memref<32x64xi32> -> !amdaie.logicalobjectfifo> - // expected-error @+1 {{could not canonicalize for AIE}} - %npu_dma_0 = amdaie.npu.dma_cpy_nd %dma_target_l3(%obj0[0, 0, 0, 32] [1, 32, 2, 32] [0, 64, 0, 1] bd_id = %bd_id_0, [] [] []) : target_type = !amdaie.logicalobjectfifo> - amdaie.end - } - } - return - } -} - -// ----- - -#pipeline_layout = #hal.pipeline.layout - ]> -]> -#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> -module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { - func.func @npu_dma_cpy_nd_with_multiple_repeat() { - amdaie.workgroup { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c2 = arith.constant 2 : index - %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : memref<32x64xi32> - memref.assume_alignment %2, 64 : memref<32x64xi32> - %tile_0_0 = amdaie.tile(%c0, %c0) - %tile_0_1 = amdaie.tile(%c0, %c1) - %tile_0_2 = amdaie.tile(%c0, %c2) - %bd_id_0 = amdaie.bd_id(%tile_0_0, 0) - %alloc_1 = memref.alloc() : memref<32x32xi32, 1> - %alloc_2 = 
memref.alloc() : memref<4x8x4x8xi32, 2> - %placeholder = amdaie.logicalobjectfifo.placeholder{%tile_0_0} : !amdaie.logicalobjectfifo> - %obj1 = amdaie.logicalobjectfifo.from_memref %alloc_1, {%tile_0_1} : memref<32x32xi32, 1> -> !amdaie.logicalobjectfifo> - %obj2 = amdaie.logicalobjectfifo.from_memref %alloc_2, {%tile_0_2} : memref<4x8x4x8xi32, 2> -> !amdaie.logicalobjectfifo> - %dma0 = amdaie.circular_dma_cpy_nd(%obj1[] [] [], %obj2[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - %dma_target_l3 = amdaie.circular_dma_cpy_nd(%placeholder[] [] [], %obj1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - amdaie.logicalobjectfifo.link[%dma0] -> [%dma_target_l3] () - memref.dealloc %alloc_2 : memref<4x8x4x8xi32, 2> - memref.dealloc %alloc_1 : memref<32x32xi32, 1> - // expected-error @+1 {{could not convert to AIEDialect ops}} - amdaie.controlcode { - %obj0 = amdaie.logicalobjectfifo.from_memref %2, {%tile_0_0} : memref<32x64xi32> -> !amdaie.logicalobjectfifo> - // expected-error @+1 {{could not canonicalize for AIE}} - %npu_dma_0 = amdaie.npu.dma_cpy_nd %dma_target_l3(%obj0[0, 0, 0, 32] [2, 8, 2, 32] [0, 0, 64, 1] bd_id = %bd_id_0, [] [] []) : target_type = !amdaie.logicalobjectfifo> - amdaie.end - } - } - return - } -} - -// ----- - -// CHECK: aie.device -// CHECK: aiex.runtime_sequence @npu_dma_cpy_nd_with_repeat(%[[ARG0:.+]]: memref<32x64xi32> -// CHECK: aiex.npu.dma_memcpy_nd(0, 0, %[[ARG0]][0, 0, 0, 32][2, 1, 1, 32][0, 0, 0, 1]) -#pipeline_layout = #hal.pipeline.layout - ]> -]> -#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> -module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { - func.func @npu_dma_cpy_nd_with_repeat() { - amdaie.workgroup { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c2 = arith.constant 2 : index - %2 = hal.interface.binding.subspan layout(#pipeline_layout) 
set(0) binding(2) alignment(64) offset(%c0) : memref<32x64xi32> - memref.assume_alignment %2, 64 : memref<32x64xi32> - %tile_0_0 = amdaie.tile(%c0, %c0) - %tile_0_1 = amdaie.tile(%c0, %c1) - %tile_0_2 = amdaie.tile(%c0, %c2) - %bd_id_0 = amdaie.bd_id(%tile_0_0, 0) - %alloc_1 = memref.alloc() : memref<32x32xi32, 1> - %alloc_2 = memref.alloc() : memref<4x8x4x8xi32, 2> - %placeholder = amdaie.logicalobjectfifo.placeholder{%tile_0_0} : !amdaie.logicalobjectfifo> - %obj1 = amdaie.logicalobjectfifo.from_memref %alloc_1, {%tile_0_1} : memref<32x32xi32, 1> -> !amdaie.logicalobjectfifo> - %obj2 = amdaie.logicalobjectfifo.from_memref %alloc_2, {%tile_0_2} : memref<4x8x4x8xi32, 2> -> !amdaie.logicalobjectfifo> - %dma0 = amdaie.circular_dma_cpy_nd(%obj1[] [] [], %obj2[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - %dma_target_l3 = amdaie.circular_dma_cpy_nd(%placeholder[] [] [], %obj1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - amdaie.logicalobjectfifo.link[%dma0] -> [%dma_target_l3] () - memref.dealloc %alloc_2 : memref<4x8x4x8xi32, 2> - memref.dealloc %alloc_1 : memref<32x32xi32, 1> - amdaie.controlcode { - %obj0 = amdaie.logicalobjectfifo.from_memref %2, {%tile_0_0} : memref<32x64xi32> -> !amdaie.logicalobjectfifo> - %npu_dma_0 = amdaie.npu.dma_cpy_nd %dma_target_l3(%obj0[0, 0, 32] [1, 2, 32] [0, 0, 1] bd_id = %bd_id_0, [] [] []) : target_type = !amdaie.logicalobjectfifo> - amdaie.end - } - } - return - } -} - -// ----- - // CHECK: aie.device // CHECK: aiex.runtime_sequence @npu_dma_cpy_nd_with_repeat_already_on_outer_dim(%[[ARG0:.+]]: memref<32x64xi32> // CHECK: aiex.npu.dma_memcpy_nd(0, 0, %[[ARG0]][0, 0, 0, 32][2, 1, 2, 32][2, 0, 16, 1]) @@ -689,50 +518,41 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} // CHECK-SAME: issue_token = true // CHECK-SAME: metadata = @[[OBJ2]] // CHECK-NEXT: aiex.npu.dma_wait {symbol = @[[OBJ2]]} -#pipeline_layout = #hal.pipeline.layout - ]> -]> 
#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> +#pipeline_layout = #hal.pipeline.layout]>]> module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { func.func @controlcode() { + %c2 = arith.constant 2 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index amdaie.workgroup { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c2 = arith.constant 2 : index - %c32 = arith.constant 32 : index - %c64 = arith.constant 64 : index - %c1024 = arith.constant 1024 : index - %c2048 = arith.constant 2048 : index - %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : memref<32x64xi32> - memref.assume_alignment %2, 64 : memref<32x64xi32> - %tile_0_0 = amdaie.tile(%c0, %c0) - %tile_0_1 = amdaie.tile(%c0, %c1) - %tile_0_2 = amdaie.tile(%c0, %c2) - %bd_id_0 = amdaie.bd_id(%tile_0_0, 0) - %alloc_1 = memref.alloc() : memref<32x32xi32, 1> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : memref<32x64xi32> + memref.assume_alignment %0, 64 : memref<32x64xi32> + %tile = amdaie.tile(%c0, %c0) + %tile_0 = amdaie.tile(%c0, %c1) + %tile_1 = amdaie.tile(%c0, %c2) + %bd_id = amdaie.bd_id(%tile, 0) + %alloc = memref.alloc() : memref<32x32xi32, 1> %alloc_2 = memref.alloc() : memref<4x8x4x8xi32, 2> - %placeholder = amdaie.logicalobjectfifo.placeholder{%tile_0_0} : !amdaie.logicalobjectfifo> - %obj1 = amdaie.logicalobjectfifo.from_memref %alloc_1, {%tile_0_1} : memref<32x32xi32, 1> -> !amdaie.logicalobjectfifo> - %obj2 = amdaie.logicalobjectfifo.from_memref %alloc_2, {%tile_0_2} : memref<4x8x4x8xi32, 2> -> !amdaie.logicalobjectfifo> - %dma0 = amdaie.circular_dma_cpy_nd(%obj1[] [] [], %obj2[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - %dma_target_l3 = amdaie.circular_dma_cpy_nd(%placeholder[] [] [], %obj1[] [] 
[]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - %dma_source_l3 = amdaie.circular_dma_cpy_nd(%obj1[] [] [], %placeholder[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - %dma1 = amdaie.circular_dma_cpy_nd(%placeholder[] [] [], %obj1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - amdaie.logicalobjectfifo.link[%dma0] -> [%dma_target_l3] () + %1 = amdaie.logicalobjectfifo.placeholder{%tile} : !amdaie.logicalobjectfifo> + %2 = amdaie.logicalobjectfifo.from_memref %alloc, {%tile_0} : memref<32x32xi32, 1> -> !amdaie.logicalobjectfifo> + %3 = amdaie.logicalobjectfifo.from_memref %alloc_2, {%tile_1} : memref<4x8x4x8xi32, 2> -> !amdaie.logicalobjectfifo> + %4 = amdaie.circular_dma_cpy_nd(%2[] [] [], %3[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %5 = amdaie.circular_dma_cpy_nd(%1[] [] [], %2[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %6 = amdaie.circular_dma_cpy_nd(%2[] [] [], %1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + amdaie.logicalobjectfifo.link[%4] -> [%5] () memref.dealloc %alloc_2 : memref<4x8x4x8xi32, 2> - memref.dealloc %alloc_1 : memref<32x32xi32, 1> + memref.dealloc %alloc : memref<32x32xi32, 1> amdaie.controlcode { - %obj0 = amdaie.logicalobjectfifo.from_memref %2, {%tile_0_0} : memref<32x64xi32> -> !amdaie.logicalobjectfifo> - %npu_dma_0 = amdaie.npu.dma_cpy_nd %dma_target_l3(%obj0[%c0, %c32] [%c32, %c32] [%c64, %c1] bd_id = %bd_id_0, [] [] []) : target_type = !amdaie.logicalobjectfifo> - amdaie.npu.dma_wait(%npu_dma_0, S2MM) - %npu_dma_1 = amdaie.npu.dma_cpy_nd %dma_target_l3(%obj0[%c0] [%c1024] [%c1] bd_id = %bd_id_0, [] [] []) : target_type = !amdaie.logicalobjectfifo> - amdaie.npu.dma_wait(%npu_dma_1, S2MM) - %npu_dma_2 = amdaie.npu.dma_cpy_nd %dma_source_l3([] [] [], %obj0[%c0, %c32] [%c32, %c32] [%c64, %c1] bd_id = %bd_id_0) : source_type = !amdaie.logicalobjectfifo> - amdaie.npu.dma_wait(%npu_dma_2, MM2S) - 
%npu_dma_3 = amdaie.npu.dma_cpy_nd %dma_source_l3([] [] [], %obj0[%c0] [%c2048] [%c1] bd_id = %bd_id_0) : source_type = !amdaie.logicalobjectfifo> - amdaie.npu.dma_wait(%npu_dma_3, MM2S) + %7 = amdaie.logicalobjectfifo.from_memref %0, {%tile} : memref<32x64xi32> -> !amdaie.logicalobjectfifo> + %8 = amdaie.npu.dma_cpy_nd %5(%7[0, 0, 0, 32] [1, 1, 32, 32] [0, 0, 64, 1] bd_id = %bd_id, [] [] []) : target_type = !amdaie.logicalobjectfifo> + amdaie.npu.dma_wait(%8, S2MM) + %9 = amdaie.npu.dma_cpy_nd %5(%7[0, 0, 0, 0] [1, 1, 1, 1024] [0, 0, 0, 1] bd_id = %bd_id, [] [] []) : target_type = !amdaie.logicalobjectfifo> + amdaie.npu.dma_wait(%9, S2MM) + %10 = amdaie.npu.dma_cpy_nd %6([] [] [], %7[0, 0, 0, 32] [1, 1, 32, 32] [0, 0, 64, 1] bd_id = %bd_id) : source_type = !amdaie.logicalobjectfifo> + amdaie.npu.dma_wait(%10, MM2S) + %11 = amdaie.npu.dma_cpy_nd %6([] [] [], %7[0, 0, 0, 0] [1, 1, 1, 2048] [0, 0, 0, 1] bd_id = %bd_id) : source_type = !amdaie.logicalobjectfifo> + amdaie.npu.dma_wait(%11, MM2S) amdaie.end } } @@ -748,7 +568,7 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} // CHECK-DAG: %[[TILE_1_0:.*]] = aie.tile(1, 0) // CHECK: aie.objectfifo @[[OBJ0:.*]](%[[TILE_0_0]], {%[[TILE_0_1]]}, 2 : i32) : !aie.objectfifo> // CHECK: aie.objectfifo @[[OBJ1:.*]](%[[TILE_1_0]], {%[[TILE_0_1]]}, 2 : i32) : !aie.objectfifo> -// CHECK: aie.objectfifo @[[OBJ2:.*]](%[[TILE_0_1]] +// CHECK: aie.objectfifo @[[OBJ2:.*]](%[[TILE_0_1]] // CHECK-SAME: {%[[TILE_1_0]]}, 2 : i32) : !aie.objectfifo> // CHECK: aiex.runtime_sequence @bf16_f32_lit_test // CHECK-SAME: (%[[LHS:.*]]: memref<32x32xbf16>, %[[RHS:.*]]: memref<32x32xbf16>, %[[OUT:.*]]: memref<32x32xf32>) { @@ -765,25 +585,13 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} // CHECK-SAME: %[[LHS]][0, 0, 0, 0][1, 1, 1, 1024][0, 0, 0, 1] // CHECK-SAME: metadata = @[[OBJ0]] // CHECK-SAME: memref<32x32xbf16> -#pipeline_layout = #hal.pipeline.layout, - 
#hal.descriptor_set.binding<1, storage_buffer>, - #hal.descriptor_set.binding<2, storage_buffer> - ]> -]> #executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> +#pipeline_layout = #hal.pipeline.layout, <1, storage_buffer>, <2, storage_buffer>]>]> module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { func.func @bf16_f32_lit_test() { + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index amdaie.workgroup { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c2 = arith.constant 2 : index - %c32 = arith.constant 32 : index - %c16 = arith.constant 16 : index - %c512 = arith.constant 512 : index - %c256 = arith.constant 256 : index - %c1024 = arith.constant 1024 : index %alloc = memref.alloc() : memref<2x2x16x16xf32, 1 : i32> %alloc_0 = memref.alloc() : memref<1x2x32x16xbf16, 1 : i32> %tile = amdaie.tile(%c0, %c1) @@ -794,28 +602,28 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} %tile_1 = amdaie.tile(%c0, %c0) %tile_2 = amdaie.tile(%c1, %c0) %bd_id = amdaie.bd_id(%tile_1, 2) - %bd_id_2 = amdaie.bd_id(%tile_1, 1) - %bd_id_3 = amdaie.bd_id(%tile_1, 0) - %placeholder0 = amdaie.logicalobjectfifo.placeholder{%tile_1} : !amdaie.logicalobjectfifo> + %bd_id_3 = amdaie.bd_id(%tile_1, 1) + %bd_id_4 = amdaie.bd_id(%tile_1, 0) + %4 = amdaie.logicalobjectfifo.placeholder{%tile_1} : !amdaie.logicalobjectfifo> memref.assume_alignment %3, 64 : memref<32x32xbf16> %5 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : memref<32x32xbf16> - %placeholder1 = amdaie.logicalobjectfifo.placeholder{%tile_2} : !amdaie.logicalobjectfifo> + %6 = amdaie.logicalobjectfifo.placeholder{%tile_2} : !amdaie.logicalobjectfifo> memref.assume_alignment %5, 64 : memref<32x32xbf16> %7 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) 
alignment(64) offset(%c0) : memref<32x32xf32> - %placeholder2 = amdaie.logicalobjectfifo.placeholder{%tile_2} : !amdaie.logicalobjectfifo> - %9 = amdaie.circular_dma_cpy_nd(%2[] [] [], %placeholder0[] [] []) : (!amdaie.logicalobjectfifo, 2>, !amdaie.logicalobjectfifo>) - %10 = amdaie.circular_dma_cpy_nd(%1[] [] [], %placeholder1[] [] []) : (!amdaie.logicalobjectfifo, 2>, !amdaie.logicalobjectfifo>) - %11 = amdaie.circular_dma_cpy_nd(%placeholder2[] [] [], %0[%c0, %c0, %c0, %c0] [%c2, %c16, %c2, %c16] [%c512, %c16, %c256, %c1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo, 2>) + %8 = amdaie.logicalobjectfifo.placeholder{%tile_2} : !amdaie.logicalobjectfifo> + %9 = amdaie.circular_dma_cpy_nd(%2[] [] [], %4[] [] []) : (!amdaie.logicalobjectfifo, 2>, !amdaie.logicalobjectfifo>) + %10 = amdaie.circular_dma_cpy_nd(%1[] [] [], %6[] [] []) : (!amdaie.logicalobjectfifo, 2>, !amdaie.logicalobjectfifo>) + %11 = amdaie.circular_dma_cpy_nd(%8[] [] [], %0[0, 0, 0, 0] [2, 16, 2, 16] [512, 16, 256, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo, 2>) amdaie.controlcode { - %obj0 = amdaie.logicalobjectfifo.from_memref %3, {%tile_1} : memref<32x32xbf16> -> !amdaie.logicalobjectfifo> - %obj1 = amdaie.logicalobjectfifo.from_memref %5, {%tile_1} : memref<32x32xbf16> -> !amdaie.logicalobjectfifo> - %obj2 = amdaie.logicalobjectfifo.from_memref %7, {%tile_1} : memref<32x32xf32> -> !amdaie.logicalobjectfifo> - %12 = amdaie.npu.dma_cpy_nd %11(%obj2[%c0] [%c1024] [%c1] bd_id = %bd_id_3, [] [] []) : target_type = !amdaie.logicalobjectfifo> - %13 = amdaie.npu.dma_cpy_nd %10([] [] [], %obj1[%c0, %c1, %c2] [%c2, %c32, %c16] [%c16, %c32, %c1] bd_id = %bd_id_2) : source_type = !amdaie.logicalobjectfifo> - %14 = amdaie.npu.dma_cpy_nd %9([] [] [], %obj0[%c0] [%c1024] [%c1] bd_id = %bd_id) : source_type = !amdaie.logicalobjectfifo> - amdaie.npu.dma_wait(%12, S2MM) - amdaie.npu.dma_wait(%13, MM2S) - amdaie.npu.dma_wait(%14, MM2S) + %12 = amdaie.logicalobjectfifo.from_memref 
%3, {%tile_1} : memref<32x32xbf16> -> !amdaie.logicalobjectfifo> + %13 = amdaie.logicalobjectfifo.from_memref %5, {%tile_1} : memref<32x32xbf16> -> !amdaie.logicalobjectfifo> + %14 = amdaie.logicalobjectfifo.from_memref %7, {%tile_1} : memref<32x32xf32> -> !amdaie.logicalobjectfifo> + %15 = amdaie.npu.dma_cpy_nd %11(%14[0, 0, 0, 0] [1, 1, 1, 1024] [0, 0, 0, 1] bd_id = %bd_id_4, [] [] []) : target_type = !amdaie.logicalobjectfifo> + %16 = amdaie.npu.dma_cpy_nd %10([] [] [], %13[0, 0, 1, 2] [1, 2, 32, 16] [0, 16, 32, 1] bd_id = %bd_id_3) : source_type = !amdaie.logicalobjectfifo> + %17 = amdaie.npu.dma_cpy_nd %9([] [] [], %12[0, 0, 0, 0] [1, 1, 1, 1024] [0, 0, 0, 1] bd_id = %bd_id) : source_type = !amdaie.logicalobjectfifo> + amdaie.npu.dma_wait(%15, S2MM) + amdaie.npu.dma_wait(%16, MM2S) + amdaie.npu.dma_wait(%17, MM2S) amdaie.end } } @@ -823,52 +631,8 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} } } -// ----- -// Test to demonstrate invalid implicit L3 memref type that has rank greater than that -// expected for static offsets/sizes/strides. 
-#pipeline_layout = #hal.pipeline.layout - ]> -]> -#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> -module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { - func.func @controlcode_invalid_implicit_l3_memref() { - amdaie.workgroup { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c2 = arith.constant 2 : index - %c32 = arith.constant 32 : index - %c64 = arith.constant 64 : index - %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : memref<32x16x64x128x32xi32> - memref.assume_alignment %2, 64 : memref<32x16x64x128x32xi32> - %tile_0_0 = amdaie.tile(%c0, %c0) - %tile_0_1 = amdaie.tile(%c0, %c1) - %tile_0_2 = amdaie.tile(%c0, %c2) - %bd_id_0 = amdaie.bd_id(%tile_0_0, 0) - %alloc_1 = memref.alloc() : memref<32x32xi32, 1> - %alloc_2 = memref.alloc() : memref<4x8x4x8xi32, 2> - %placeholder = amdaie.logicalobjectfifo.placeholder{%tile_0_0} : !amdaie.logicalobjectfifo> - %obj1 = amdaie.logicalobjectfifo.from_memref %alloc_1, {%tile_0_1} : memref<32x32xi32, 1> -> !amdaie.logicalobjectfifo> - %obj2 = amdaie.logicalobjectfifo.from_memref %alloc_2, {%tile_0_2} : memref<4x8x4x8xi32, 2> -> !amdaie.logicalobjectfifo> - %dma0 = amdaie.circular_dma_cpy_nd(%obj1[] [] [], %obj2[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - %dma_target_l3 = amdaie.circular_dma_cpy_nd(%placeholder[] [] [], %obj1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - amdaie.logicalobjectfifo.link[%dma0] -> [%dma_target_l3] () - memref.dealloc %alloc_2 : memref<4x8x4x8xi32, 2> - memref.dealloc %alloc_1 : memref<32x32xi32, 1> - // expected-error @+1 {{could not convert to AIEDialect ops}} - amdaie.controlcode { - %obj0 = amdaie.logicalobjectfifo.from_memref %2, {%tile_0_0} : memref<32x16x64x128x32xi32> -> !amdaie.logicalobjectfifo> - // expected-error @+1 {{op expected target 
addressing for DMA with target on L3}} - %npu_dma_1 = amdaie.npu.dma_cpy_nd %dma_target_l3(%obj0[] [] [] bd_id = %bd_id_0, [] [] []) : target_type = !amdaie.logicalobjectfifo> - amdaie.npu.dma_wait(%npu_dma_1, S2MM) - amdaie.end - } - } - return - } -} + // ----- @@ -877,15 +641,15 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} // CHECK-DAG: %[[TILE_0_2:.+]] = aie.tile(0, 2) // CHECK-DAG: %[[TILE_0_1:.+]] = aie.tile(0, 1) // CHECK-DAG: %[[TILE_0_0:.+]] = aie.tile(0, 0) +// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index +// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index +// CHECK-DAG: %[[C8:.+]] = arith.constant 8 : index // CHECK: aie.objectfifo @[[OBJ0:.+]](%[[TILE_0_0]], {%[[TILE_0_1]]} // CHECK-NEXT: aie.objectfifo @[[OBJ1:.+]](%[[TILE_0_1]], {%[[TILE_0_2]], %[[TILE_1_2]]} -// CHECK-NEXT: aie.objectfifo.link +// CHECK-NEXT: aie.objectfifo.link // CHECK-SAME: @[[OBJ0]] // CHECK-SAME: @[[OBJ1]] // CHECK: aie.core(%[[TILE_0_2]]) -// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index -// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index -// CHECK-DAG: %[[C8:.+]] = arith.constant 8 : index // CHECK: %[[ACQUIRE_0:.+]] = aie.objectfifo.acquire @[[OBJ1]](Consume, 1) // CHECK: %[[ACCESS_0:.+]] = aie.objectfifo.subview.access %[[ACQUIRE_0]] // CHECK: %[[REINTERPRET_0:.+]] = memref.reinterpret_cast %[[ACCESS_0]] @@ -896,9 +660,6 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} // CHECK: aie.objectfifo.release // CHECK-SAME: @[[OBJ1]] // CHECK: aie.core(%[[TILE_1_2]]) -// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index -// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index -// CHECK-DAG: %[[C8:.+]] = arith.constant 8 : index // CHECK: %[[ACQUIRE_1:.+]] = aie.objectfifo.acquire @[[OBJ1]](Consume, 1) // CHECK: %[[ACCESS_1:.+]] = aie.objectfifo.subview.access %[[ACQUIRE_1]] // CHECK: %[[REINTERPRET_1:.+]] = memref.reinterpret_cast %[[ACCESS_1]] @@ -919,66 +680,61 @@ module attributes {hal.executable.target = 
#executable_target_amdaie_xclbin_fb} // CHECK-SAME: @[[OBJ0]] // CHECK-NEXT: aiex.npu.dma_wait // CHECK-SAME: @[[OBJ0]] -#pipeline_layout = #hal.pipeline.layout - ]> -]> #executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> +#pipeline_layout = #hal.pipeline.layout]>]> module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { func.func @large_example() { + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %c1 = arith.constant 1 : index + %c0_i32 = arith.constant 0 : i32 + %c0 = arith.constant 0 : index amdaie.workgroup { - %c0 = arith.constant 0 : index - %c0_i32 = arith.constant 0 : i32 - %c1 = arith.constant 1 : index - %c2 = arith.constant 2 : index - %c8 = arith.constant 8 : index - %c32 = arith.constant 32 : index - %c64 = arith.constant 64 : index - %tile_0_0 = amdaie.tile(%c0, %c0) - %tile_0_1 = amdaie.tile(%c0, %c1) - %tile_0_2 = amdaie.tile(%c0, %c2) - %tile_1_2 = amdaie.tile(%c1, %c2) - %bd_id_0 = amdaie.bd_id(%tile_0_0, 0) + %tile = amdaie.tile(%c0, %c0) + %tile_0 = amdaie.tile(%c0, %c1) + %tile_1 = amdaie.tile(%c0, %c2) + %tile_2 = amdaie.tile(%c1, %c2) + %bd_id = amdaie.bd_id(%tile, 0) %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : memref<32x64xi32> memref.assume_alignment %0, 64 : memref<32x64xi32> - %alloc_1 = memref.alloc() : memref<32x32xi32, 1> - %alloc_2 = memref.alloc() : memref<4x8x4x8xi32, 2> - %placeholder = amdaie.logicalobjectfifo.placeholder{%tile_0_0} : !amdaie.logicalobjectfifo> - %obj1 = amdaie.logicalobjectfifo.from_memref %alloc_1, {%tile_0_1} : memref<32x32xi32, 1> -> !amdaie.logicalobjectfifo> - %obj2 = amdaie.logicalobjectfifo.from_memref %alloc_2, {%tile_0_2, %tile_1_2} : memref<4x8x4x8xi32, 2> -> !amdaie.logicalobjectfifo> - %dma0 = amdaie.circular_dma_cpy_nd(%obj1[] [] [], %placeholder[] [] []) : (!amdaie.logicalobjectfifo>, 
!amdaie.logicalobjectfifo>) - %dma1 = amdaie.circular_dma_cpy_nd(%obj2[] [] [], %obj1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - amdaie.logicalobjectfifo.link[%dma0] -> [%dma1] () - %core_0_2 = amdaie.core(%tile_0_2, in : [%dma1], out : []) { - %1 = amdaie.logicalobjectfifo.acquire(%dma1, Consume) {size = 1 : i32} -> !amdaie.logicalobjectfifo> - %2 = amdaie.logicalobjectfifo.access(%1, Read) : !amdaie.logicalobjectfifo> -> memref<1024xi32, 2> - %3 = memref.reinterpret_cast %2 to offset: [0], sizes: [4, 8, 4, 8], strides: [256, 32, 8, 1] : memref<1024xi32, 2> to memref<4x8x4x8xi32, 2> - scf.for %arg2 = %c0 to %c8 step %c1 { - linalg.fill ins(%c0_i32 : i32) outs(%3 : memref<4x8x4x8xi32, 2>) + %alloc = memref.alloc() : memref<32x32xi32, 1> + %alloc_3 = memref.alloc() : memref<4x8x4x8xi32, 2> + %1 = amdaie.logicalobjectfifo.placeholder{%tile} : !amdaie.logicalobjectfifo> + %2 = amdaie.logicalobjectfifo.from_memref %alloc, {%tile_0} : memref<32x32xi32, 1> -> !amdaie.logicalobjectfifo> + %3 = amdaie.logicalobjectfifo.from_memref %alloc_3, {%tile_1, %tile_2} : memref<4x8x4x8xi32, 2> -> !amdaie.logicalobjectfifo> + %4 = amdaie.circular_dma_cpy_nd(%2[] [] [], %1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %5 = amdaie.circular_dma_cpy_nd(%3[] [] [], %2[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + amdaie.logicalobjectfifo.link[%4] -> [%5] () + %6 = amdaie.core(%tile_1, in : [%5], out : []) { + %8 = amdaie.logicalobjectfifo.acquire(%5, Consume) {size = 1 : i32} -> !amdaie.logicalobjectfifo> + %9 = amdaie.logicalobjectfifo.access(%8, Read) : !amdaie.logicalobjectfifo> -> memref<1024xi32, 2> + %reinterpret_cast = memref.reinterpret_cast %9 to offset: [0], sizes: [4, 8, 4, 8], strides: [256, 32, 8, 1] : memref<1024xi32, 2> to memref<4x8x4x8xi32, 2> + scf.for %arg0 = %c0 to %c8 step %c1 { + linalg.fill ins(%c0_i32 : i32) outs(%reinterpret_cast : memref<4x8x4x8xi32, 2>) } - 
amdaie.logicalobjectfifo.release(%dma1, Consume) {size = 1 : i32} + amdaie.logicalobjectfifo.release(%5, Consume) {size = 1 : i32} amdaie.end } - %core_1_2 = amdaie.core(%tile_1_2, in : [%dma1], out : []) { - %1 = amdaie.logicalobjectfifo.acquire(%dma1, Consume) {size = 1 : i32} -> !amdaie.logicalobjectfifo> - %2 = amdaie.logicalobjectfifo.access(%1, Read) : !amdaie.logicalobjectfifo> -> memref<1024xi32, 2> - %3 = memref.reinterpret_cast %2 to offset: [0], sizes: [4, 8, 4, 8], strides: [256, 32, 8, 1] : memref<1024xi32, 2> to memref<4x8x4x8xi32, 2> - scf.for %arg2 = %c0 to %c8 step %c1 { - linalg.fill ins(%c0_i32 : i32) outs(%3: memref<4x8x4x8xi32, 2>) + %7 = amdaie.core(%tile_2, in : [%5], out : []) { + %8 = amdaie.logicalobjectfifo.acquire(%5, Consume) {size = 1 : i32} -> !amdaie.logicalobjectfifo> + %9 = amdaie.logicalobjectfifo.access(%8, Read) : !amdaie.logicalobjectfifo> -> memref<1024xi32, 2> + %reinterpret_cast = memref.reinterpret_cast %9 to offset: [0], sizes: [4, 8, 4, 8], strides: [256, 32, 8, 1] : memref<1024xi32, 2> to memref<4x8x4x8xi32, 2> + scf.for %arg0 = %c0 to %c8 step %c1 { + linalg.fill ins(%c0_i32 : i32) outs(%reinterpret_cast : memref<4x8x4x8xi32, 2>) } - amdaie.logicalobjectfifo.release(%dma1, Consume) {size = 1 : i32} + amdaie.logicalobjectfifo.release(%5, Consume) {size = 1 : i32} amdaie.end } - memref.dealloc %alloc_2 : memref<4x8x4x8xi32, 2> - memref.dealloc %alloc_1 : memref<32x32xi32, 1> + memref.dealloc %alloc_3 : memref<4x8x4x8xi32, 2> + memref.dealloc %alloc : memref<32x32xi32, 1> amdaie.controlcode { - %obj0 = amdaie.logicalobjectfifo.from_memref %0, {%tile_0_0} : memref<32x64xi32> -> !amdaie.logicalobjectfifo> - %npu_dma = amdaie.npu.dma_cpy_nd %dma0([] [] [], %obj0[%c0, %c32] [%c32, %c32] [%c64, %c1] bd_id = %bd_id_0) : source_type = !amdaie.logicalobjectfifo> - amdaie.npu.dma_wait(%npu_dma, MM2S) + %8 = amdaie.logicalobjectfifo.from_memref %0, {%tile} : memref<32x64xi32> -> !amdaie.logicalobjectfifo> + %9 = 
amdaie.npu.dma_cpy_nd %4([] [] [], %8[0, 0, 0, 32] [1, 1, 32, 32] [0, 0, 64, 1] bd_id = %bd_id) : source_type = !amdaie.logicalobjectfifo> + amdaie.npu.dma_wait(%9, MM2S) amdaie.end } } return } } + diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/sink_into_core.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/sink_into_core.mlir new file mode 100644 index 000000000..f10ff8444 --- /dev/null +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/sink_into_core.mlir @@ -0,0 +1,108 @@ +// RUN: iree-opt --split-input-file --pass-pipeline="builtin.module(iree-amdaie-sink-into-core)" %s | FileCheck %s + +module { + // CHECK-LABEL: func @sink_into_single_core + func.func @sink_into_single_core(%arg0: index) { + // CHECK-NOT: arith.constant 3 : index + %c0 = arith.constant 0 : index + %c2 = arith.constant 2 : index + %c3 = arith.constant 3 : index + %0 = arith.addi %arg0, %c3 : index + %tile = amdaie.tile(%c0, %c2) + // CHECK: amdaie.core + %1 = amdaie.core(%tile, in : [], out : []) { + // CHECK: arith.constant 3 : index + // CHECK: arith.addi + // CHECK: linalg.fill + %alloc = memref.alloc() : memref<2x2xindex> + linalg.fill ins(%0 : index) outs(%alloc : memref<2x2xindex>) + amdaie.end + } + return + } +} + +// ----- + +module { + // Constants 0 and 1 are cloned into the cores, but not removed, because + // they are still used outside of the cores. Constants 2 and 3 are used only + // inside the cores, so they are cloned into the cores but then removed from + // the outer function. 
+ // CHECK-LABEL: func @sink_into_pair_of_cores + func.func @sink_into_pair_of_cores(%arg0 : index) { + // CHECK-NOT: arith.constant 3 : index + // CHECK-NOT: arith.constant 2 : index + // CHECK-DAG: arith.constant 1 : index + // CHECK-DAG: arith.constant 0 : index + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c2 = arith.constant 2 : index + %c3 = arith.constant 3 : index + %tile = amdaie.tile(%c0, %c0) + %tile_0 = amdaie.tile(%c0, %c1) + // CHECK: amdaie.core + %0 = amdaie.core(%tile, in : [], out : []) { + // CHECK-DAG: arith.constant 3 : index + // CHECK-DAG: arith.constant 2 : index + // CHECK-DAG: arith.constant 1 : index + %1 = arith.addi %arg0, %c1 : index + %2 = arith.addi %c1, %1 : index + %3 = arith.addi %2, %c2 : index + %4 = arith.addi %3, %c3 : index + %alloc = memref.alloc() : memref<2x2xindex> + linalg.fill ins(%4 : index) outs(%alloc : memref<2x2xindex>) + amdaie.end + } + // CHECK: amdaie.core + %1 = amdaie.core(%tile_0, in : [], out : []) { + // CHECK-DAG: arith.constant 3 : index + // CHECK-DAG: arith.constant 2 : index + // CHECK-DAG: arith.constant 1 : index + %1 = arith.addi %arg0, %c1 : index + %2 = arith.addi %c1, %1 : index + %3 = arith.addi %2, %c2 : index + %4 = arith.addi %3, %c3 : index + %alloc = memref.alloc() : memref<2x2xindex> + linalg.fill ins(%4 : index) outs(%alloc : memref<2x2xindex>) + amdaie.end + } + return + } +} + +// ----- + +module { + // CHECK-LABEL: dont_sink_amdaie_ops + // The 2 tiles, 2 logicalobjectfifos, and 1 dma_cpy_nd: + // CHECK-COUNT-5: amdaie + // CHECK: amdaie.core + // The logicalobjectfifo.access: + // CHECK-COUNT-1: amdaie + // CHECK: amdaie.end + func.func @dont_sink_amdaie_ops() { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c2 = arith.constant 2 : index + %alloc = memref.alloc() : memref<32x1024xi32, 1> + %alloc_0 = memref.alloc() : memref<32x64xi32, 2> + scf.forall (%arg0, %arg1) in (1, 1) { + %tile = amdaie.tile(%c0, %c1) + %tile_1 = amdaie.tile(%c0, 
%c2) + %0 = amdaie.logicalobjectfifo.from_memref %alloc, {%tile} : memref<32x1024xi32, 1> -> !amdaie.logicalobjectfifo> + %1 = amdaie.logicalobjectfifo.from_memref %alloc_0, {%tile_1} : memref<32x64xi32, 2> -> !amdaie.logicalobjectfifo> + %2 = amdaie.dma_cpy_nd(%1[] [] [], %0[0, 0] [0, 0] [0, 0]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %3 = amdaie.core(%tile_1, in : [%2], out : []) { + %c0_i32 = arith.constant 0 : i32 + %4 = amdaie.logicalobjectfifo.access(%1, Read) : !amdaie.logicalobjectfifo> -> memref<32x64xi32, 2> + linalg.fill ins(%c0_i32 : i32) outs(%4 : memref<32x64xi32, 2>) + amdaie.end + } + } {mapping = [#gpu.block, #gpu.block]} + memref.dealloc %alloc_0 : memref<32x64xi32, 2> + memref.dealloc %alloc : memref<32x1024xi32, 1> + return + } +} + diff --git a/tests/samples/matmul_peeled_objectfifo.mlir b/tests/samples/matmul_peeled_objectfifo.mlir index 013bc863d..c0a3dbf2a 100644 --- a/tests/samples/matmul_peeled_objectfifo.mlir +++ b/tests/samples/matmul_peeled_objectfifo.mlir @@ -1,4 +1,4 @@ -// RUN: iree-opt --pass-pipeline="builtin.module(fold-memref-alias-ops,iree-amdaie-pack-to-dma,air-copy-to-dma,iree-amdaie-air-dma-to-amdaie-dma,iree-amdaie-insert-cores,cse,iree-amdaie-localize-logicalobjectfifo,iree-amdaie-distribute-cores-and-objectfifos,cse,canonicalize,iree-amdaie-dma-to-circular-dma,func.func(iree-amdaie-create-aie-workgroup),cse,iree-amdaie-hoist-logical-objectfifo,iree-amdaie-canonicalize-doubly-strided-op,iree-amdaie-flatten-logicalobjectfifo,iree-amdaie-access-to-acquire-release,cse,canonicalize,iree-amdaie-dma-loop-subsumption,cse,canonicalize,iree-amdaie-assign-npu-dma-bd-ids,iree-amdaie-controlcode-loop-unroll,cse,canonicalize,iree-amdaie-create-logical-objectfifo-link,iree-amdaie-canonicalize-doubly-strided-op,iree-amdaie-lower-to-aie,canonicalize)" --split-input-file %s | FileCheck %s +// RUN: iree-opt 
--pass-pipeline="builtin.module(fold-memref-alias-ops,iree-amdaie-pack-to-dma,air-copy-to-dma,iree-amdaie-air-dma-to-amdaie-dma,iree-amdaie-insert-cores,cse,iree-amdaie-localize-logicalobjectfifo,iree-amdaie-distribute-cores-and-objectfifos,cse,canonicalize,iree-amdaie-dma-to-circular-dma,func.func(iree-amdaie-create-aie-workgroup),cse,iree-amdaie-canonicalize-doubly-strided-op,iree-amdaie-flatten-logicalobjectfifo,iree-amdaie-access-to-acquire-release,cse,canonicalize,iree-amdaie-dma-loop-subsumption,cse,canonicalize,iree-amdaie-assign-npu-dma-bd-ids,iree-amdaie-controlcode-loop-unroll,cse,canonicalize,iree-amdaie-create-logical-objectfifo-link,iree-amdaie-canonicalize-doubly-strided-op,canonicalize,iree-amdaie-canonicalize-npu-dma-cpy-nd,canonicalize,iree-amdaie-sink-into-core,canonicalize,iree-amdaie-lower-to-aie,canonicalize)" --split-input-file %s | FileCheck %s // CHECK: aie.device(npu1_4col) // CHECK-DAG: %[[TILE_0_2:.+]] = aie.tile(0, 2) diff --git a/tests/samples/matmul_peeled_objectfifo_e2e.mlir b/tests/samples/matmul_peeled_objectfifo_e2e.mlir index 484494045..9229da0c3 100644 --- a/tests/samples/matmul_peeled_objectfifo_e2e.mlir +++ b/tests/samples/matmul_peeled_objectfifo_e2e.mlir @@ -21,7 +21,7 @@ // CHECK-DAG: aie.mem(%[[TILE_1_2]]) // CHECK-DAG: aie.mem(%[[TILE_1_3]]) // CHECK-DAG: aie.shim_dma_allocation {{.*}}(S2MM, 0, 0) -// CHECK: {npu_instructions = +// CHECK: {npu_instructions = // CHECK-SAME: runtime_sequence_name = "matmul_i32_dispatch_0_matmul_128x128x256_i32" func.func @matmul_i32(%lhs: tensor<128x256xi32>, %rhs: tensor<256x128xi32>) -> tensor<128x128xi32> {