diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.td b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.td
index 4dbf77ebb..e614580de 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.td
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.td
@@ -255,6 +255,47 @@ def AMDAIE_NpuDmaCpyNdOp: AMDAIE_Op<"npu.dma_cpy_nd",
     CircularDmaCpyNdOp getDmaCpyNdOp() {
       return dyn_cast<CircularDmaCpyNdOp>(getDma().getDefiningOp());
     }
+
+    // Return the source memref type. This is retrieved using information from
+    // the input DMA operation.
+    MemRefType getSourceMemrefType() {
+      return cast<LogicalObjectFifoType>(getDmaCpyNdOp().getSourceType())
+          .getElementType();
+    }
+
+    // Return the source memory space as an attribute.
+    Attribute getSourceMemorySpace() {
+      return getSourceMemrefType().getMemorySpace();
+    }
+
+    // Helper method to return the source memory space as an integer. If there
+    // is no memory space attribute, this indicates the global memory space and
+    // we return 0. Otherwise, cast the memory space attribute to an integer.
+    uint64_t getSourceMemorySpaceAsUInt() {
+      Attribute memSpace = getSourceMemorySpace();
+      return memSpace ? cast<IntegerAttr>(memSpace).getInt() : 0;
+    }
+
+    // Return the target memref type. This is retrieved using information from
+    // the input DMA operation.
+    MemRefType getTargetMemrefType() {
+      return cast<LogicalObjectFifoType>(getDmaCpyNdOp().getTargetType())
+          .getElementType();
+    }
+
+    // Return the target memory space as an attribute.
+    Attribute getTargetMemorySpace() {
+      return getTargetMemrefType().getMemorySpace();
+    }
+
+    // Helper method to return the target memory space as an integer. If there
+    // is no memory space attribute, this indicates the global memory space and
+    // we return 0. Otherwise, cast the memory space attribute to an integer.
+    uint64_t getTargetMemorySpaceAsUInt() {
+      Attribute memSpace = getTargetMemorySpace();
+      return memSpace ? cast<IntegerAttr>(memSpace).getInt() : 0;
+    }
+
     // A utility to create a new doubly strided operation from this one with a
     // new set of source and target offsets, sizes and strides.
     DoublyStridedOpInterface createDoublyStridedOp(
@@ -496,7 +537,7 @@ def AMDAIE_LogicalObjectFifoFromMemrefOp
     // Else cast the memory space attribute to an integer.
     uint64_t getMemorySpaceAsUInt() {
       Attribute memSpace = getMemorySpace();
-      return memSpace ? dyn_cast<IntegerAttr>(memSpace).getInt() : 0;
+      return memSpace ? cast<IntegerAttr>(memSpace).getInt() : 0;
     }
 
     // Return the source memref type.
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIETargetModel.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIETargetModel.h
new file mode 100644
index 000000000..a50c8d0f8
--- /dev/null
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIETargetModel.h
@@ -0,0 +1,63 @@
+// Copyright 2024 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// In the absence of a complete hardware model interface, this file contains
+// some constants to describe hardware-related parameters used in
+// transformations. This is meant to be temporary.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef IREE_COMPILER_AMDAIE_TARGET_MODEL_H_
+#define IREE_COMPILER_AMDAIE_TARGET_MODEL_H_
+
+namespace mlir::iree_compiler::AMDAIE {
+
+//===----------------------------------------------------------------------===//
+//
+// DMA iteration dimensions
+//
+// DMAs support multi-dimensional addressing through buffer descriptors in two
+// ways:
+// 1. Intra-iteration access pattern. Specified via 'strides' ('steps' in
+// buffer descriptor lingo), 'sizes' ('wraps' in buffer descriptor lingo) and
+// 'padding'. When a DMA executes a buffer descriptor, it will access the data
+// (read/write) as specified by the intra-iteration access pattern.
+// 2. Inter-iteration access pattern. Specified via an iteration 'stride',
+// 'size' and 'current_iteration' ('stride' is the same as 'stepsize' and
+// 'size' is the same as 'wrap' in buffer descriptor lingo). Here,
+// 'current_iteration' keeps track of the current execution iteration of the
+// buffer descriptor and is incremented after buffer descriptor execution. The
+// 'stride' is the offset to be used for each execution of the buffer
+// descriptor, relative to the previous one. When 'current_iteration' is equal
+// to 'size', 'current_iteration' is reset to zero.
+//
+// Although all DMAs use the same buffer descriptor format to describe the
+// execution configuration, the intra-iteration and inter-iteration dimensions
+// are typically used for different purposes on different DMAs. See for example
+// the usage of these constants inside the DMA loop subsumption pass.
+//
+//===----------------------------------------------------------------------===//
+
+/// Shim DMAs support 3 intra-iteration dimensions + 1 inter-iteration
+/// dimension.
+static const int64_t kAMDAIEShimDmaNbIntraDims = 3;
+static const int64_t kAMDAIEShimDmaNbInterDims = 1;
+
+/// MemTile DMAs support 4 intra-iteration dimensions + 1 inter-iteration
+/// dimension.
+static const int64_t kAMDAIEMemTileDmaNbIntraDims = 4;
+static const int64_t kAMDAIEMemTileDmaNbInterDims = 1;
+
+/// Core DMAs support 3 intra-iteration dimensions + 1 inter-iteration
+/// dimension.
+static const int64_t kAMDAIECoreDmaNbIntraDims = 3;
+static const int64_t kAMDAIECoreDmaNbInterDims = 1;
+
+} // namespace mlir::iree_compiler::AMDAIE
+
+#endif // IREE_COMPILER_AMDAIE_TARGET_MODEL_H_
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/CMakeLists.txt b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/CMakeLists.txt
index c669deaca..6cc4b4c05 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/CMakeLists.txt
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/CMakeLists.txt
@@ -15,6 +15,7 @@ iree_cc_library(
     "AMDAIEAttrs.h"
     "AMDAIEDialect.h"
     "AMDAIEOps.h"
+    "AMDAIETargetModel.h"
     "AMDAIETypes.h"
   TEXTUAL_HDRS
     "AMDAIEAttrs.cpp.inc"
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDmaLoopSubsumption.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDmaLoopSubsumption.cpp
new file mode 100644
index 000000000..6c774b593
--- /dev/null
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDmaLoopSubsumption.cpp
@@ -0,0 +1,433 @@
+// Copyright 2024 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the transformation that subsumes a loop iteration into a
+// DMA access pattern if possible. This adds an additional dimension to the
+// DMA's access pattern and hoists the DMA operation out of the loop. This
+// transformation is possible if:
+//
+// - The loop's bounds and step size are all constants.
+// - The DMA is only operated on once within the loop's scope. Otherwise,
+// subsumption of the loop iteration into the DMA can change the temporal
+// behaviour of the program.
+// - The DMA has additional available access pattern dimensions. This
+// information is retrieved from a target hardware model.
+//
+//===----------------------------------------------------------------------===//
+
+#include "iree-amd-aie/IR/AMDAIEDialect.h"
+#include "iree-amd-aie/IR/AMDAIEOps.h"
+#include "iree-amd-aie/IR/AMDAIETargetModel.h"
+#include "iree-amd-aie/Transforms/Passes.h"
+#include "mlir/Dialect/Affine/IR/AffineOps.h"
+#include "mlir/Dialect/SCF/IR/SCF.h"
+#include "mlir/IR/AffineExprVisitor.h"
+#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
+
+#define DEBUG_TYPE "iree-amdaie-dma-loop-subsumption"
+
+namespace mlir::iree_compiler::AMDAIE {
+
+namespace {
+
+/// Return an ancestor of 'op' in 'block', or nullptr if no such ancestor.
+Operation *getAncestorInBlock(Operation *op, Block *block) {
+  if (!op || !block) return nullptr;
+  auto parent = op;
+  while (parent && (parent->getBlock() != block))
+    parent = parent->getParentOp();
+  return parent;
+}
+
+/// Utility affine expression visitor to retrieve the stride from the
+/// expression.
+struct RetrieveStrideSize : public AffineExprVisitor<RetrieveStrideSize> {
+  std::optional<int64_t> stride;
+  void visitMulExpr(AffineBinaryOpExpr expr) {
+    if (auto rhsSize = dyn_cast<AffineConstantExpr>(expr.getRHS());
+        isa<AffineDimExpr>(expr.getLHS())) {
+      stride = rhsSize.getValue();
+    } else if (auto lhsSize = dyn_cast<AffineConstantExpr>(expr.getLHS());
+               isa<AffineDimExpr>(expr.getRHS())) {
+      stride = lhsSize.getValue();
+    }
+  }
+};
+
+/// Utility to clean up the DMA users after loop subsumption + hoisting. This
+/// will hoist `amdaie.npu.dma_cpy_nd`'s users like `npu.dma_wait` as well.
+LogicalResult moveUsersToHoistedDMAScope(Operation *parentOp) {
+  IRRewriter rewriter(parentOp->getContext());
+  // Move the `amdaie.npu.dma_wait` operation after the parent op in the same
+  // block as the input `amdaie.npu.dma_cpy_nd` operation. This parent op will
+  // typically be a loop out of which the DMA operation has been hoisted.
+  // Moving the wait operation after this loop is important to avoid a deadlock
+  // with whatever operations are still remaining inside the loop's scope.
+  WalkResult res = parentOp->walk([&](AMDAIE::NpuDmaWaitOp npuDmaWaitOp) {
+    Operation *dmaOp = npuDmaWaitOp.getDma().getDefiningOp();
+    Operation *ancestorInSameBlock =
+        getAncestorInBlock(npuDmaWaitOp, dmaOp->getBlock());
+    if (!ancestorInSameBlock) {
+      npuDmaWaitOp->emitOpError(
+          "doesn't have an ancestor in the same scope as the source DMA op");
+      return WalkResult::interrupt();
+    }
+    rewriter.moveOpAfter(npuDmaWaitOp, ancestorInSameBlock);
+    return WalkResult::advance();
+  });
+  if (res.wasInterrupted()) return failure();
+  return success();
+}
+
+class SubsumeLoopIntoDMA
+    : public OpInterfaceRewritePattern<AMDAIE::DoublyStridedOpInterface> {
+  using OpInterfaceRewritePattern::OpInterfaceRewritePattern;
+
+  /// Utility to add a loop iteration to an offsets/sizes/strides access
+  /// pattern.
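+  ///
+  /// As an illustrative sketch (hypothetical values, consistent with the
+  /// logic below): subsuming `scf.for %i = %c0 to %c6 step %c1`, where `%i`
+  /// is directly used as an offset, into an access pattern with offsets
+  /// `[0, %i]`, sizes `[8, 16]` and strides `[16, 1]` inserts one new
+  /// outermost dimension, yielding offsets `[0, 0, 0]`, sizes `[6, 8, 16]`
+  /// and strides `[1, 16, 1]`.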
+  LogicalResult addIterationToAccessPattern(
+      RewriterBase &rewriter, int64_t lowerBound, int64_t upperBound,
+      int64_t step, const DenseSet<Value> &inductionValues,
+      SmallVector<OpFoldResult> &newOffsets,
+      SmallVector<OpFoldResult> &newSizes,
+      SmallVector<OpFoldResult> &newStrides) const {
+    SmallVector<OpFoldResult> insertOffsets;
+    SmallVector<OpFoldResult> insertSizes;
+    SmallVector<OpFoldResult> insertStrides;
+    for (auto &&[i, offset] : llvm::enumerate(newOffsets)) {
+      Value offsetValue = getValueOrCreateConstantIndexOp(
+          rewriter, rewriter.getUnknownLoc(), offset);
+      if (inductionValues.contains(offsetValue)) {
+        // Initialize the offsetStride to 1. This handles the case where an
+        // induction variable is directly used as an offset inside a strided
+        // operation.
+        int64_t offsetStride = 1;
+        // If the offset value is determined by an affine expression, retrieve
+        // the affine expression's stride multiplier and calculate the actual
+        // offset stride.
+        if (offsetValue.getDefiningOp() &&
+            isa<affine::AffineApplyOp>(offsetValue.getDefiningOp())) {
+          auto applyOp =
+              cast<affine::AffineApplyOp>(offsetValue.getDefiningOp());
+          // Retrieve the stride from the affine map using an affine expression
+          // visitor. This is the place where invalid maps are filtered out.
+          // Invalid cases will have `retriever.stride == std::nullopt` after
+          // visiting.
+          AffineMap affineMap = applyOp.getAffineMap();
+          RetrieveStrideSize retriever;
+          retriever.visit(affineMap.getResult(0));
+          if (!retriever.stride) return failure();
+          offsetStride *= retriever.stride.value();
+        }
+
+        // Multiplying by the step size handles the non-normalized case.
+        int64_t stride =
+            getConstantIntValue(newStrides[i]).value() * offsetStride * step;
+
+        newOffsets[i] = getAsIndexOpFoldResult(rewriter.getContext(),
+                                               lowerBound * offsetStride);
+        insertOffsets.push_back(
+            getAsIndexOpFoldResult(rewriter.getContext(), 0));
+
+        // The new size is equal to the number of iterations:
+        // ceilDiv(upperBound - lowerBound, step).
+        int64_t diff = upperBound - lowerBound;
+        assert(diff > 0 &&
+               "expected positive difference between upper bound and lower "
+               "bound");
+        assert(step > 0 && "expected positive step");
+        int64_t newSize = 1 + ((diff - 1) / step);
+        insertSizes.push_back(
+            getAsIndexOpFoldResult(rewriter.getContext(), newSize));
+
+        insertStrides.push_back(
+            getAsIndexOpFoldResult(rewriter.getContext(), stride));
+      }
+    }
+    newOffsets.insert(newOffsets.begin(), insertOffsets.begin(),
+                      insertOffsets.end());
+    newSizes.insert(newSizes.begin(), insertSizes.begin(), insertSizes.end());
+    newStrides.insert(newStrides.begin(), insertStrides.begin(),
+                      insertStrides.end());
+    return success();
+  }
+
+  /// Rewrite function for a doubly strided operation with any loop-like parent
+  /// operation.
+  LogicalResult rewriteWithLoopLikeOpParent(
+      AMDAIE::DoublyStridedOpInterface op, PatternRewriter &rewriter,
+      size_t sourceMaxNbDims, size_t targetMaxNbDims,
+      const SmallVector<int64_t> &lowerBounds,
+      const SmallVector<int64_t> &upperBounds,
+      const SmallVector<int64_t> &steps,
+      const SmallVector<DenseSet<Value>> &inductionValues,
+      const DenseSet<Value> &allInductionValues) const {
+    auto loopOp = dyn_cast<LoopLikeOpInterface>(op->getParentOp());
+    if (!loopOp) return failure();
+
+    // Initialize new access pattern offsets/sizes/strides with current values.
+    SmallVector<OpFoldResult> newSourceOffsets = op.getSourceMixedOffsets();
+    SmallVector<OpFoldResult> newSourceSizes = op.getSourceMixedSizes();
+    SmallVector<OpFoldResult> newSourceStrides = op.getSourceMixedStrides();
+    SmallVector<OpFoldResult> newTargetOffsets = op.getTargetMixedOffsets();
+    SmallVector<OpFoldResult> newTargetSizes = op.getTargetMixedSizes();
+    SmallVector<OpFoldResult> newTargetStrides = op.getTargetMixedStrides();
+
+    // Use source/target maxNbDims to check whether there are sufficient source
+    // and target dimensions. Otherwise, abort.
+    auto verifyNbDimsNeeded = [&](const SmallVector<Value> &dynamicOffsets,
+                                  size_t nbOffsets,
+                                  size_t maxNbDims) -> LogicalResult {
+      size_t counter = 0;
+      for (Value offset : dynamicOffsets)
+        if (allInductionValues.contains(offset)) counter++;
+      if (nbOffsets + counter > maxNbDims) return failure();
+      return success();
+    };
+    SmallVector<Value> dynamicSourceOffsets = op.getSourceOffsets();
+    SmallVector<Value> dynamicTargetOffsets = op.getTargetOffsets();
+    if (failed(verifyNbDimsNeeded(dynamicSourceOffsets,
+                                  newSourceOffsets.size(), sourceMaxNbDims)))
+      return failure();
+    if (failed(verifyNbDimsNeeded(dynamicTargetOffsets,
+                                  newTargetOffsets.size(), targetMaxNbDims)))
+      return failure();
+
+    // Add the loop iterations to the DMA access patterns.
+    for (auto &&[lb, ub, step, iterationIvValues] : llvm::reverse(
+             llvm::zip(lowerBounds, upperBounds, steps, inductionValues))) {
+      // Add loop iteration to the access pattern on the source side.
+      if (failed(addIterationToAccessPattern(
+              rewriter, lb, ub, step, iterationIvValues, newSourceOffsets,
+              newSourceSizes, newSourceStrides))) {
+        return failure();
+      }
+      // Add loop iteration to the access pattern on the target side.
+      if (failed(addIterationToAccessPattern(
+              rewriter, lb, ub, step, iterationIvValues, newTargetOffsets,
+              newTargetSizes, newTargetStrides))) {
+        return failure();
+      }
+    }
+
+    assert(newSourceOffsets.size() == newSourceSizes.size() &&
+           "expected same number of source offsets and sizes");
+    assert(newSourceOffsets.size() == newSourceStrides.size() &&
+           "expected same number of source offsets and strides");
+    assert(newTargetOffsets.size() == newTargetSizes.size() &&
+           "expected same number of target offsets and sizes");
+    assert(newTargetOffsets.size() == newTargetStrides.size() &&
+           "expected same number of target offsets and strides");
+
+    // Create new doubly strided operation with the updated access pattern and
+    // move it before the loop.
+    rewriter.setInsertionPoint(loopOp);
+    auto newDoublyStridedOp = op.createDoublyStridedOp(
+        rewriter, newTargetOffsets, newTargetSizes, newTargetStrides,
+        newSourceOffsets, newSourceSizes, newSourceStrides);
+    rewriter.replaceOp(op, newDoublyStridedOp.getOperation());
+    return success();
+  }
+
+  /// Main rewrite function for a doubly strided operation with a `scf.for`
+  /// parent operation. Only handle a loop induction variable with an
+  /// optional `affine.apply` user for now.
+  LogicalResult rewriteWithForOpParent(AMDAIE::DoublyStridedOpInterface op,
+                                       PatternRewriter &rewriter,
+                                       size_t sourceMaxNbDims,
+                                       size_t targetMaxNbDims) const {
+    auto forOp = dyn_cast<scf::ForOp>(op->getParentOp());
+    if (!forOp) return failure();
+
+    // Dynamic bounds or step are not supported.
+    std::optional<int64_t> lowerBound =
+        getConstantIntValue(forOp.getLowerBound());
+    std::optional<int64_t> upperBound =
+        getConstantIntValue(forOp.getUpperBound());
+    std::optional<int64_t> step = getConstantIntValue(forOp.getStep());
+    if (!lowerBound || !upperBound || !step) return failure();
+
+    // Only handle a loop induction variable with an optional `affine.apply`
+    // user for now.
+    Value iv = forOp.getInductionVar();
+    DenseSet<Value> curIvValues = {iv};
+    for (OpOperand &use : iv.getUses()) {
+      if (!use.getOwner()) continue;
+      if (auto userApplyOp = dyn_cast<affine::AffineApplyOp>(use.getOwner())) {
+        curIvValues.insert(userApplyOp.getResult());
+      }
+    }
+    if (!llvm::any_of(op->getOperands(), [&](Value operand) {
+          return curIvValues.contains(operand);
+        })) {
+      return failure();
+    }
+
+    SmallVector<int64_t> lowerBounds = {lowerBound.value()};
+    SmallVector<int64_t> upperBounds = {upperBound.value()};
+    SmallVector<int64_t> steps = {step.value()};
+    SmallVector<DenseSet<Value>> inductionValues = {curIvValues};
+    return rewriteWithLoopLikeOpParent(
+        op, rewriter, sourceMaxNbDims, targetMaxNbDims, lowerBounds,
+        upperBounds, steps, inductionValues, curIvValues);
+  }
+
+  /// Main rewrite function for a doubly strided operation with a `scf.forall`
+  /// parent operation. Only handle loop induction variables with an
+  /// optional `affine.apply` user for now.
+  LogicalResult rewriteWithForallOpParent(AMDAIE::DoublyStridedOpInterface op,
+                                          PatternRewriter &rewriter,
+                                          size_t sourceMaxNbDims,
+                                          size_t targetMaxNbDims) const {
+    auto forallOp = dyn_cast<scf::ForallOp>(op->getParentOp());
+    if (!forallOp) return failure();
+
+    // Dynamic bounds or step are not supported.
+    std::optional<SmallVector<int64_t>> lowerBounds =
+        getConstantIntValues(forallOp.getMixedLowerBound());
+    std::optional<SmallVector<int64_t>> upperBounds =
+        getConstantIntValues(forallOp.getMixedUpperBound());
+    std::optional<SmallVector<int64_t>> steps =
+        getConstantIntValues(forallOp.getMixedStep());
+    if (!lowerBounds || !upperBounds || !steps) return failure();
+
+    // A set of all induction variables and optional `affine.apply` user values
+    // for easy verification of whether any of the induction variables or
+    // `affine.apply` values are being used.
+    DenseSet<Value> allInductionValues;
+    // A vector of all induction-variable-dependent values for each induction
+    // var. Includes the induction variable itself and any `affine.apply`
+    // users.
+    SmallVector<DenseSet<Value>> inductionValues;
+    for (Value iv : forallOp.getInductionVars()) {
+      DenseSet<Value> curIvValues = {iv};
+      allInductionValues.insert(iv);
+      for (Operation *userOp : iv.getUsers()) {
+        if (auto userApplyOp = dyn_cast<affine::AffineApplyOp>(userOp)) {
+          curIvValues.insert(userApplyOp.getResult());
+          allInductionValues.insert(userApplyOp.getResult());
+        }
+      }
+      inductionValues.push_back(curIvValues);
+    }
+    // Return early if the strided operation doesn't use any of the
+    // induction variable dependent values.
+    if (!llvm::any_of(op->getOperands(), [&](Value operand) {
+          return allInductionValues.contains(operand);
+        })) {
+      return failure();
+    }
+    return rewriteWithLoopLikeOpParent(op, rewriter, sourceMaxNbDims,
+                                       targetMaxNbDims, lowerBounds.value(),
+                                       upperBounds.value(), steps.value(),
+                                       inductionValues, allInductionValues);
+  }
+
+  LogicalResult matchAndRewrite(AMDAIE::DoublyStridedOpInterface op,
+                                PatternRewriter &rewriter) const override {
+    // Depending on the DMA being targeted, there can be a different number of
+    // max dimensions supported by the hardware.
+    // Consider the different cases for Shim, MemTile and Core DMAs:
+    // - Shim DMAs: As the shim DMA typically isn't synchronized with other
+    // DMAs (through semaphore locks), the inter-iteration access pattern is
+    // typically used as an additional intra-iteration access pattern,
+    // resulting in 4 DMA dimensions which can be used to address global
+    // memory data.
+    // - MemTile DMAs: As the MemTile DMAs are typically synchronized with
+    // other DMAs for stream-through and double-buffering purposes, the
+    // inter-iteration dimension can't typically be used in the same way as
+    // the intra-iteration dimensions. Therefore, for now, only the
+    // intra-iteration dimensions can be used for DMA access patterns.
+    // - Core DMAs: As the core DMAs are typically synchronized with the core
+    // processor for data access purposes (read/write), the inter-iteration
+    // dimension can't typically be used in the same way as the
+    // intra-iteration dimensions. Therefore, for now, only the
+    // intra-iteration dimensions can be used for DMA access patterns.
+    size_t sourceMaxNbDims{0};
+    size_t targetMaxNbDims{0};
+    if (auto npuDmaOp = dyn_cast<AMDAIE::NpuDmaCpyNdOp>(op.getOperation())) {
+      uint64_t sourceMemspaceInt = npuDmaOp.getSourceMemorySpaceAsUInt();
+      uint64_t targetMemspaceInt = npuDmaOp.getTargetMemorySpaceAsUInt();
+      if (sourceMemspaceInt == 0) {
+        sourceMaxNbDims = kAMDAIEShimDmaNbIntraDims + kAMDAIEShimDmaNbInterDims;
+      } else if (sourceMemspaceInt == 1) {
+        sourceMaxNbDims = kAMDAIEMemTileDmaNbIntraDims;
+      } else if (sourceMemspaceInt == 2) {
+        sourceMaxNbDims = kAMDAIECoreDmaNbIntraDims;
+      }
+      if (targetMemspaceInt == 0) {
+        targetMaxNbDims = kAMDAIEShimDmaNbIntraDims + kAMDAIEShimDmaNbInterDims;
+      } else if (targetMemspaceInt == 1) {
+        targetMaxNbDims = kAMDAIEMemTileDmaNbIntraDims;
+      } else if (targetMemspaceInt == 2) {
+        targetMaxNbDims = kAMDAIECoreDmaNbIntraDims;
+      }
+
+      // Check that the DMA this `amdaie.npu.dma_cpy_nd` operation is operating
+      // on is not being touched elsewhere within the same scope. Otherwise,
+      // the rewrite is not valid in general, as it would change the temporal
+      // usage of the source DMA.
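+      // As an illustrative example: if a second `amdaie.npu.dma_cpy_nd`
+      // operation in the same loop body were to use the same DMA, hoisting
+      // only this one out of the loop would reorder its transfers relative to
+      // the other operation's transfers across iterations.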
+      Operation *parentOp = op->getParentOp();
+      if (!parentOp) return failure();
+      Value dma = npuDmaOp.getDma();
+      for (Operation *userOp : dma.getUsers()) {
+        if (userOp != op.getOperation() && parentOp->isProperAncestor(userOp)) {
+          return failure();
+        }
+      }
+    } else {
+      return failure();
+    }
+
+    if (isa<scf::ForOp>(op->getParentOp())) {
+      return rewriteWithForOpParent(op, rewriter, sourceMaxNbDims,
+                                    targetMaxNbDims);
+    } else if (isa<scf::ForallOp>(op->getParentOp())) {
+      return rewriteWithForallOpParent(op, rewriter, sourceMaxNbDims,
+                                       targetMaxNbDims);
+    } else {
+      return failure();
+    }
+  }
+};
+
+class AMDAIEDmaLoopSubsumptionPass
+    : public impl::AMDAIEDmaLoopSubsumptionBase<AMDAIEDmaLoopSubsumptionPass> {
+ public:
+  void getDependentDialects(DialectRegistry &registry) const override {
+    registry.insert<AMDAIEDialect>();
+  }
+
+  AMDAIEDmaLoopSubsumptionPass() = default;
+  AMDAIEDmaLoopSubsumptionPass(const AMDAIEDmaLoopSubsumptionPass &pass) {}
+  void runOnOperation() override;
+};
+
+void AMDAIEDmaLoopSubsumptionPass::runOnOperation() {
+  Operation *parentOp = getOperation();
+  MLIRContext *context = &getContext();
+  RewritePatternSet patterns(context);
+  patterns.insert<SubsumeLoopIntoDMA>(context);
+  if (failed(applyPatternsAndFoldGreedily(parentOp, std::move(patterns)))) {
+    parentOp->emitOpError("failed to subsume some loops into DMA operations");
+    return signalPassFailure();
+  }
+
+  if (failed(moveUsersToHoistedDMAScope(parentOp))) {
+    parentOp->emitOpError(
+        "failed to move DMA users to correct scope after loop subsumption");
+    return signalPassFailure();
+  }
+}
+
+}  // namespace
+
+std::unique_ptr<Pass> createAMDAIEDmaLoopSubsumptionPass() {
+  return std::make_unique<AMDAIEDmaLoopSubsumptionPass>();
+}
+
+}  // namespace mlir::iree_compiler::AMDAIE
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/CMakeLists.txt b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/CMakeLists.txt
index 189ea33b3..48015859d 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/CMakeLists.txt
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/CMakeLists.txt
@@ -55,6 +55,7 @@ iree_cc_library(
     "AMDAIECreateLogicalObjectFifoLink.cpp"
     "AMDAIECreateReferenceToAllocation.cpp"
     "AMDAIEDistributeCoresAndObjectFifos.cpp"
+    "AMDAIEDmaLoopSubsumption.cpp"
     "AMDAIEDmaToCircularDma.cpp"
     "AMDAIEDmaUtils.cpp"
     "AMDAIEFuseConsumerIntoLoop.cpp"
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h
index 4dda21cca..1b1d3276f 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h
@@ -34,6 +34,7 @@ namespace mlir::iree_compiler::AMDAIE {
 #define GEN_PASS_DEF_AMDAIECREATEREFERENCETOALLOCATION
 #define GEN_PASS_DEF_AMDAIEDECOMPOSELINALGEXTPACKUNPACKTOAIR
 #define GEN_PASS_DEF_AMDAIEDISTRIBUTECORESANDOBJECTFIFOS
+#define GEN_PASS_DEF_AMDAIEDMALOOPSUBSUMPTION
 #define GEN_PASS_DEF_AMDAIEDMATOCIRCULARDMA
 #define GEN_PASS_DEF_AMDAIEFUSECONSUMERINTOLOOP
 #define GEN_PASS_DEF_AMDAIEFUSEFILLINTOFORALL
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h
index a5536df39..615b412b5 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h
@@ -101,6 +101,10 @@ std::unique_ptr<Pass> createAMDAIEDecomposeLinalgExtPackUnPackToAIRPass();
 /// operations and distribute the logical objectFifos.
 std::unique_ptr<Pass> createAMDAIEDistributeCoresAndObjectFifosPass();
 
+/// Create a pass to subsume loop iterations into DMA operations' access
+/// patterns.
+std::unique_ptr<Pass> createAMDAIEDmaLoopSubsumptionPass();
+
 /// Create a pass to convert dma operations to circular dma operations.
 std::unique_ptr<Pass> createAMDAIEDmaToCircularDmaPass();
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td
index fad708a6e..1c2fd2faa 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td
@@ -117,6 +117,12 @@ def AMDAIEDistributeCoresAndObjectFifos :
   let constructor = "mlir::iree_compiler::AMDAIE::createAMDAIEDistributeCoresAndObjectFifosPass()";
 }
 
+def AMDAIEDmaLoopSubsumption :
+    Pass<"iree-amdaie-dma-loop-subsumption"> {
+  let summary = "Subsume loop iterations into DMA operations' access patterns.";
+  let constructor = "mlir::iree_compiler::AMDAIE::createAMDAIEDmaLoopSubsumptionPass()";
+}
+
 def AMDAIEDmaToCircularDma :
     Pass<"iree-amdaie-dma-to-circular-dma"> {
   let summary = "Convert dma operations to circular dma operations.";
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt
index 29e00d47b..ca0ada6b3 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt
@@ -22,6 +22,7 @@ iree_lit_test_suite(
     "create_reference_to_allocation.mlir"
     "disable_vectorization.mlir"
     "distribute_cores_and_objectfifos.mlir"
+    "dma_loop_subsumption.mlir"
    "dma_to_circular_dma.mlir"
     "fuse_consumer_into_loop_scf_for.mlir"
     "fuse_consumer_into_loop_scf_forall.mlir"
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/dma_loop_subsumption.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/dma_loop_subsumption.mlir
new file mode 100644
index 000000000..3172a272a
--- /dev/null
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/dma_loop_subsumption.mlir
@@ -0,0 +1,807 @@
+// RUN: iree-opt --pass-pipeline="builtin.module(func.func(iree-amdaie-dma-loop-subsumption,canonicalize))" --split-input-file --verify-diagnostics %s | FileCheck %s
+
+//===----------------------------------------------------------------------===//
+// Sanity checks for cases where no modification should happen.
+//===----------------------------------------------------------------------===//
+
+// Sanity check: ensure no modification in case of no loop dependency.
+// CHECK-LABEL: @npu_dma_cpy_nd_without_loop_dependency
+// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index
+// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index
+// CHECK-DAG: %[[C6:.+]] = arith.constant 6 : index
+// CHECK: %[[CIRC_DMA:.+]] = amdaie.circular_dma_cpy_nd
+// CHECK: amdaie.controlcode
+// CHECK: scf.forall (%{{.+}}, %{{.+}}) in (2, 2)
+// CHECK: scf.for %{{.+}} = %[[C0]] to %[[C6]] step %[[C1]]
+// CHECK: %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([0, 0, 0] [1, 8, 16] [128, 16, 1], [] [] [])
+// CHECK: amdaie.npu.dma_wait(%[[NPU_DMA]], S2MM)
+#map = affine_map<(d0) -> (d0 * 16)>
+func.func @npu_dma_cpy_nd_without_loop_dependency(%arg0: !amdaie.logicalobjectfifo>, %arg1: !amdaie.logicalobjectfifo>) {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c6 = arith.constant 6 : index
+  amdaie.workgroup {
+    %0 = amdaie.circular_dma_cpy_nd(%arg0[] [] [], %arg1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>)
+    amdaie.controlcode {
+      scf.forall (%arg2, %arg3) in (2, 2) {
+        scf.for %arg4 = %c0 to %c6 step %c1 {
+          %1 = affine.apply #map(%arg4)
+          %2 = amdaie.npu.dma_cpy_nd %0([0, 0, 0] [1, 8, 16] [128, 16, 1], [] [] [])
+          amdaie.npu.dma_wait(%2, S2MM)
+        }
+      }
+      amdaie.end
+    }
+  }
+  return
+}
+
+// -----
+
+// Ensure no modification in case of a dynamic offset not originating from an induction variable.
+// CHECK-LABEL: @dynamic_non_induction_var_offset
+// CHECK-SAME: %{{.+}}: !amdaie.logicalobjectfifo>, %{{.+}}: !amdaie.logicalobjectfifo>, %[[ARG:.+]]: index
+// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index
+// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index
+// CHECK-DAG: %[[C6:.+]] = arith.constant 6 : index
+// CHECK: %[[CIRC_DMA:.+]] = amdaie.circular_dma_cpy_nd
+// CHECK: amdaie.controlcode
+// CHECK: scf.for %{{.+}} = %[[C0]] to %[[C6]] step %[[C1]]
+// CHECK: %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([%[[ARG]]] [16] [1], [] [] [])
+// CHECK: amdaie.npu.dma_wait(%[[NPU_DMA]], S2MM)
+func.func @dynamic_non_induction_var_offset(%arg0: !amdaie.logicalobjectfifo>, %arg1: !amdaie.logicalobjectfifo>, %arg2: index) {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c6 = arith.constant 6 : index
+  amdaie.workgroup {
+    %0 = amdaie.circular_dma_cpy_nd(%arg0[] [] [], %arg1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>)
+    amdaie.controlcode {
+      scf.for %arg3 = %c0 to %c6 step %c1 {
+        %2 = amdaie.npu.dma_cpy_nd %0([%arg2] [16] [1], [] [] [])
+        amdaie.npu.dma_wait(%2, S2MM)
+      }
+      amdaie.end
+    }
+  }
+  return
+}
+
+// -----
+
+// Ensure no modification in case of invalid affine expressions.
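+// Note (explanatory, matching the pass's `RetrieveStrideSize` visitor): affine
+// expressions with a constant addend, e.g. `d0 * 16 + 3`, cannot be expressed
+// as a pure stride on the induction variable, so the visitor leaves its
+// `stride` unset and the rewrite is rejected.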
+// CHECK: #[[$MAP:.+]] = affine_map<(d0) -> (d0 * 16)> +// CHECK: #[[$MAP1:.+]] = affine_map<(d0) -> (d0 * 16 + 3)> +// CHECK: #[[$MAP2:.+]] = affine_map<(d0) -> (d0 + 3)> +// CHECK: #[[$MAP3:.+]] = affine_map<(d0) -> (d0 * 16 + 48)> +// CHECK-LABEL: @invalid_affine_expr +// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index +// CHECK-DAG: %[[C2:.+]] = arith.constant 2 : index +// CHECK-DAG: %[[C6:.+]] = arith.constant 6 : index +// CHECK: %[[CIRC_DMA:.+]] = amdaie.circular_dma_cpy_nd +// CHECK: amdaie.controlcode +// CHECK: scf.for %[[ARG2:.+]] = %[[C1]] to %[[C6]] step %[[C2]] +// CHECK: %[[APPLY1:.+]] = affine.apply #[[$MAP]](%[[ARG2]]) +// CHECK: %[[APPLY2:.+]] = affine.apply #[[$MAP1]](%[[ARG2]]) +// CHECK: amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([%[[APPLY2]]] [16] [1], [%[[APPLY1]]] [16] [1]) +// CHECK: %[[APPLY3:.+]] = affine.apply #[[$MAP2]](%[[ARG2]]) +// CHECK: amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([%[[APPLY3]]] [16] [1], [] [] []) +// CHECK: %[[APPLY4:.+]] = affine.apply #[[$MAP3]](%[[ARG2]]) +// CHECK: amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([%[[APPLY4]]] [16] [1], [] [] []) +#map = affine_map<(d0) -> (d0 * 16)> +#map1 = affine_map<(d0) -> (d0 * 16 + 3)> +#map2 = affine_map<(d0) -> (d0 + 3)> +#map3 = affine_map<(d0) -> ((d0 + 3) * 16)> +func.func @invalid_affine_expr(%arg0: !amdaie.logicalobjectfifo>, %arg1: !amdaie.logicalobjectfifo>) { + %c1 = arith.constant 1 : index + %c2 = arith.constant 2 : index + %c6 = arith.constant 6 : index + amdaie.workgroup { + %0 = amdaie.circular_dma_cpy_nd(%arg0[] [] [], %arg1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + amdaie.controlcode { + scf.for %arg2 = %c1 to %c6 step %c2 { + %1 = affine.apply #map(%arg2) + %2 = affine.apply #map1(%arg2) + %3 = amdaie.npu.dma_cpy_nd %0([%2] [16] [1], [%1] [16] [1]) + %4 = affine.apply #map2(%arg2) + %5 = amdaie.npu.dma_cpy_nd %0([%4] [16] [1], [] [] []) + %6 = affine.apply #map3(%arg2) + %7 = amdaie.npu.dma_cpy_nd %0([%6] [16] [1], [] [] []) + } + amdaie.end + } + } + return +} + +// ----- + +// Ensure no modification in case of too many dimensions, i.e. 4 existing +// dimensions in the case of an `amdaie.npu.dma_cpy_nd` with target on L3. 
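+// The target access pattern already uses the shim DMA limit of
+// kAMDAIEShimDmaNbIntraDims + kAMDAIEShimDmaNbInterDims = 4 dimensions
+// (global memory, i.e. memory space 0), so no loop dimension can be added.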
+// CHECK: #[[$MAP:.+]] = affine_map<(d0) -> (d0 * 16)> +// CHECK-LABEL: @npu_dma_cpy_nd_too_many_dims_target +// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index +// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index +// CHECK-DAG: %[[C6:.+]] = arith.constant 6 : index +// CHECK: %[[CIRC_DMA:.+]] = amdaie.circular_dma_cpy_nd +// CHECK: amdaie.controlcode +// CHECK: scf.for %[[ARG2:.+]] = %[[C0]] to %[[C6]] step %[[C1]] +// CHECK: %[[APPLY:.+]] = affine.apply #[[$MAP]](%[[ARG2]]) +// CHECK: %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([0, 0, 0, %[[APPLY]]] [1, 1, 8, 16] [128, 128, 16, 1], [] [] []) +// CHECK: amdaie.npu.dma_wait(%[[NPU_DMA]], S2MM) +#map = affine_map<(d0) -> (d0 * 16)> +func.func @npu_dma_cpy_nd_too_many_dims_target(%arg0: !amdaie.logicalobjectfifo>, %arg1: !amdaie.logicalobjectfifo>) { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c6 = arith.constant 6 : index + amdaie.workgroup { + %0 = amdaie.circular_dma_cpy_nd(%arg0[] [] [], %arg1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + amdaie.controlcode { + scf.for %arg2 = %c0 to %c6 step %c1 { + %1 = affine.apply #map(%arg2) + %2 = amdaie.npu.dma_cpy_nd %0([0, 0, 0, %1] [1, 1, 8, 16] [128, 128, 16, 1], [] [] []) + amdaie.npu.dma_wait(%2, S2MM) + } + amdaie.end + } + } + return +} + +// ----- + +// Ensure no modification in case of too many dimensions, i.e. 4 existing +// dimensions in the case of an `amdaie.npu.dma_cpy_nd` with source on L3. +// CHECK: #[[$MAP:.+]] = affine_map<(d0) -> (d0 * 16)> +// CHECK-LABEL: @npu_dma_cpy_nd_too_many_dims_source +// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index +// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index +// CHECK-DAG: %[[C6:.+]] = arith.constant 6 : index +// CHECK: %[[CIRC_DMA:.+]] = amdaie.circular_dma_cpy_nd +// CHECK: amdaie.controlcode +// CHECK: scf.for %[[ARG2:.+]] = %[[C0]] to %[[C6]] step %[[C1]] +// CHECK: %[[APPLY:.+]] = affine.apply #[[$MAP]](%[[ARG2]]) +// CHECK: %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([] [] [], [0, 0, 0, %[[APPLY]]] [1, 1, 8, 16] [128, 128, 16, 1]) +// CHECK: amdaie.npu.dma_wait(%[[NPU_DMA]], S2MM) +#map = affine_map<(d0) -> (d0 * 16)> +func.func @npu_dma_cpy_nd_too_many_dims_source(%arg0: !amdaie.logicalobjectfifo>, %arg1: !amdaie.logicalobjectfifo>) { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c6 = arith.constant 6 : index + amdaie.workgroup { + %0 = amdaie.circular_dma_cpy_nd(%arg0[] [] [], %arg1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + amdaie.controlcode { + scf.for %arg2 = %c0 to %c6 step %c1 { + %1 = affine.apply #map(%arg2) + %2 = amdaie.npu.dma_cpy_nd %0([] [] [], [0, 0, 0, %1] [1, 1, 8, 16] [128, 128, 16, 1]) + amdaie.npu.dma_wait(%2, S2MM) + } + amdaie.end + } + } + return +} + +// ----- + +// Ensure no modification in case of too many dimensions, i.e. 4 existing +// dimensions in the case of an `amdaie.npu.dma_cpy_nd` with target on L2. 
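+// On L2 (memory space 1), only the kAMDAIEMemTileDmaNbIntraDims = 4
+// intra-iteration dimensions are usable, since the inter-iteration dimension
+// is reserved for synchronization, so four existing dimensions leave no room.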
+// CHECK: #[[$MAP:.+]] = affine_map<(d0) -> (d0 * 16)> +// CHECK-LABEL: @npu_dma_cpy_nd_too_many_dims_target_on_l2 +// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index +// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index +// CHECK-DAG: %[[C6:.+]] = arith.constant 6 : index +// CHECK: %[[CIRC_DMA:.+]] = amdaie.circular_dma_cpy_nd +// CHECK: amdaie.controlcode +// CHECK: scf.for %[[ARG2:.+]] = %[[C0]] to %[[C6]] step %[[C1]] +// CHECK: %[[APPLY:.+]] = affine.apply #[[$MAP]](%[[ARG2]]) +// CHECK: %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([0, 0, 0, %[[APPLY]]] [1, 1, 8, 16] [128, 128, 16, 1], [] [] []) +// CHECK: amdaie.npu.dma_wait(%[[NPU_DMA]], S2MM) +#map = affine_map<(d0) -> (d0 * 16)> +func.func @npu_dma_cpy_nd_too_many_dims_target_on_l2(%arg0: !amdaie.logicalobjectfifo>, %arg1: !amdaie.logicalobjectfifo>) { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c6 = arith.constant 6 : index + amdaie.workgroup { + %0 = amdaie.circular_dma_cpy_nd(%arg0[] [] [], %arg1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + amdaie.controlcode { + scf.for %arg2 = %c0 to %c6 step %c1 { + %1 = affine.apply #map(%arg2) + %2 = amdaie.npu.dma_cpy_nd %0([0, 0, 0, %1] [1, 1, 8, 16] [128, 128, 16, 1], [] [] []) + amdaie.npu.dma_wait(%2, S2MM) + } + amdaie.end + } + } + return +} + +// ----- + +// Ensure no modification in case of too many dimensions, i.e. 4 existing +// dimensions in the case of an `amdaie.npu.dma_cpy_nd` with source on L2. +// CHECK: #[[$MAP:.+]] = affine_map<(d0) -> (d0 * 16)> +// CHECK-LABEL: @npu_dma_cpy_nd_too_many_dims_source_on_l2 +// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index +// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index +// CHECK-DAG: %[[C6:.+]] = arith.constant 6 : index +// CHECK: %[[CIRC_DMA:.+]] = amdaie.circular_dma_cpy_nd +// CHECK: amdaie.controlcode +// CHECK: scf.for %[[ARG2:.+]] = %[[C0]] to %[[C6]] step %[[C1]] +// CHECK: %[[APPLY:.+]] = affine.apply #[[$MAP]](%[[ARG2]]) +// CHECK: %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([] [] [], [0, 0, 0, %[[APPLY]]] [1, 1, 8, 16] [128, 128, 16, 1]) +// CHECK: amdaie.npu.dma_wait(%[[NPU_DMA]], S2MM) +#map = affine_map<(d0) -> (d0 * 16)> +func.func @npu_dma_cpy_nd_too_many_dims_source_on_l2(%arg0: !amdaie.logicalobjectfifo>, %arg1: !amdaie.logicalobjectfifo>) { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c6 = arith.constant 6 : index + amdaie.workgroup { + %0 = amdaie.circular_dma_cpy_nd(%arg0[] [] [], %arg1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + amdaie.controlcode { + scf.for %arg2 = %c0 to %c6 step %c1 { + %1 = affine.apply #map(%arg2) + %2 = amdaie.npu.dma_cpy_nd %0([] [] [], [0, 0, 0, %1] [1, 1, 8, 16] [128, 128, 16, 1]) + amdaie.npu.dma_wait(%2, S2MM) + } + amdaie.end + } + } + return +} + +// ----- + +// Ensure no modification in case of too many dimensions, i.e. 3 existing +// dimensions in the case of an `amdaie.npu.dma_cpy_nd` with target on L1. 
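+// On L1 (memory space 2), only the kAMDAIECoreDmaNbIntraDims = 3
+// intra-iteration dimensions are usable, so three existing dimensions leave
+// no room.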
+// CHECK: #[[$MAP:.+]] = affine_map<(d0) -> (d0 * 16)> +// CHECK-LABEL: @npu_dma_cpy_nd_too_many_dims_target_on_l1 +// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index +// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index +// CHECK-DAG: %[[C6:.+]] = arith.constant 6 : index +// CHECK: %[[CIRC_DMA:.+]] = amdaie.circular_dma_cpy_nd +// CHECK: amdaie.controlcode +// CHECK: scf.for %[[ARG2:.+]] = %[[C0]] to %[[C6]] step %[[C1]] +// CHECK: %[[APPLY:.+]] = affine.apply #[[$MAP]](%[[ARG2]]) +// CHECK: %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([0, 0, %[[APPLY]]] [1, 8, 16] [128, 16, 1], [] [] []) +// CHECK: amdaie.npu.dma_wait(%[[NPU_DMA]], S2MM) +#map = affine_map<(d0) -> (d0 * 16)> +func.func @npu_dma_cpy_nd_too_many_dims_target_on_l1(%arg0: !amdaie.logicalobjectfifo>, %arg1: !amdaie.logicalobjectfifo>) { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c6 = arith.constant 6 : index + amdaie.workgroup { + %0 = amdaie.circular_dma_cpy_nd(%arg0[] [] [], %arg1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + amdaie.controlcode { + scf.for %arg2 = %c0 to %c6 step %c1 { + %1 = affine.apply #map(%arg2) + %2 = amdaie.npu.dma_cpy_nd %0([0, 0, %1] [1, 8, 16] [128, 16, 1], [] [] []) + amdaie.npu.dma_wait(%2, S2MM) + } + amdaie.end + } + } + return +} + +// ----- + +// Ensure no modification in case of too many dimensions, i.e. 3 existing +// dimensions in the case of an `amdaie.npu.dma_cpy_nd` with source on L1. +// CHECK: #[[$MAP:.+]] = affine_map<(d0) -> (d0 * 16)> +// CHECK-LABEL: @npu_dma_cpy_nd_too_many_dims_source_on_l1 +// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index +// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index +// CHECK-DAG: %[[C6:.+]] = arith.constant 6 : index +// CHECK: %[[CIRC_DMA:.+]] = amdaie.circular_dma_cpy_nd +// CHECK: amdaie.controlcode +// CHECK: scf.for %[[ARG2:.+]] = %[[C0]] to %[[C6]] step %[[C1]] +// CHECK: %[[APPLY:.+]] = affine.apply #[[$MAP]](%[[ARG2]]) +// CHECK: %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([] [] [], [0, 0, %[[APPLY]]] [1, 8, 16] [128, 16, 1]) +// CHECK: amdaie.npu.dma_wait(%[[NPU_DMA]], S2MM) +#map = affine_map<(d0) -> (d0 * 16)> +func.func @npu_dma_cpy_nd_too_many_dims_source_on_l1(%arg0: !amdaie.logicalobjectfifo>, %arg1: !amdaie.logicalobjectfifo>) { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c6 = arith.constant 6 : index + amdaie.workgroup { + %0 = amdaie.circular_dma_cpy_nd(%arg0[] [] [], %arg1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + amdaie.controlcode { + scf.for %arg2 = %c0 to %c6 step %c1 { + %1 = affine.apply #map(%arg2) + %2 = amdaie.npu.dma_cpy_nd %0([] [] [], [0, 0, %1] [1, 8, 16] [128, 16, 1]) + amdaie.npu.dma_wait(%2, S2MM) + } + amdaie.end + } + } + return +} + +// ----- + +// Ensure no modification in case of multiple npu.dma_cpy_nd users with the same source in the same scope. 
+// CHECK: #[[$MAP:.+]] = affine_map<(d0) -> (d0 * 16)>
+// CHECK-LABEL: @for_with_multiple_npu_dma_cpy_nd_same_source
+// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index
+// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index
+// CHECK-DAG: %[[C6:.+]] = arith.constant 6 : index
+// CHECK: %[[CIRC_DMA:.+]] = amdaie.circular_dma_cpy_nd
+// CHECK: amdaie.controlcode
+// CHECK: scf.for %[[ARG2:.+]] = %[[C0]] to %[[C6]] step %[[C1]]
+// CHECK: %[[APPLY:.+]] = affine.apply #[[$MAP]](%[[ARG2]])
+// CHECK: %[[NPU_DMA_0:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([0, 0, %[[APPLY]]] [1, 8, 16] [128, 16, 1], [] [] [])
+// CHECK: amdaie.npu.dma_wait(%[[NPU_DMA_0]], S2MM)
+// CHECK: %[[NPU_DMA_1:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([0, 0, %[[APPLY]]] [1, 8, 16] [128, 16, 1], [] [] [])
+// CHECK: amdaie.npu.dma_wait(%[[NPU_DMA_1]], S2MM)
+#map = affine_map<(d0) -> (d0 * 16)>
+func.func @for_with_multiple_npu_dma_cpy_nd_same_source(%arg0: !amdaie.logicalobjectfifo>, %arg1: !amdaie.logicalobjectfifo>) {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c6 = arith.constant 6 : index
+  amdaie.workgroup {
+    %0 = amdaie.circular_dma_cpy_nd(%arg0[] [] [], %arg1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>)
+    amdaie.controlcode {
+      scf.for %arg2 = %c0 to %c6 step %c1 {
+        %1 = affine.apply #map(%arg2)
+        %2 = amdaie.npu.dma_cpy_nd %0([0, 0, %1] [1, 8, 16] [128, 16, 1], [] [] [])
+        amdaie.npu.dma_wait(%2, S2MM)
+        %3 = amdaie.npu.dma_cpy_nd %0([0, 0, %1] [1, 8, 16] [128, 16, 1], [] [] [])
+        amdaie.npu.dma_wait(%3, S2MM)
+      }
+      amdaie.end
+    }
+  }
+  return
+}
+
+// -----
+
+// Ensure no modification in case of multiple npu.dma_cpy_nd users with the same source in the same scope.
+// CHECK: #[[$MAP:.+]] = affine_map<(d0) -> (d0 * 16)>
+// CHECK-LABEL: @forall_with_multiple_npu_dma_cpy_nd_same_source
+// CHECK: %[[CIRC_DMA:.+]] = amdaie.circular_dma_cpy_nd
+// CHECK: amdaie.controlcode
+// CHECK: scf.forall (%[[ARG2:.+]], %[[ARG3:.+]]) in (2, 6)
+// CHECK: %[[APPLY:.+]] = affine.apply #[[$MAP]](%[[ARG3]])
+// CHECK: %[[NPU_DMA_0:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([0, 0, %[[APPLY]]] [1, 8, 16] [128, 16, 1], [] [] [])
+// CHECK: amdaie.npu.dma_wait(%[[NPU_DMA_0]], S2MM)
+// CHECK: %[[NPU_DMA_1:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([0, 0, %[[APPLY]]] [1, 8, 16] [128, 16, 1], [] [] [])
+// CHECK: amdaie.npu.dma_wait(%[[NPU_DMA_1]], S2MM)
+#map = affine_map<(d0) -> (d0 * 16)>
+func.func @forall_with_multiple_npu_dma_cpy_nd_same_source(%arg0: !amdaie.logicalobjectfifo>, %arg1: !amdaie.logicalobjectfifo>) {
+  amdaie.workgroup {
+    %0 = amdaie.circular_dma_cpy_nd(%arg0[] [] [], %arg1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>)
+    amdaie.controlcode {
+      scf.forall (%arg2, %arg3) in (2, 6) {
+        %1 = affine.apply #map(%arg3)
+        %2 = amdaie.npu.dma_cpy_nd %0([0, 0, %1] [1, 8, 16] [128, 16, 1], [] [] [])
+        amdaie.npu.dma_wait(%2, S2MM)
+        %3 = amdaie.npu.dma_cpy_nd %0([0, 0, %1] [1, 8, 16] [128, 16, 1], [] [] [])
+        amdaie.npu.dma_wait(%3, S2MM)
+      }
+      amdaie.end
+    }
+  }
+  return
+}
+
+//===----------------------------------------------------------------------===//
+// Checks for dependencies via `affine.apply` on both source and target sides.
+//===----------------------------------------------------------------------===//
+
+// -----
+
+// Check that loop subsumption happens in case of an identity affine expression.
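+// The identity map carries an implicit stride of 1, so the subsumed loop below
+// becomes a new outermost dimension with size 6 and stride 1.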
+// CHECK-LABEL: @identity_affine_expr +// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index +// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index +// CHECK-DAG: %[[C6:.+]] = arith.constant 6 : index +// CHECK-DAG: %[[C16:.+]] = arith.constant 16 : index +// CHECK: %[[CIRC_DMA:.+]] = amdaie.circular_dma_cpy_nd +// CHECK: amdaie.controlcode +// CHECK-NOT: scf.for +// CHECK: amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([%[[C0]], %[[C0]]] [%[[C6]], %[[C16]]] [%[[C1]], %[[C1]]], [] [] []) +#map = affine_map<(d0) -> (d0)> +func.func @identity_affine_expr(%arg0: !amdaie.logicalobjectfifo>, %arg1: !amdaie.logicalobjectfifo>) { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c6 = arith.constant 6 : index + amdaie.workgroup { + %0 = amdaie.circular_dma_cpy_nd(%arg0[] [] [], %arg1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + amdaie.controlcode { + scf.for %arg2 = %c0 to %c6 step %c1 { + %1 = affine.apply #map(%arg2) + %3 = amdaie.npu.dma_cpy_nd %0([%1] [16] [1], [] [] []) + } + amdaie.end + } + } + return +} + +// ----- + +// CHECK-LABEL: @for_dependency_on_target +// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index +// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index +// CHECK-DAG: %[[C6:.+]] = arith.constant 6 : index +// CHECK-DAG: %[[C8:.+]] = arith.constant 8 : index +// CHECK-DAG: %[[C16:.+]] = arith.constant 16 : index +// CHECK-DAG: %[[C128:.+]] = arith.constant 128 : index +// CHECK: %[[CIRC_DMA:.+]] = amdaie.circular_dma_cpy_nd +// CHECK: amdaie.controlcode +// CHECK-NOT: scf.for +// CHECK: %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([%[[C0]], %[[C0]], %[[C0]], %[[C0]]] [%[[C6]], %[[C1]], %[[C8]], %[[C16]]] [%[[C16]], %[[C128]], %[[C16]], %[[C1]]], [] [] []) +// CHECK: amdaie.npu.dma_wait(%[[NPU_DMA]], S2MM) +#map = affine_map<(d0) -> (d0 * 16)> +func.func @for_dependency_on_target(%arg0: !amdaie.logicalobjectfifo>, %arg1: !amdaie.logicalobjectfifo>) { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c6 = arith.constant 6 : index + amdaie.workgroup { + %0 = amdaie.circular_dma_cpy_nd(%arg0[] [] [], %arg1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + amdaie.controlcode { + scf.for %arg2 = %c0 to %c6 step %c1 { + %1 = affine.apply #map(%arg2) + %2 = amdaie.npu.dma_cpy_nd %0([0, 0, %1] [1, 8, 16] [128, 16, 1], [] [] []) + amdaie.npu.dma_wait(%2, S2MM) + } + amdaie.end + } + } + return +} + +// ----- + +// CHECK-LABEL: @forall_dependency_on_target +// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index +// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index +// CHECK-DAG: %[[C6:.+]] = arith.constant 6 : index +// CHECK-DAG: %[[C8:.+]] = arith.constant 8 : index +// CHECK-DAG: %[[C16:.+]] = arith.constant 16 : index +// CHECK-DAG: %[[C128:.+]] = arith.constant 128 : index +// CHECK: %[[CIRC_DMA:.+]] = amdaie.circular_dma_cpy_nd +// CHECK: amdaie.controlcode +// CHECK-NOT: scf.forall +// CHECK: %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([%[[C0]], %[[C0]], %[[C0]], %[[C0]]] [%[[C6]], %[[C1]], %[[C8]], %[[C16]]] [%[[C16]], %[[C128]], %[[C16]], %[[C1]]], [] [] []) +// CHECK: amdaie.npu.dma_wait(%[[NPU_DMA]], S2MM) +#map = affine_map<(d0) -> (16 * d0)> +func.func @forall_dependency_on_target(%arg0: !amdaie.logicalobjectfifo>, %arg1: !amdaie.logicalobjectfifo>) { + amdaie.workgroup { + %0 = amdaie.circular_dma_cpy_nd(%arg0[] [] [], %arg1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + amdaie.controlcode { + scf.forall (%arg2, %arg3) in (2, 6) { + %1 = affine.apply #map(%arg3) + %2 = 
amdaie.npu.dma_cpy_nd %0([0, 0, %1] [1, 8, 16] [128, 16, 1], [] [] []) + amdaie.npu.dma_wait(%2, S2MM) + } + amdaie.end + } + } + return +} + +// ----- + +// CHECK-LABEL: @for_dependency_on_source +// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index +// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index +// CHECK-DAG: %[[C6:.+]] = arith.constant 6 : index +// CHECK-DAG: %[[C8:.+]] = arith.constant 8 : index +// CHECK-DAG: %[[C16:.+]] = arith.constant 16 : index +// CHECK-DAG: %[[C128:.+]] = arith.constant 128 : index +// CHECK: %[[CIRC_DMA:.+]] = amdaie.circular_dma_cpy_nd +// CHECK: amdaie.controlcode +// CHECK-NOT: scf.for +// CHECK: %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([] [] [], [%[[C0]], %[[C0]], %[[C0]], %[[C0]]] [%[[C6]], %[[C1]], %[[C8]], %[[C16]]] [%[[C16]], %[[C128]], %[[C16]], %[[C1]]]) +// CHECK: amdaie.npu.dma_wait(%[[NPU_DMA]], S2MM) +#map = affine_map<(d0) -> (d0 * 16)> +func.func @for_dependency_on_source(%arg0: !amdaie.logicalobjectfifo>, %arg1: !amdaie.logicalobjectfifo>) { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c6 = arith.constant 6 : index + amdaie.workgroup { + %0 = amdaie.circular_dma_cpy_nd(%arg0[] [] [], %arg1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + amdaie.controlcode { + scf.for %arg2 = %c0 to %c6 step %c1 { + %1 = affine.apply #map(%arg2) + %2 = amdaie.npu.dma_cpy_nd %0([] [] [], [0, 0, %1] [1, 8, 16] [128, 16, 1]) + amdaie.npu.dma_wait(%2, S2MM) + } + amdaie.end + } + } + return +} + +// ----- + +// CHECK-LABEL: @forall_dependency_on_source +// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index +// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index +// CHECK-DAG: %[[C6:.+]] = arith.constant 6 : index +// CHECK-DAG: %[[C8:.+]] = arith.constant 8 : index +// CHECK-DAG: %[[C16:.+]] = arith.constant 16 : index +// CHECK-DAG: %[[C128:.+]] = arith.constant 128 : index +// CHECK: %[[CIRC_DMA:.+]] = amdaie.circular_dma_cpy_nd +// CHECK: amdaie.controlcode +// CHECK-NOT: scf.forall +// CHECK: %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([] [] [], [%[[C0]], %[[C0]], %[[C0]], %[[C0]]] [%[[C6]], %[[C1]], %[[C8]], %[[C16]]] [%[[C16]], %[[C128]], %[[C16]], %[[C1]]]) +// CHECK: amdaie.npu.dma_wait(%[[NPU_DMA]], S2MM) +#map = affine_map<(d0) -> (d0 * 16)> +func.func @forall_dependency_on_source(%arg0: !amdaie.logicalobjectfifo>, %arg1: !amdaie.logicalobjectfifo>) { + amdaie.workgroup { + %0 = amdaie.circular_dma_cpy_nd(%arg0[] [] [], %arg1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + amdaie.controlcode { + scf.forall (%arg2, %arg3) in (2, 6) { + %1 = affine.apply #map(%arg3) + %2 = amdaie.npu.dma_cpy_nd %0([] [] [], [0, 0, %1] [1, 8, 16] [128, 16, 1]) + amdaie.npu.dma_wait(%2, S2MM) + } + amdaie.end + } + } + return +} + +// ----- + +// Check with multiple `affine.apply` usages in a `amdaie.npu.dma_cpy_nd` operation. 
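+// The same `affine.apply` result feeds two offsets, so one new dimension is
+// inserted per dependent offset: sizes [8, 16] become [6, 6, 8, 16] and
+// strides [16, 1] become [256, 16, 16, 1] (6 iterations; 16 * 16 = 256).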
+// CHECK-LABEL: @multiple_for_dependencies +// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index +// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index +// CHECK-DAG: %[[C6:.+]] = arith.constant 6 : index +// CHECK-DAG: %[[C8:.+]] = arith.constant 8 : index +// CHECK-DAG: %[[C16:.+]] = arith.constant 16 : index +// CHECK-DAG: %[[C256:.+]] = arith.constant 256 : index +// CHECK: %[[CIRC_DMA:.+]] = amdaie.circular_dma_cpy_nd +// CHECK: amdaie.controlcode +// CHECK-NOT: scf.for +// CHECK: %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([%[[C0]], %[[C0]], %[[C0]], %[[C0]]] [%[[C6]], %[[C6]], %[[C8]], %[[C16]]] [%[[C256]], %[[C16]], %[[C16]], %[[C1]]], [] [] []) +// CHECK: amdaie.npu.dma_wait(%[[NPU_DMA]], S2MM) +#map = affine_map<(d0) -> (d0 * 16)> +func.func @multiple_for_dependencies(%arg0: !amdaie.logicalobjectfifo>, %arg1: !amdaie.logicalobjectfifo>) { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c6 = arith.constant 6 : index + amdaie.workgroup { + %0 = amdaie.circular_dma_cpy_nd(%arg0[] [] [], %arg1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + amdaie.controlcode { + scf.for %arg2 = %c0 to %c6 step %c1 { + %1 = affine.apply #map(%arg2) + %2 = amdaie.npu.dma_cpy_nd %0([%1, %1] [8, 16] [16, 1], [] [] []) + amdaie.npu.dma_wait(%2, S2MM) + } + amdaie.end + } + } + return +} + +// ----- + +// CHECK-LABEL: @multiple_forall_dependencies +// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index +// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index +// CHECK-DAG: %[[C2:.+]] = arith.constant 2 : index +// CHECK-DAG: %[[C6:.+]] = arith.constant 6 : index +// CHECK-DAG: %[[C8:.+]] = arith.constant 8 : index +// CHECK-DAG: %[[C16:.+]] = arith.constant 16 : index +// CHECK-DAG: %[[C512:.+]] = arith.constant 512 : index +// CHECK: %[[CIRC_DMA:.+]] = amdaie.circular_dma_cpy_nd +// CHECK: amdaie.controlcode +// CHECK-NOT: scf.forall +// CHECK: %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([%[[C0]], %[[C0]], %[[C0]], %[[C0]]] [%[[C2]], %[[C6]], %[[C8]], %[[C16]]] [%[[C16]], %[[C512]], %[[C16]], %[[C1]]], [] [] []) +// CHECK: amdaie.npu.dma_wait(%[[NPU_DMA]], S2MM) +#map = affine_map<(d0) -> (d0 * 16)> +#map1 = affine_map<(d0) -> (d0 * 32)> +func.func @multiple_forall_dependencies(%arg0: !amdaie.logicalobjectfifo>, %arg1: !amdaie.logicalobjectfifo>) { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c6 = arith.constant 6 : index + amdaie.workgroup { + %0 = amdaie.circular_dma_cpy_nd(%arg0[] [] [], %arg1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + amdaie.controlcode { + scf.forall (%arg2, %arg3) in (2, 6) { + %1 = affine.apply #map(%arg2) + %2 = affine.apply #map1(%arg3) + %3 = amdaie.npu.dma_cpy_nd %0([%2, %1] [8, 16] [16, 1], [] [] []) + amdaie.npu.dma_wait(%3, S2MM) + } + amdaie.end + } + } + return +} + +// ----- + +// CHECK-LABEL: @non_normalized_for_with_affine +// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index +// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index +// CHECK-DAG: %[[C3:.+]] = arith.constant 3 : index +// CHECK-DAG: %[[C16:.+]] = arith.constant 16 : index +// CHECK-DAG: %[[C32:.+]] = arith.constant 32 : index +// CHECK: %[[CIRC_DMA:.+]] = amdaie.circular_dma_cpy_nd +// CHECK: amdaie.controlcode +// CHECK-NOT: scf.for +// CHECK: %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([%[[C0]], %[[C16]]] [%[[C3]], %[[C16]]] [%[[C32]], %[[C1]]], [] [] []) +// CHECK: amdaie.npu.dma_wait(%[[NPU_DMA]], S2MM) +#map = affine_map<(d0) -> (d0 * 16)> +func.func @non_normalized_for_with_affine(%arg0: 
!amdaie.logicalobjectfifo>, %arg1: !amdaie.logicalobjectfifo>) { + %c1 = arith.constant 1 : index + %c2 = arith.constant 2 : index + %c6 = arith.constant 6 : index + amdaie.workgroup { + %0 = amdaie.circular_dma_cpy_nd(%arg0[] [] [], %arg1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + amdaie.controlcode { + scf.for %arg2 = %c1 to %c6 step %c2 { + %1 = affine.apply #map(%arg2) + %2 = amdaie.npu.dma_cpy_nd %0([%1] [16] [1], [] [] []) + amdaie.npu.dma_wait(%2, S2MM) + } + amdaie.end + } + } + return +} + +// ----- + +// CHECK-LABEL: @non_normalized_forall_with_affine +// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index +// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index +// CHECK-DAG: %[[C4:.+]] = arith.constant 4 : index +// CHECK-DAG: %[[C5:.+]] = arith.constant 5 : index +// CHECK-DAG: %[[C8:.+]] = arith.constant 8 : index +// CHECK-DAG: %[[C16:.+]] = arith.constant 16 : index +// CHECK-DAG: %[[C32:.+]] = arith.constant 32 : index +// CHECK-DAG: %[[C48:.+]] = arith.constant 48 : index +// CHECK-DAG: %[[C1024:.+]] = arith.constant 1024 : index +// CHECK: %[[CIRC_DMA:.+]] = amdaie.circular_dma_cpy_nd +// CHECK: amdaie.controlcode +// CHECK-NOT: scf.forall +// CHECK: %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([%[[C0]], %[[C0]], %[[C32]], %[[C32]]] [%[[C5]], %[[C4]], %[[C8]], %[[C16]]] [%[[C48]], %[[C1024]], %[[C16]], %[[C1]]], [] [] []) +// CHECK: amdaie.npu.dma_wait(%[[NPU_DMA]], S2MM) +#map = affine_map<(d0) -> (d0 * 16)> +#map1 = affine_map<(d0) -> (d0 * 32)> +func.func @non_normalized_forall_with_affine(%arg0: !amdaie.logicalobjectfifo>, %arg1: !amdaie.logicalobjectfifo>) { + %c1 = arith.constant 1 : index + %c2 = arith.constant 2 : index + %c6 = arith.constant 6 : index + amdaie.workgroup { + %0 = amdaie.circular_dma_cpy_nd(%arg0[] [] [], %arg1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + amdaie.controlcode { + scf.forall (%arg2, %arg3) = (2, 1) to (17, 8) step (3, 2) { + %1 = affine.apply #map(%arg2) + %2 = affine.apply #map1(%arg3) + %3 = amdaie.npu.dma_cpy_nd %0([%2, %1] [8, 16] [16, 1], [] [] []) + amdaie.npu.dma_wait(%3, S2MM) + } + amdaie.end + } + } + return +} + +//===----------------------------------------------------------------------===// +// Checks for dependencies via induction variables (no affine.apply) on both +// source and target sides. 
+//===----------------------------------------------------------------------===// + +// ----- + +// CHECK-LABEL: @for_with_induction_var_normalized +// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index +// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index +// CHECK-DAG: %[[C6:.+]] = arith.constant 6 : index +// CHECK-DAG: %[[C16:.+]] = arith.constant 16 : index +// CHECK: %[[CIRC_DMA:.+]] = amdaie.circular_dma_cpy_nd +// CHECK: amdaie.controlcode +// CHECK-NOT: scf.for +// CHECK: %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([%[[C0]], %[[C0]]] [%[[C6]], %[[C16]]] [%[[C1]], %[[C1]]], [] [] []) +// CHECK: amdaie.npu.dma_wait(%[[NPU_DMA]], S2MM) +func.func @for_with_induction_var_normalized(%arg0: !amdaie.logicalobjectfifo>, %arg1: !amdaie.logicalobjectfifo>) { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c6 = arith.constant 6 : index + amdaie.workgroup { + %0 = amdaie.circular_dma_cpy_nd(%arg0[] [] [], %arg1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + amdaie.controlcode { + scf.for %arg2 = %c0 to %c6 step %c1 { + %2 = amdaie.npu.dma_cpy_nd %0([%arg2] [16] [1], [] [] []) + amdaie.npu.dma_wait(%2, S2MM) + } + amdaie.end + } + } + return +} + +// ----- + +// CHECK-LABEL: @for_with_induction_var_non_normalized +// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index +// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index +// CHECK-DAG: %[[C2:.+]] = arith.constant 2 : index +// CHECK-DAG: %[[C3:.+]] = arith.constant 3 : index +// CHECK-DAG: %[[C16:.+]] = arith.constant 16 : index +// CHECK: %[[CIRC_DMA:.+]] = amdaie.circular_dma_cpy_nd +// CHECK: amdaie.controlcode +// CHECK-NOT: scf.for +// CHECK: %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([%[[C0]], %[[C1]]] [%[[C3]], %[[C16]]] [%[[C2]], %[[C1]]], [] [] []) +// CHECK: amdaie.npu.dma_wait(%[[NPU_DMA]], S2MM) +func.func @for_with_induction_var_non_normalized(%arg0: !amdaie.logicalobjectfifo>, %arg1: !amdaie.logicalobjectfifo>) { + %c1 = arith.constant 1 : index + %c2 = arith.constant 2 : index + %c6 = arith.constant 6 : index + amdaie.workgroup { + %0 = amdaie.circular_dma_cpy_nd(%arg0[] [] [], %arg1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + amdaie.controlcode { + scf.for %arg2 = %c1 to %c6 step %c2 { + %2 = amdaie.npu.dma_cpy_nd %0([%arg2] [16] [1], [] [] []) + amdaie.npu.dma_wait(%2, S2MM) + } + amdaie.end + } + } + return +} + +// ----- + +// CHECK-LABEL: @forall_with_induction_var_normalized +// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index +// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index +// CHECK-DAG: %[[C8:.+]] = arith.constant 8 : index +// CHECK-DAG: %[[C16:.+]] = arith.constant 16 : index +// CHECK-DAG: %[[C17:.+]] = arith.constant 17 : index +// CHECK: %[[CIRC_DMA:.+]] = amdaie.circular_dma_cpy_nd +// CHECK: amdaie.controlcode +// CHECK-NOT: scf.forall +// CHECK: %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([%[[C0]], %[[C0]], %[[C0]], %[[C0]]] [%[[C17]], %[[C8]], %[[C8]], %[[C16]]] [%[[C1]], %[[C16]], %[[C16]], %[[C1]]], [] [] []) +// CHECK: amdaie.npu.dma_wait(%[[NPU_DMA]], S2MM) +func.func @forall_with_induction_var_normalized(%arg0: !amdaie.logicalobjectfifo>, %arg1: !amdaie.logicalobjectfifo>) { + amdaie.workgroup { + %0 = amdaie.circular_dma_cpy_nd(%arg0[] [] [], %arg1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + amdaie.controlcode { + scf.forall (%arg2, %arg3) in (17, 8) { + %3 = amdaie.npu.dma_cpy_nd %0([%arg3, %arg2] [8, 16] [16, 1], [] [] []) + amdaie.npu.dma_wait(%3, S2MM) + } + amdaie.end + } + } + 
return +} + +// ----- + +// CHECK-LABEL: @forall_with_induction_var_non_normalized +// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index +// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index +// CHECK-DAG: %[[C2:.+]] = arith.constant 2 : index +// CHECK-DAG: %[[C3:.+]] = arith.constant 3 : index +// CHECK-DAG: %[[C4:.+]] = arith.constant 4 : index +// CHECK-DAG: %[[C5:.+]] = arith.constant 5 : index +// CHECK-DAG: %[[C8:.+]] = arith.constant 8 : index +// CHECK-DAG: %[[C16:.+]] = arith.constant 16 : index +// CHECK-DAG: %[[C32:.+]] = arith.constant 32 : index +// CHECK: %[[CIRC_DMA:.+]] = amdaie.circular_dma_cpy_nd +// CHECK: amdaie.controlcode +// CHECK-NOT: scf.forall +// CHECK: %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([%[[C0]], %[[C0]], %[[C1]], %[[C2]]] [%[[C5]], %[[C4]], %[[C8]], %[[C16]]] [%[[C3]], %[[C32]], %[[C16]], %[[C1]]], [] [] []) +// CHECK: amdaie.npu.dma_wait(%[[NPU_DMA]], S2MM) +func.func @forall_with_induction_var_non_normalized(%arg0: !amdaie.logicalobjectfifo>, %arg1: !amdaie.logicalobjectfifo>) { + amdaie.workgroup { + %0 = amdaie.circular_dma_cpy_nd(%arg0[] [] [], %arg1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + amdaie.controlcode { + scf.forall (%arg2, %arg3) = (2, 1) to (17, 8) step (3, 2) { + %3 = amdaie.npu.dma_cpy_nd %0([%arg3, %arg2] [8, 16] [16, 1], [] [] []) + amdaie.npu.dma_wait(%3, S2MM) + } + amdaie.end + } + } + return +} diff --git a/tests/samples/matmul_peeled_objectfifo.mlir b/tests/samples/matmul_peeled_objectfifo.mlir index 47a567dff..b99349793 100644 --- a/tests/samples/matmul_peeled_objectfifo.mlir +++ b/tests/samples/matmul_peeled_objectfifo.mlir @@ -1,4 +1,4 @@ -// RUN: iree-opt --pass-pipeline="builtin.module(fold-memref-alias-ops,iree-amdaie-pack-to-dma,air-copy-to-dma,iree-amdaie-air-dma-to-amdaie-dma,iree-amdaie-insert-cores,cse,iree-amdaie-localize-logicalobjectfifo,iree-amdaie-distribute-cores-and-objectfifos,cse,canonicalize,iree-amdaie-dma-to-circular-dma,func.func(iree-amdaie-create-aie-workgroup),cse,iree-amdaie-canonicalize-doubly-strided-op,iree-amdaie-access-to-acquire-release,cse,canonicalize,iree-amdaie-controlcode-loop-unroll,cse,canonicalize,iree-amdaie-create-logical-objectfifo-link,iree-amdaie-canonicalize-doubly-strided-op,iree-amdaie-lower-to-aie,canonicalize)" --split-input-file %s | FileCheck %s +// RUN: iree-opt --pass-pipeline="builtin.module(fold-memref-alias-ops,iree-amdaie-pack-to-dma,air-copy-to-dma,iree-amdaie-air-dma-to-amdaie-dma,iree-amdaie-insert-cores,cse,iree-amdaie-localize-logicalobjectfifo,iree-amdaie-distribute-cores-and-objectfifos,cse,canonicalize,iree-amdaie-dma-to-circular-dma,func.func(iree-amdaie-create-aie-workgroup),cse,iree-amdaie-canonicalize-doubly-strided-op,iree-amdaie-access-to-acquire-release,cse,canonicalize,iree-amdaie-dma-loop-subsumption,cse,canonicalize,iree-amdaie-controlcode-loop-unroll,cse,canonicalize,iree-amdaie-create-logical-objectfifo-link,iree-amdaie-canonicalize-doubly-strided-op,iree-amdaie-lower-to-aie,canonicalize)" --split-input-file %s | FileCheck %s // CHECK: aie.device(npu1_4col) // CHECK-DAG: %[[TILE_0_2:.+]] = aie.tile(0, 2)
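
The pipeline change above only splices the new iree-amdaie-dma-loop-subsumption pass (plus the cse,canonicalize cleanups already used between the other passes) into the existing flow. Assuming the pass is registered under that flag, as the RUN line suggests, a minimal standalone driver for the subsumption tests above would presumably look like:

// RUN: iree-opt --pass-pipeline="builtin.module(iree-amdaie-dma-loop-subsumption,cse,canonicalize)" --split-input-file %s | FileCheck %s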