From 3a6f183c6213ca8b8b3655f3297c654306f62bdc Mon Sep 17 00:00:00 2001 From: Abhishek Varma Date: Mon, 2 Sep 2024 19:21:01 +0530 Subject: [PATCH] [ObjectFifo] Create a new pass to split logical objectFifos (#659) -- This commit introduces a new pass `--iree-amdaie-split-logical-objectfifos-for-connection-reuse` to split logical objectFifos for dealing with Matmul+Elementwise. -- Also contains a utility to check whether splitting can be performed. -- It addresses sub-action 2 as well from https://github.com/nod-ai/iree-amd-aie/issues/644 Signed-off-by: Abhishek Varma --- .../AMDAIELogicalObjFifoSplittingUtils.cpp | 430 +++++ .../AMDAIELogicalObjFifoSplittingUtils.h | 23 + ...SplitLogicalObjFifosForConnectionReuse.cpp | 71 + .../iree-amd-aie/Transforms/CMakeLists.txt | 2 + .../iree-amd-aie/Transforms/PassDetail.h | 1 + .../iree-amd-aie/Transforms/Passes.cpp | 1 + .../AMD-AIE/iree-amd-aie/Transforms/Passes.h | 3 + .../AMD-AIE/iree-amd-aie/Transforms/Passes.td | 6 + .../Transforms/test/CMakeLists.txt | 1 + ..._logicalobjfifos_for_connection_reuse.mlir | 1398 +++++++++++++++++ 10 files changed, 1936 insertions(+) create mode 100644 compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELogicalObjFifoSplittingUtils.cpp create mode 100644 compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELogicalObjFifoSplittingUtils.h create mode 100644 compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitLogicalObjFifosForConnectionReuse.cpp create mode 100644 compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/split_logicalobjfifos_for_connection_reuse.mlir diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELogicalObjFifoSplittingUtils.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELogicalObjFifoSplittingUtils.cpp new file mode 100644 index 000000000..d09a9e746 --- /dev/null +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELogicalObjFifoSplittingUtils.cpp @@ -0,0 +1,430 @@ +// 
Copyright 2024 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "AMDAIELogicalObjFifoSplittingUtils.h"
+
+#include <numeric>
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/Support/Debug.h"
+#include "mlir/Dialect/Affine/IR/AffineOps.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Dialect/MemRef/IR/MemRef.h"
+#include "mlir/IR/BuiltinTypes.h"
+#include "mlir/IR/Iterators.h"
+#include "mlir/IR/Operation.h"
+
+#define DEBUG_TYPE "iree-amdaie-logicalobjfifo-splitting-utils"
+
+namespace mlir::iree_compiler::AMDAIE {
+
+/// Utility to verify that the split dimensions for L2 are contiguous.
+/// Succeeds iff the i-th entry of `splitDimsSetForL2` equals `i` for every
+/// `i`, i.e. the vector is exactly 0, 1, ..., n-1. An empty vector succeeds.
+static LogicalResult checkIsRangeFromZero(
+    const SmallVector<size_t> &splitDimsSetForL2) {
+  for (auto &&[dim, splitDim] : llvm::enumerate(splitDimsSetForL2)) {
+    if (splitDim != dim) return failure();
+  }
+  return success();
+}
+
+/// This utility helps to perform the computation of offsets for L3 source.
+///
+/// Example:
+/// For L3 -> L2 DmaCpyNd :-
+/// From offset (0,0) we are extracting one 4x4 memref.
+///                _______
+///               |. . . .|
+///               |. . . .|
+///               |. . . .|
+///               |. . . .|
+///               ---------
+/// After split we will extract four 2x2 memrefs.
+/// So, the corresponding offsets will be :-
+/// 1. Offset (0,0) - extract 2x2 memref
+///       ___
+///      |. .|. .
+///      |. .|. .
+///      -----
+///       . . . .
+///       . . . .
+/// 2. Offset (0,2) - extract 2x2 memref
+///           ___
+///       . .|. .|
+///       . .|. .|
+///          -----
+///       . . . .
+///       . . . .
+/// 3. Offset (2,0) - extract 2x2 memref
+///       . . . .
+///       . . . .
+///       ___
+///      |. .|. .
+///      |. .|. .
+///      -----
+/// 4. Offset (2,2) - extract 2x2 memref
+///       . . . .
+///       . . . .
+///           ___
+///       . .|. .|
+///       . .|. .|
+///          -----
+///
+/// Returns `oldL3Offset + offsetToAdd` in one of three forms:
+///   - a static index attribute when `oldL3Offset` is a constant (either an
+///     attribute or an SSA value matched by `getConstantIntValue`);
+///   - a new `affine.apply (d0) -> (d0 + offsetToAdd)` created at the start
+///     of the owning block when `oldL3Offset` is a block argument;
+///   - a new `affine.apply`, created right before the old one, whose result
+///     expression is the old expression plus `offsetToAdd`, when `oldL3Offset`
+///     is produced by an `affine.apply`.
+/// Any other kind of offset yields failure. The rewriter's insertion point is
+/// restored before returning.
+static FailureOr<OpFoldResult> updateL3SourceOffset(IRRewriter &rewriter,
+                                                    OpFoldResult oldL3Offset,
+                                                    int64_t offsetToAdd,
+                                                    MLIRContext *context) {
+  OpFoldResult newL3AsSourceOffset;
+  // Constants fold directly into a new static offset; no IR is created.
+  if (std::optional<int64_t> constantOffset =
+          getConstantIntValue(oldL3Offset)) {
+    newL3AsSourceOffset = rewriter.getIndexAttr(*constantOffset + offsetToAdd);
+    return newL3AsSourceOffset;
+  }
+  // A non-constant attribute cannot be updated.
+  auto l3SourceOffsetVal = dyn_cast<Value>(oldL3Offset);
+  if (!l3SourceOffsetVal) return failure();
+
+  OpBuilder::InsertionGuard guard(rewriter);
+  if (auto blockArg = dyn_cast<BlockArgument>(l3SourceOffsetVal)) {
+    Operation *ownerOfBlockArg = blockArg.getOwner()->getParentOp();
+    rewriter.setInsertionPointToStart(blockArg.getOwner());
+    AffineMap newAffineMap =
+        AffineMap::get(/*dimCount=*/1, /*symbolCount=*/0,
+                       {rewriter.getAffineDimExpr(0) + offsetToAdd}, context);
+    newL3AsSourceOffset =
+        rewriter
+            .create<affine::AffineApplyOp>(ownerOfBlockArg->getLoc(),
+                                           newAffineMap, l3SourceOffsetVal)
+            .getResult();
+    return newL3AsSourceOffset;
+  }
+
+  auto applyOp = l3SourceOffsetVal.getDefiningOp<affine::AffineApplyOp>();
+  if (!applyOp) return failure();
+  // The new map reuses the operands of the old `affine.apply`, so it must
+  // declare the same number of dims and symbols as the old map; hard-coding a
+  // single dim would produce an invalid op for multi-operand maps.
+  AffineMap oldAffineMap = applyOp.getAffineMap();
+  AffineMap newAffineMap = AffineMap::get(
+      oldAffineMap.getNumDims(), oldAffineMap.getNumSymbols(),
+      {oldAffineMap.getResult(0) + offsetToAdd}, context);
+  rewriter.setInsertionPoint(applyOp);
+  newL3AsSourceOffset =
+      rewriter
+          .create<affine::AffineApplyOp>(applyOp.getLoc(), newAffineMap,
+                                         applyOp.getMapOperands())
+          .getResult();
+  return newL3AsSourceOffset;
+}
+
+/// A struct utility to encapsulate all the data
required to perform splitting
+/// of logicalobjectfifos.
+struct SplittingLogicalObjectFifoData {
+  // Input: the L2->L1 DMA ops whose shared L2 source should be split.
+  SmallVector<AMDAIE::DmaCpyNdOp> l2ToL1DmaOps;
+  // Output: dimensions in which the source offsets of the L2->L1 DMA ops
+  // differ, sorted ascending.
+  SmallVector<size_t> splitDimsForL2;
+  // Output: the remaining dimensions of the L2 target, in ascending order.
+  SmallVector<size_t> nonSplitDimsForL2;
+  // Output: the single DMA op that writes the shared L2 buffer.
+  AMDAIE::DmaCpyNdOp l3ToL2DmaOp;
+};
+
+/// Utility to check whether splitting of logicalobjectfifos can be performed.
+/// If possible, it populates the struct `SplittingLogicalObjectFifoData` with
+/// the data required to perform the actual splitting.
+///
+/// Splitting is rejected (failure is returned, nothing is populated) when:
+///   - there are no L2->L1 DMA ops, or they do not all read the same L2
+///     logicalobjectfifo, or that fifo is not backed by a `memref.alloc`;
+///   - the L2->L1 DMA ops disagree on the number of source offsets;
+///   - the inferred split dimensions are not exactly 0, 1, ..., n-1;
+///   - the L2 fifo is not written by exactly one `amdaie.dma_cpy_nd`;
+///   - source and target of that DMA differ in offset/size/stride rank;
+///   - the number of split and non-split dimensions differ;
+///   - a split-dimension source offset or a non-split target size is not a
+///     compile-time constant.
+static LogicalResult checkWhetherSplitIsPossible(
+    SplittingLogicalObjectFifoData &splittingLogicalObjectFifoData) {
+  // Bound by reference: the ops are only inspected, so no copy is needed.
+  SmallVector<AMDAIE::DmaCpyNdOp> &l2ToL1DmaOps =
+      splittingLogicalObjectFifoData.l2ToL1DmaOps;
+
+  if (l2ToL1DmaOps.empty()) return failure();
+
+  SmallVector<OpFoldResult> baseSourceOffsets =
+      l2ToL1DmaOps[0].getSourceMixedOffsets();
+  LogicalObjectFifoFromMemrefOp sourceObjectFifo =
+      l2ToL1DmaOps[0].getSourceObjectFifo();
+  auto sourceAllocOp =
+      sourceObjectFifo.getMemref().getDefiningOp<memref::AllocOp>();
+  if (!sourceAllocOp) {
+    LLVM_DEBUG(llvm::dbgs() << "expected alloc op as the defining op of "
+                            << sourceObjectFifo << "\n");
+    return failure();
+  }
+
+  // We will now capture those dimensions where L2 memory was split. The way we
+  // do this is by checking all L2->L1 DmaOps' source offset and marking those
+  // dimensions which are not equal to at least one of the source offsets.
+  DenseSet<size_t> splitDimsSetForL2;
+  SmallVector<size_t> splitDimsForL2;
+  for (unsigned i = 1, n = l2ToL1DmaOps.size(); i < n; i++) {
+    if (l2ToL1DmaOps[i].getSourceObjectFifo() != sourceObjectFifo) {
+      LLVM_DEBUG(llvm::dbgs()
+                 << l2ToL1DmaOps[i] << " does not have " << sourceObjectFifo
+                 << " as the source objectfifo\n");
+      return failure();
+    }
+    SmallVector<OpFoldResult> sourceOffsets =
+        l2ToL1DmaOps[i].getSourceMixedOffsets();
+    // The element-wise comparison below indexes both vectors with the same
+    // index, so they must have the same length.
+    if (sourceOffsets.size() != baseSourceOffsets.size()) {
+      LLVM_DEBUG(llvm::dbgs()
+                 << l2ToL1DmaOps[i]
+                 << " has a different number of source offsets than "
+                 << l2ToL1DmaOps[0] << "\n");
+      return failure();
+    }
+    for (unsigned j = 0, m = baseSourceOffsets.size(); j < m; j++) {
+      if (baseSourceOffsets[j] != sourceOffsets[j] &&
+          !splitDimsSetForL2.contains(j)) {
+        splitDimsForL2.push_back(j);
+        splitDimsSetForL2.insert(j);
+      }
+    }
+  }
+  std::sort(splitDimsForL2.begin(), splitDimsForL2.end());
+
+  if (failed(checkIsRangeFromZero(splitDimsForL2))) {
+    LLVM_DEBUG(llvm::dbgs() << "cannot split L2 logicalobjectfifo because of "
+                               "non-contiguous split dimensions inferred\n");
+    return failure();
+  }
+
+  // Fetch the L3 -> L2 Dma Op corresponding to the L2 buffer as target.
+  SmallVector<AMDAIE::DmaCpyNdOp> l3ToL2DmaOps;
+  AMDAIE::DmaCpyNdOp l3ToL2DmaOp;
+  for (Operation *objFifoUserOp : sourceObjectFifo->getUsers()) {
+    // `dyn_cast` yields a null op for users that are not `amdaie.dma_cpy_nd`;
+    // those must be skipped before the target is queried.
+    if (auto dmaOp = dyn_cast<AMDAIE::DmaCpyNdOp>(objFifoUserOp);
+        dmaOp && dmaOp.getTargetObjectFifo() == sourceObjectFifo) {
+      l3ToL2DmaOps.push_back(dmaOp);
+    }
+  }
+  if (l3ToL2DmaOps.empty()) {
+    LLVM_DEBUG(llvm::dbgs() << "no corresponding L3->L2 dma op found for "
+                            << sourceObjectFifo << "\n");
+    return failure();
+  }
+  if (l3ToL2DmaOps.size() > 1) {
+    LLVM_DEBUG(llvm::dbgs() << "found more than one L3->L2 dma ops for "
+                            << sourceObjectFifo << "\n");
+    return failure();
+  }
+  l3ToL2DmaOp = l3ToL2DmaOps[0];
+  if ((l3ToL2DmaOp.getTargetMixedOffsets().size() !=
+       l3ToL2DmaOp.getSourceMixedOffsets().size()) ||
+      (l3ToL2DmaOp.getTargetMixedSizes().size() !=
+       l3ToL2DmaOp.getSourceMixedSizes().size()) ||
+      (l3ToL2DmaOp.getTargetMixedStrides().size() !=
+       l3ToL2DmaOp.getSourceMixedStrides().size())) {
+    LLVM_DEBUG(llvm::dbgs() << "dimensionality of source and target's "
+                               "offset/size/stride found different for "
+                            << l3ToL2DmaOp << "\n");
+    return failure();
+  }
+
+  SmallVector<OpFoldResult> staticL2AsTargetSizes =
+      l3ToL2DmaOp.getTargetMixedSizes();
+  // Split and non-split dimensions are paired one-to-one via
+  // `llvm::zip_equal`, which requires ranges of equal length. Checking this
+  // here also keeps the unsigned subtraction below from wrapping around.
+  if (staticL2AsTargetSizes.size() != 2 * splitDimsForL2.size()) {
+    LLVM_DEBUG(llvm::dbgs()
+               << "cannot split L2 logicalobjectfifo because the number of "
+                  "split and non-split dimensions differ\n");
+    return failure();
+  }
+  SmallVector<size_t> nonSplitDimsForL2(staticL2AsTargetSizes.size() -
+                                        splitDimsForL2.size());
+  std::iota(nonSplitDimsForL2.begin(), nonSplitDimsForL2.end(),
+            splitDimsForL2.size());
+
+  for (AMDAIE::DmaCpyNdOp l2ToL1DmaOp : l2ToL1DmaOps) {
+    SmallVector<OpFoldResult> staticL2AsSourceOffsets =
+        l2ToL1DmaOp.getSourceMixedOffsets();
+    for (auto &&[splitDim, nonSplitdim] :
+         llvm::zip_equal(splitDimsForL2, nonSplitDimsForL2)) {
+      std::optional<int64_t> constantVal =
+          getConstantIntValue(staticL2AsSourceOffsets[splitDim]);
+      if (!constantVal) {
+        LLVM_DEBUG(llvm::dbgs()
+                   << "found a non-constant value for source offset at dim "
+                   << splitDim << " for " << l2ToL1DmaOp << "\n");
+        return failure();
+      }
+      constantVal = getConstantIntValue(staticL2AsTargetSizes[nonSplitdim]);
+      if (!constantVal) {
+        LLVM_DEBUG(llvm::dbgs()
+                   << "found a non-constant value for target size at dim "
+                   << nonSplitdim << " for " << l3ToL2DmaOp << "\n");
+        return failure();
+      }
+    }
+  }
+  splittingLogicalObjectFifoData.splitDimsForL2 = splitDimsForL2;
+  splittingLogicalObjectFifoData.nonSplitDimsForL2 = nonSplitDimsForL2;
+  splittingLogicalObjectFifoData.l3ToL2DmaOp = l3ToL2DmaOp;
+  return success();
+}
+
+// Given a vector of L2->L1 Dma ops, perform the splitting :-
+// 1. Check if the splitting can be performed or not. If not possible, bail out.
+// 2. For the split dimension inferred set offset = 0 and size as 1 for L2 and
+//    L3.
+// 3. Now traverse each L2->L1 Dma op and perform the following :-
+//    a) Create a new L2 AllocOp based on the updated size (step 2 above) and
+//       create a logicalobjectfifo using the same.
+//    b) Split L3->L2 Dma op.
+//    c) Split L2->L1 Dma op.
+// 4. Delete old L2->L1, L3->L2 and corresponding AllocOps.
+LogicalResult splitLogicalObjectFifos(
+    IRRewriter &rewriter, SmallVector<AMDAIE::DmaCpyNdOp> &l2ToL1DmaOps,
+    MLIRContext *context) {
+  SplittingLogicalObjectFifoData splittingLogicalObjectFifoData;
+  splittingLogicalObjectFifoData.l2ToL1DmaOps = l2ToL1DmaOps;
+  if (failed(checkWhetherSplitIsPossible(splittingLogicalObjectFifoData))) {
+    // Not being able to split is not an error: the IR is left untouched.
+    LLVM_DEBUG(llvm::dbgs()
+               << "Cannot perform splitting of logicalobjectfifos\n");
+    return success();
+  }
+  OpBuilder::InsertionGuard guard(rewriter);
+  const SmallVector<size_t> &splitDimsForL2 =
+      splittingLogicalObjectFifoData.splitDimsForL2;
+  const SmallVector<size_t> &nonSplitDimsForL2 =
+      splittingLogicalObjectFifoData.nonSplitDimsForL2;
+  AMDAIE::DmaCpyNdOp l3ToL2DmaOp = splittingLogicalObjectFifoData.l3ToL2DmaOp;
+
+  LogicalObjectFifoFromMemrefOp sourceObjectFifo =
+      l2ToL1DmaOps[0].getSourceObjectFifo();
+  auto sourceAllocOp =
+      sourceObjectFifo.getMemref().getDefiningOp<memref::AllocOp>();
+
+  // Everything that belongs to the old, unsplit L2 buffer is erased at the
+  // end, once all replacement ops have been created.
+  DenseSet<Operation *> toBeErased;
+  toBeErased.insert(l3ToL2DmaOp);
+  toBeErased.insert(sourceAllocOp);
+  toBeErased.insert(sourceObjectFifo);
+  // Remove old dealloc. None of the ops created below use the old alloc, so
+  // its users can be collected once up front.
+  for (Operation *userOp : sourceAllocOp->getUsers()) {
+    if (auto deallocUser = dyn_cast<memref::DeallocOp>(userOp))
+      toBeErased.insert(deallocUser);
+  }
+
+  SmallVector<OpFoldResult> staticL2AsTargetOffsets =
+      l3ToL2DmaOp.getTargetMixedOffsets();
+  SmallVector<OpFoldResult> staticL2AsTargetSizes =
+      l3ToL2DmaOp.getTargetMixedSizes();
+  SmallVector<int64_t> l2ShapeAsTarget = llvm::to_vector(
+      cast<MemRefType>(l3ToL2DmaOp.getTargetObjectFifo().getMemref().getType())
+          .getShape());
+  SmallVector<OpFoldResult> staticL3AsSourceSizes =
+      l3ToL2DmaOp.getSourceMixedSizes();
+  OpFoldResult zeroVal = getAsIndexOpFoldResult(context, 0);
+  OpFoldResult oneVal = getAsIndexOpFoldResult(context, 1);
+  // Update split dimensions' offset/size for L2 as target and size for L3 as
+  // source. We can afford to do this here because it's going to be the same
+  // for all L3->L2 splits. Here we are setting offset = 0 and size = 1. The
+  // L3 source offsets are rebuilt per split from the original op further
+  // down, so the original offsets of the split dimensions are kept there.
+  for (size_t dim : splitDimsForL2) {
+    staticL2AsTargetOffsets[dim] = zeroVal;
+    staticL2AsTargetSizes[dim] = oneVal;
+    staticL3AsSourceSizes[dim] = oneVal;
+    l2ShapeAsTarget[dim] = 1;
+  }
+
+  // Type of each new L2 buffer: the reduced shape with the element type and
+  // memory space of the L2 buffer that is being replaced.
+  auto oldSourceMemRefType = cast<MemRefType>(sourceAllocOp.getType());
+  MemRefType newAllocType = MemRefType::get(
+      l2ShapeAsTarget, oldSourceMemRefType.getElementType(),
+      MemRefLayoutAttrInterface{}, oldSourceMemRefType.getMemorySpace());
+
+  // Traverse each L2->L1 DmaCpyNd op and split them.
+  for (AMDAIE::DmaCpyNdOp l2ToL1DmaOp : l2ToL1DmaOps) {
+    SmallVector<OpFoldResult> staticL2AsSourceOffsets =
+        l2ToL1DmaOp.getSourceMixedOffsets();
+    SmallVector<OpFoldResult> staticL2AsSourceSizes =
+        l2ToL1DmaOp.getSourceMixedSizes();
+
+    // Now we'll create a new L2 buffer based on the new shape inferred earlier
+    // via `l2ShapeAsTarget`. Its dealloc is placed right before the last op of
+    // the block holding the alloc.
+    rewriter.setInsertionPoint(sourceAllocOp);
+    auto newAllocOp = rewriter.create<memref::AllocOp>(rewriter.getUnknownLoc(),
+                                                       newAllocType);
+    auto newDeallocOp = rewriter.create<memref::DeallocOp>(
+        rewriter.getUnknownLoc(), newAllocOp);
+    newDeallocOp->moveBefore(&newAllocOp->getBlock()->back());
+    // Create new logicalobjectfifo.from_memref for the newly created L2 buffer.
+    rewriter.setInsertionPoint(l2ToL1DmaOp.getSourceObjectFifo());
+    auto source = rewriter.create<AMDAIE::LogicalObjectFifoFromMemrefOp>(
+        rewriter.getUnknownLoc(), LogicalObjectFifoType::get(newAllocType),
+        newAllocOp.getResult(), sourceObjectFifo.getTiles());
+
+    // --------------------------------------------
+    // ---------- L3 -> L2 splitting --------------
+    // --------------------------------------------
+    // Update L3 source offsets for non-split dimensions. Refer doc comment of
+    // `updateL3SourceOffset` for the computation rationale involved.
+    SmallVector<OpFoldResult> newL3AsSourceOffsets =
+        l3ToL2DmaOp.getSourceMixedOffsets();
+    for (auto &&[splitDim, nonSplitdim] :
+         llvm::zip_equal(splitDimsForL2, nonSplitDimsForL2)) {
+      std::optional<int64_t> constantOffset =
+          getConstantIntValue(staticL2AsSourceOffsets[splitDim]);
+      if (!constantOffset) {
+        return l2ToL1DmaOp->emitOpError()
+               << "found a non-constant value for source offset at dim "
+               << splitDim;
+      }
+      std::optional<int64_t> constantSize =
+          getConstantIntValue(staticL2AsTargetSizes[nonSplitdim]);
+      if (!constantSize) {
+        return l3ToL2DmaOp->emitOpError()
+               << "found a non-constant value for target size at dim "
+               << nonSplitdim;
+      }
+      int64_t offsetToAdd = constantOffset.value() * constantSize.value();
+      FailureOr<OpFoldResult> newOffset = updateL3SourceOffset(
+          rewriter, newL3AsSourceOffsets[nonSplitdim], offsetToAdd, context);
+      if (failed(newOffset)) {
+        // TODO: Ideally we should be able to handle even +, -, *, /, etc.
+        //       But handle this later (if at all!) as such cases might not
+        //       arise.
+        return l3ToL2DmaOp->emitOpError()
+               << "Unhandled expression for source offset at dim "
+               << nonSplitdim;
+      }
+      newL3AsSourceOffsets[nonSplitdim] = *newOffset;
+    }
+    // Create new L3 -> L2 Dma Op.
+    rewriter.setInsertionPoint(l3ToL2DmaOp);
+    rewriter.create<AMDAIE::DmaCpyNdOp>(
+        l3ToL2DmaOp.getLoc(), source, llvm::ArrayRef(staticL2AsTargetOffsets),
+        llvm::ArrayRef(staticL2AsTargetSizes),
+        l3ToL2DmaOp.getTargetMixedStrides(), l3ToL2DmaOp.getSource(),
+        llvm::ArrayRef(newL3AsSourceOffsets),
+        llvm::ArrayRef(staticL3AsSourceSizes),
+        l3ToL2DmaOp.getSourceMixedStrides());
+
+    // --------------------------------------------
+    // ---------- L2 -> L1 splitting --------------
+    // --------------------------------------------
+    // Update split dimensions' offset/size for L2 as source. Here we are
+    // setting offset = 0 and size = 1.
+    for (size_t dim : splitDimsForL2) {
+      staticL2AsSourceOffsets[dim] = zeroVal;
+      staticL2AsSourceSizes[dim] = oneVal;
+    }
+
+    // Create new L2 -> L1 Input DmaOp.
+    rewriter.setInsertionPoint(l2ToL1DmaOp);
+    auto newL2ToL1DmaOp = rewriter.create<AMDAIE::DmaCpyNdOp>(
+        l2ToL1DmaOp.getLoc(), l2ToL1DmaOp.getTarget(),
+        l2ToL1DmaOp.getTargetMixedOffsets(), l2ToL1DmaOp.getTargetMixedSizes(),
+        l2ToL1DmaOp.getTargetMixedStrides(), source,
+        llvm::ArrayRef(staticL2AsSourceOffsets),
+        llvm::ArrayRef(staticL2AsSourceSizes),
+        l2ToL1DmaOp.getSourceMixedStrides());
+    rewriter.replaceOp(l2ToL1DmaOp, newL2ToL1DmaOp);
+  }
+
+  for (Operation *op : toBeErased) {
+    op->dropAllUses();
+    rewriter.eraseOp(op);
+  }
+
+  return success();
+}
+
+}  // namespace mlir::iree_compiler::AMDAIE
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELogicalObjFifoSplittingUtils.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELogicalObjFifoSplittingUtils.h
new file mode 100644
index 000000000..919004949
--- /dev/null
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELogicalObjFifoSplittingUtils.h
@@ -0,0 +1,23 @@
+// Copyright 2024 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_AMD_AIE_TRANSFORMS_AMDAIELOGICALOBJFIFOSPLITTINGUTILS_H_
+#define IREE_AMD_AIE_TRANSFORMS_AMDAIELOGICALOBJFIFOSPLITTINGUTILS_H_
+
+#include "iree-amd-aie/IR/AMDAIEOps.h"
+
+namespace mlir::iree_compiler::AMDAIE {
+
+/// Utility to split the L2 logicalobjectfifo that the given L2->L1
+/// `amdaie.dma_cpy_nd` ops read from into one L2 logicalobjectfifo per op.
+/// Returns success without touching the IR if the split is not applicable.
+LogicalResult splitLogicalObjectFifos(
+    IRRewriter &rewriter, SmallVector<AMDAIE::DmaCpyNdOp> &l2ToL1DmaOps,
+    MLIRContext *context);
+
+}  // namespace mlir::iree_compiler::AMDAIE
+
+#endif
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitLogicalObjFifosForConnectionReuse.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitLogicalObjFifosForConnectionReuse.cpp
new file mode 100644
index 000000000..e6736a7c9
--- /dev/null
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitLogicalObjFifosForConnectionReuse.cpp
@@ -0,0 +1,80 @@
+// Copyright 2024 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#include "iree-amd-aie/IR/AMDAIEOps.h"
+#include "iree-amd-aie/Transforms/AMDAIELogicalObjFifoSplittingUtils.h"
+#include "iree-amd-aie/Transforms/Passes.h"
+#include "llvm/Support/Debug.h"
+#include "mlir/Dialect/Affine/IR/AffineOps.h"
+#include "mlir/Dialect/MemRef/IR/MemRef.h"
+#include "mlir/IR/Iterators.h"
+#include "mlir/Pass/Pass.h"
+
+#define DEBUG_TYPE "iree-amdaie-split-logical-objectfifos-for-connection-reuse"
+
+namespace mlir::iree_compiler::AMDAIE {
+
+namespace {
+
+/// Collects, for every `amdaie.core` op in `moduleOp` that has exactly three
+/// input DMAs, the `amdaie.dma_cpy_nd` op defining its third input. Cores
+/// with any other number of input DMAs contribute nothing.
+static SmallVector<AMDAIE::DmaCpyNdOp> fetchDmaCpyNdOpsToSplit(
+    ModuleOp moduleOp) {
+  SmallVector<AMDAIE::DmaCpyNdOp> l2ToL1DmaOps;
+  // We are currently walking through CoreOps gathering 3rd Input DmaOp (if
+  // applicable) from them.
+  // TODO(avarma): We will generalize this later.
+  moduleOp.walk([&](AMDAIE::CoreOp coreOp) {
+    SmallVector<Value> inputDmas = coreOp.getInputDmas();
+    if (inputDmas.size() != 3) return WalkResult::skip();
+    auto dmaCpyNdOp = inputDmas[2].getDefiningOp<AMDAIE::DmaCpyNdOp>();
+    assert(dmaCpyNdOp && "expected an amdaie.dma_cpy_nd op");
+    l2ToL1DmaOps.push_back(dmaCpyNdOp);
+    return WalkResult::advance();
+  });
+  return l2ToL1DmaOps;
+}
+
+/// Module pass that gathers the candidate L2->L1 DMA ops and hands them to
+/// `splitLogicalObjectFifos`; the pass fails iff that utility fails.
+class AMDAIESplitLogicalObjFifosForConnectionReusePass
+    : public impl::AMDAIESplitLogicalObjFifosForConnectionReuseBase<
+          AMDAIESplitLogicalObjFifosForConnectionReusePass> {
+ public:
+  using AMDAIESplitLogicalObjFifosForConnectionReuseBase::
+      AMDAIESplitLogicalObjFifosForConnectionReuseBase;
+
+  void getDependentDialects(DialectRegistry &registry) const override {
+    // `splitLogicalObjectFifos` creates `affine.apply` and
+    // `memref.alloc`/`memref.dealloc` ops in addition to AMDAIE ops.
+    registry.insert<AMDAIEDialect, affine::AffineDialect,
+                    memref::MemRefDialect>();
+  }
+  void runOnOperation() override;
+};
+
+void AMDAIESplitLogicalObjFifosForConnectionReusePass::runOnOperation() {
+  ModuleOp moduleOp = getOperation();
+  MLIRContext *context = &getContext();
+  IRRewriter rewriter(context);
+
+  SmallVector<AMDAIE::DmaCpyNdOp> l2ToL1DmaOps =
+      fetchDmaCpyNdOpsToSplit(moduleOp);
+
+  if (failed(splitLogicalObjectFifos(rewriter, l2ToL1DmaOps, context))) {
+    LLVM_DEBUG(llvm::dbgs()
+               << "Failed to perform splitting of logicalobjectfifos\n");
+    return signalPassFailure();
+  }
+}
+
+}  // namespace
+
+std::unique_ptr<Pass> createAMDAIESplitLogicalObjFifosForConnectionReusePass() {
+  return std::make_unique<AMDAIESplitLogicalObjFifosForConnectionReusePass>();
+}
+
+}  // namespace mlir::iree_compiler::AMDAIE
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/CMakeLists.txt b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/CMakeLists.txt
index d2f21f7d5..673763df4 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/CMakeLists.txt
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/CMakeLists.txt
@@ -74,6 +74,7 @@ iree_cc_library(
     "AMDAIEInsertLoopsForVectorization.cpp"
     "AMDAIELinkExecutables.cpp"
     "AMDAIELocalizeLogicalObjectFifo.cpp"
+    "AMDAIELogicalObjFifoSplittingUtils.cpp"
     "AMDAIELowerExecutableTarget.cpp"
"AMDAIELowerFuncArgs.cpp" "AMDAIELowerToAIE.cpp" @@ -88,6 +89,7 @@ iree_cc_library( "AMDAIEPeelForLoop.cpp" "AMDAIEPropagateDataLayout.cpp" "AMDAIESinkIntoCore.cpp" + "AMDAIESplitLogicalObjFifosForConnectionReuse.cpp" "AMDAIETile.cpp" "AMDAIETileAndFuse.cpp" "AMDAIEUtils.cpp" diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h index 9ef92c268..02918677e 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h @@ -68,6 +68,7 @@ namespace mlir::iree_compiler::AMDAIE { #define GEN_PASS_DEF_AMDAIEPEELFORLOOP #define GEN_PASS_DEF_AMDAIEPROPAGATEDATALAYOUT #define GEN_PASS_DEF_AMDAIESINKINTOCORE +#define GEN_PASS_DEF_AMDAIESPLITLOGICALOBJFIFOSFORCONNECTIONREUSE #define GEN_PASS_DEF_AMDAIETILE #define GEN_PASS_DEF_AMDAIETILEANDFUSE #define GEN_PASS_DEF_AMDAIEVECTORIZATION diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp index 90b1bcfd8..729da7457 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp @@ -594,6 +594,7 @@ void addAMDAIEObjectFifoLoweringPasses(OpPassManager &passManager) { passManager.addPass(createAMDAIEDistributeCoresAndObjectFifosPass()); passManager.addPass(createCSEPass()); passManager.addPass(createCanonicalizerPass()); + passManager.addPass(createAMDAIESplitLogicalObjFifosForConnectionReusePass()); passManager.addPass(createAMDAIEDmaToCircularDmaPass()); passManager.addNestedPass(createAMDAIECreateAIEWorkgroupPass()); diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h index 0a901bd5b..2632ee8fb 100644 --- 
a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h @@ -224,6 +224,9 @@ std::unique_ptr createAMDAIEPeelForLoopPass( /// Create a pass to sink all dependencies into `amdaie.core` operations. std::unique_ptr createAMDAIESinkIntoCorePass(); +/// Create a pass to split logicalobjectfifos for connection reuse. +std::unique_ptr createAMDAIESplitLogicalObjFifosForConnectionReusePass(); + /// Create pass to tile TilingInterface operations. std::unique_ptr createAMDAIETilePass(AMDAIETileOptions options = {}); diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td index 3da414445..fdc38eb12 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td @@ -470,6 +470,12 @@ def AMDAIESinkIntoCore : let constructor = "mlir::iree_compiler::AMDAIE::createAMDAIESinkIntoCorePass()"; } +def AMDAIESplitLogicalObjFifosForConnectionReuse : + Pass<"iree-amdaie-split-logical-objectfifos-for-connection-reuse", "ModuleOp"> { + let summary = "Pass to split L2 buffers to share inputs of Matmul and Elementwise operations."; + let constructor = "mlir::iree_compiler::AMDAIE::createAMDAIESplitLogicalObjFifosForConnectionReusePass()"; +} + def AMDAIETile : InterfacePass<"iree-amdaie-tile", "mlir::FunctionOpInterface"> { let summary = "Pass to tile TilingInterface operations."; diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt index 319166c69..b09d08f37 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt @@ -60,6 +60,7 @@ iree_lit_test_suite( "peel_for_loop.mlir" "propagate_data_layout.mlir" 
"sink_into_core.mlir"
+    "split_logicalobjfifos_for_connection_reuse.mlir"
     "tile_and_fuse_using_scf_for.mlir"
     "tile_and_fuse_using_scf_forall.mlir"
     "tile_copy_using_scf_for.mlir"
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/split_logicalobjfifos_for_connection_reuse.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/split_logicalobjfifos_for_connection_reuse.mlir
new file mode 100644
index 000000000..2e716dddd
--- /dev/null
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/split_logicalobjfifos_for_connection_reuse.mlir
@@ -0,0 +1,1398 @@
+// RUN: iree-opt --pass-pipeline="builtin.module(iree-amdaie-split-logical-objectfifos-for-connection-reuse,cse)" --split-input-file --verify-diagnostics %s | FileCheck %s
+
+// Glossary:
+// candidate core op     : an amdaie.core op that has exactly three input dma ops.
+// non-candidate core op : an amdaie.core op that has any other number of input dma ops.
+
+// Test non-candidate core op.
+ +// CHECK-LABEL: @split_l2_buffer_no_candidate_core_op +// CHECK-NOT: memref<1x1x32x32xi32, 1 : i32> +#map = affine_map<(d0) -> (d0 * 64)> +#map1 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)> +#map2 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)> +#map3 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)> +#map4 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)> +module { + func.func @split_l2_buffer_no_candidate_core_op(%arg0: !amdaie.logicalobjectfifo>, %arg1: !amdaie.logicalobjectfifo>, %arg2: !amdaie.logicalobjectfifo>, %arg3: !amdaie.logicalobjectfifo>) { + %c3 = arith.constant 3 : index + %c2 = arith.constant 2 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index + %c0_i32 = arith.constant 0 : i32 + %alloc = memref.alloc() : memref<2x1x32x32xi32, 1 : i32> + %alloc_1 = memref.alloc() : memref<2x2x32x32xi32, 1 : i32> + %alloc_2 = memref.alloc() : memref<128x128xi32> + %tile = amdaie.tile(%c1, %c3) + %0 = amdaie.logicalobjectfifo.from_memref %alloc_1, {%tile} : memref<2x2x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo> + %1 = amdaie.logicalobjectfifo.from_memref %alloc_2, {%tile} : memref<128x128xi32> -> !amdaie.logicalobjectfifo> + scf.forall (%arg4, %arg5) in (2, 2) { + %2 = affine.apply #map(%arg5) + %3 = affine.apply #map(%arg4) + %4 = amdaie.dma_cpy_nd(%0[0, 0, 0, 0] [2, 2, 32, 32] [2048, 1024, 32, 1], %1[0, 0, %3, %2] [2, 2, 32, 32] [4096, 32, 128, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %tile_4 = amdaie.tile(%c1, %c3) + %5 = amdaie.logicalobjectfifo.from_memref %alloc, {%tile} : memref<2x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo> + %7 = amdaie.dma_cpy_nd(%arg0[0, 0, 0, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1024, 1024, 256, 32, 8, 1], %5[1, 0, 0, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1024, 1024, 8, 128, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %12 = amdaie.core(%tile_4, in : [%7], out 
: []) { + %13 = amdaie.logicalobjectfifo.access(%arg0, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x4x8x4x8xi32, 2 : i32> + linalg.fill ins(%c0_i32 : i32) outs(%13 : memref<1x1x4x8x4x8xi32, 2 : i32>) + amdaie.end + } + } {mapping = [#gpu.block, #gpu.block]} + memref.dealloc %alloc : memref<2x1x32x32xi32, 1 : i32> + memref.dealloc %alloc_1 : memref<2x2x32x32xi32, 1 : i32> + memref.dealloc %alloc_2 : memref<128x128xi32> + return + } +} + +// ----- + +// Test two candidate core ops. + +// CHECK-DAG: #map = affine_map<(d0) -> (d0 * 64)> +// CHECK-DAG: #map1 = affine_map<(d0) -> (d0 * 64 + 32)> +// CHECK: @split_l2_buffer_two_core_ops +// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index +// CHECK-DAG: %[[C3:.*]] = arith.constant 3 : index +// CHECK-DAG: %[[L3_ALLOC:.*]] = memref.alloc() : memref<128x128xi32> +// CHECK-DAG: %[[L2_ALLOC_0:.*]] = memref.alloc() : memref<1x1x32x32xi32, 1 : i32> +// CHECK-DAG: %[[L2_ALLOC_1:.*]] = memref.alloc() : memref<1x1x32x32xi32, 1 : i32> +// CHECK-DAG: %[[L1_ALLOC:.*]] = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32> +// CHECK-DAG: %[[TILE_0:.*]] = amdaie.tile(%[[C1]], %[[C3]]) +// CHECK-DAG: %[[TILE_1:.*]] = amdaie.tile(%[[C0]], %[[C2]]) +// CHECK: %[[L2_OBJECTFIFO_0:.*]] = amdaie.logicalobjectfifo.from_memref %[[L2_ALLOC_0]], {%[[TILE_0]]} : +// CHECK-SAME: memref<1x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo> +// CHECK: %[[L2_OBJECTFIFO_1:.*]] = amdaie.logicalobjectfifo.from_memref %[[L2_ALLOC_1]], {%[[TILE_0]]} : +// CHECK-SAME: memref<1x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo> +// CHECK: %[[L3_OBJECTFIFO:.*]] = amdaie.logicalobjectfifo.from_memref %[[L3_ALLOC]], {%[[TILE_0]]} : +// CHECK-SAME: memref<128x128xi32> -> !amdaie.logicalobjectfifo> +// CHECK: scf.forall (%[[IV0:.*]], %[[IV1:.*]]) in (2, 2) +// CHECK-DAG: %[[IV1_0:.*]] = affine.apply #map(%[[IV1]]) +// CHECK-DAG: %[[IV1_32:.*]] = 
affine.apply #map1(%[[IV1]]) +// CHECK-DAG: %[[IV0_0:.*]] = affine.apply #map(%[[IV0]]) +// CHECK-DAG: %[[IV0_32:.*]] = affine.apply #map1(%[[IV0]]) +// CHECK: %[[DMA_CPY_ND_L3_TO_L2_0:.*]] = amdaie.dma_cpy_nd( +// CHECK-SAME: %[[L2_OBJECTFIFO_0]][0, 0, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1] +// CHECK-SAME: %[[L3_OBJECTFIFO]][0, 0, %[[IV0_0:.*]], %[[IV1_0:.*]]] [1, 1, 32, 32] [4096, 32, 128, 1] +// CHECK: %[[DMA_CPY_ND_L3_TO_L2_1:.*]] = amdaie.dma_cpy_nd( +// CHECK-SAME: %[[L2_OBJECTFIFO_1]][0, 0, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1] +// CHECK-SAME: %[[L3_OBJECTFIFO]][0, 0, %[[IV0_32:.*]], %[[IV1_32:.*]]] [1, 1, 32, 32] [4096, 32, 128, 1] +// CHECK: amdaie.logicalobjectfifo.from_memref +// CHECK: amdaie.logicalobjectfifo.from_memref +// CHECK: amdaie.dma_cpy_nd +// CHECK: amdaie.dma_cpy_nd +// CHECK: %[[L1_OBJECTFIFO_0:.*]] = amdaie.logicalobjectfifo.from_memref %[[L1_ALLOC]], {%[[TILE_0]]} +// CHECK: %[[DMA_CPY_ND_L2_TO_L1_0:.*]] = amdaie.dma_cpy_nd( +// CHECK-SAME: %[[L1_OBJECTFIFO_0]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1] +// CHECK-SAME: %[[L2_OBJECTFIFO_0]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1] +// CHECK: amdaie.core(%[[TILE_0]], in : [%{{.*}}, %{{.*}}, %[[DMA_CPY_ND_L2_TO_L1_0]]], out : +// CHECK: linalg.generic +// CHECK: } +// CHECK: %[[L1_OBJECTFIFO_1:.*]] = amdaie.logicalobjectfifo.from_memref %[[L1_ALLOC]], {%[[TILE_1]]} +// CHECK: %[[DMA_CPY_ND_L2_TO_L1_1:.*]] = amdaie.dma_cpy_nd( +// CHECK-SAME: %[[L1_OBJECTFIFO_1]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1] +// CHECK-SAME: %[[L2_OBJECTFIFO_1]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1] +// CHECK: amdaie.core(%[[TILE_1]], in : [%{{.*}}, %{{.*}}, %[[DMA_CPY_ND_L2_TO_L1_1]]], out : +// CHECK: linalg.generic +// CHECK: } +// CHECK: memref.dealloc %[[L2_ALLOC_0]] : memref<1x1x32x32xi32, 1 : i32> +// CHECK: memref.dealloc %[[L2_ALLOC_1]] : memref<1x1x32x32xi32, 1 : i32> +#map = affine_map<(d0) -> (d0 * 
64)> +#map1 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)> +#map2 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)> +#map3 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)> +#map4 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)> +module { + func.func @split_l2_buffer_two_core_ops(%arg0: !amdaie.logicalobjectfifo>, %arg1: !amdaie.logicalobjectfifo>, %arg2: !amdaie.logicalobjectfifo>, %arg3: !amdaie.logicalobjectfifo>) { + %c3 = arith.constant 3 : index + %c2 = arith.constant 2 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index + %alloc = memref.alloc() : memref<2x1x32x32xi32, 1 : i32> + %alloc_0 = memref.alloc() : memref<1x2x32x32xi32, 1 : i32> + %alloc_1 = memref.alloc() : memref<2x2x32x32xi32, 1 : i32> + %alloc_2 = memref.alloc() : memref<128x128xi32> + %alloc_3 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32> + %tile = amdaie.tile(%c1, %c3) + %tile_4 = amdaie.tile(%c0, %c2) + %0 = amdaie.logicalobjectfifo.from_memref %alloc_1, {%tile} : memref<2x2x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo> + %1 = amdaie.logicalobjectfifo.from_memref %alloc_2, {%tile} : memref<128x128xi32> -> !amdaie.logicalobjectfifo> + scf.forall (%arg4, %arg5) in (2, 2) { + %2 = affine.apply #map(%arg5) + %3 = affine.apply #map(%arg4) + %4 = amdaie.dma_cpy_nd(%0[0, 0, 0, 0] [2, 2, 32, 32] [2048, 1024, 32, 1], %1[0, 0, %3, %2] [2, 2, 32, 32] [4096, 32, 128, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %tile_5 = amdaie.tile(%c1, %c3) + %5 = amdaie.logicalobjectfifo.from_memref %alloc, {%tile} : memref<2x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo> + %6 = amdaie.logicalobjectfifo.from_memref %alloc_0, {%tile} : memref<1x2x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo> + %7 = amdaie.dma_cpy_nd(%arg0[0, 0, 0, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1024, 1024, 256, 32, 8, 1], %5[1, 0, 0, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1024, 1024, 8, 128, 32, 
1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %8 = amdaie.dma_cpy_nd(%arg1[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [1024, 1024, 128, 32, 4, 1], %6[0, 1, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [2048, 1024, 4, 256, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %9 = amdaie.logicalobjectfifo.from_memref %alloc_3, {%tile} : memref<1x1x8x8x4x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo> + %10 = amdaie.dma_cpy_nd(%9[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1], %0[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %11 = amdaie.dma_cpy_nd(%arg3[1, 1, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1], %arg2[0, 0, 0, 0] [8, 4, 8, 4] [16, 4, 128, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %12 = amdaie.core(%tile_5, in : [%7, %8, %10], out : [%11]) { + %16 = amdaie.logicalobjectfifo.access(%arg0, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x4x8x4x8xi32, 2 : i32> + %17 = amdaie.logicalobjectfifo.access(%arg1, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x8x4x8x4xi32, 2 : i32> + %18 = amdaie.logicalobjectfifo.access(%arg2, None) : !amdaie.logicalobjectfifo> -> memref<1x1x8x8x4x4xi32, 2 : i32> + linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%16, %17 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%18 : memref<1x1x8x8x4x4xi32, 2 : i32>) { + ^bb0(%in: i32, %in_6: i32, %out: i32): + %21 = arith.muli %in, %in_6 : i32 + %22 = arith.addi %out, %21 : i32 + linalg.yield %22 : i32 + } + %19 = amdaie.logicalobjectfifo.access(%arg2, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x8x8x4x4xi32, 2 : i32> + %20 = amdaie.logicalobjectfifo.access(%arg2, Write) : !amdaie.logicalobjectfifo> -> memref<1x1x8x8x4x4xi32, 2 : i32> + linalg.generic {indexing_maps = [#map4, #map4, #map4], 
iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%18, %19 : memref<1x1x8x8x4x4xi32, 2 : i32>, memref<1x1x8x8x4x4xi32, 2 : i32>) outs(%20 : memref<1x1x8x8x4x4xi32, 2 : i32>) { + ^bb0(%in: i32, %in_6: i32, %out: i32): + %21 = arith.addi %in, %in_6 : i32 + linalg.yield %21 : i32 + } + amdaie.end + } + %13 = amdaie.logicalobjectfifo.from_memref %alloc_3, {%tile_4} : memref<1x1x8x8x4x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo> + %14 = amdaie.dma_cpy_nd(%13[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1], %0[1, 1, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %15 = amdaie.core(%tile_4, in : [%7, %8, %14], out : [%11]) { + %16 = amdaie.logicalobjectfifo.access(%arg0, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x4x8x4x8xi32, 2 : i32> + %17 = amdaie.logicalobjectfifo.access(%arg1, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x8x4x8x4xi32, 2 : i32> + %18 = amdaie.logicalobjectfifo.access(%arg2, None) : !amdaie.logicalobjectfifo> -> memref<1x1x8x8x4x4xi32, 2 : i32> + linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%16, %17 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%18 : memref<1x1x8x8x4x4xi32, 2 : i32>) { + ^bb0(%in: i32, %in_6: i32, %out: i32): + %21 = arith.muli %in, %in_6 : i32 + %22 = arith.addi %out, %21 : i32 + linalg.yield %22 : i32 + } + %19 = amdaie.logicalobjectfifo.access(%arg2, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x8x8x4x4xi32, 2 : i32> + %20 = amdaie.logicalobjectfifo.access(%arg2, Write) : !amdaie.logicalobjectfifo> -> memref<1x1x8x8x4x4xi32, 2 : i32> + linalg.generic {indexing_maps = [#map4, #map4, #map4], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%18, %19 : memref<1x1x8x8x4x4xi32, 2 
: i32>, memref<1x1x8x8x4x4xi32, 2 : i32>) outs(%20 : memref<1x1x8x8x4x4xi32, 2 : i32>) { + ^bb0(%in: i32, %in_6: i32, %out: i32): + %21 = arith.addi %in, %in_6 : i32 + linalg.yield %21 : i32 + } + amdaie.end + } + } {mapping = [#gpu.block, #gpu.block]} + memref.dealloc %alloc : memref<2x1x32x32xi32, 1 : i32> + memref.dealloc %alloc_3 : memref<1x1x8x8x4x4xi32, 2 : i32> + memref.dealloc %alloc_0 : memref<1x2x32x32xi32, 1 : i32> + memref.dealloc %alloc_1 : memref<2x2x32x32xi32, 1 : i32> + memref.dealloc %alloc_2 : memref<128x128xi32> + return + } +} + +// ----- + +// Test mix of candidate and non-candidate core ops. + +// CHECK-DAG: #map = affine_map<(d0) -> (d0 * 64)> +// CHECK-DAG: #map1 = affine_map<(d0) -> (d0 * 64 + 32)> +// CHECK: @split_l2_buffer_mixed_core_ops +// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index +// CHECK-DAG: %[[C3:.*]] = arith.constant 3 : index +// CHECK-DAG: %[[L3_ALLOC:.*]] = memref.alloc() : memref<128x128xi32> +// CHECK-DAG: %[[L2_ALLOC_0:.*]] = memref.alloc() : memref<1x1x32x32xi32, 1 : i32> +// CHECK-DAG: %[[L2_ALLOC_1:.*]] = memref.alloc() : memref<1x1x32x32xi32, 1 : i32> +// CHECK-DAG: %[[L2_ALLOC_2:.*]] = memref.alloc() : memref<1x1x32x32xi32, 1 : i32> +// CHECK-DAG: %[[L2_ALLOC_3:.*]] = memref.alloc() : memref<1x1x32x32xi32, 1 : i32> +// CHECK-NOT: memref.alloc() : memref<1x1x32x32xi32, 1 : i32> +// CHECK-NOT: memref.alloc() : memref<1x1x32x32xi32, 1 : i32> +// CHECK-DAG: %[[L1_ALLOC:.*]] = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32> +// CHECK-DAG: %[[TILE_0:.*]] = amdaie.tile(%[[C1]], %[[C3]]) +// CHECK-DAG: %[[TILE_1:.*]] = amdaie.tile(%[[C0]], %[[C2]]) +// CHECK-DAG: %[[TILE_2:.*]] = amdaie.tile(%[[C1]], %[[C2]]) +// CHECK-DAG: %[[TILE_3:.*]] = amdaie.tile(%[[C0]], %[[C3]]) +// CHECK: %[[L2_OBJECTFIFO_0:.*]] = amdaie.logicalobjectfifo.from_memref %[[L2_ALLOC_0]], {%[[TILE_0]]} : +// CHECK-SAME: memref<1x1x32x32xi32, 1 
: i32> -> !amdaie.logicalobjectfifo<memref<1x1x32x32xi32, 1 : i32>> +// CHECK: %[[L2_OBJECTFIFO_1:.*]] = amdaie.logicalobjectfifo.from_memref %[[L2_ALLOC_1]], {%[[TILE_0]]} : +// CHECK-SAME: memref<1x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x32x32xi32, 1 : i32>> +// CHECK: %[[L2_OBJECTFIFO_2:.*]] = amdaie.logicalobjectfifo.from_memref %[[L2_ALLOC_2]], {%[[TILE_0]]} : +// CHECK-SAME: memref<1x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x32x32xi32, 1 : i32>> +// CHECK: %[[L2_OBJECTFIFO_3:.*]] = amdaie.logicalobjectfifo.from_memref %[[L2_ALLOC_3]], {%[[TILE_0]]} : +// CHECK-SAME: memref<1x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x32x32xi32, 1 : i32>> +// CHECK: %[[L3_OBJECTFIFO:.*]] = amdaie.logicalobjectfifo.from_memref %[[L3_ALLOC]], {%[[TILE_0]]} : +// CHECK-SAME: memref<128x128xi32> -> !amdaie.logicalobjectfifo<memref<128x128xi32>> +// CHECK: scf.forall (%[[IV0:.*]], %[[IV1:.*]]) in (2, 2) +// CHECK-DAG: %[[IV1_0:.*]] = affine.apply #map(%[[IV1]]) +// CHECK-DAG: %[[IV1_32:.*]] = affine.apply #map1(%[[IV1]]) +// CHECK-DAG: %[[IV0_0:.*]] = affine.apply #map(%[[IV0]]) +// CHECK-DAG: %[[IV0_32:.*]] = affine.apply #map1(%[[IV0]]) +// CHECK: %[[DMA_CPY_ND_L3_TO_L2_0:.*]] = amdaie.dma_cpy_nd( +// CHECK-SAME: %[[L2_OBJECTFIFO_0]][0, 0, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1] +// CHECK-SAME: %[[L3_OBJECTFIFO]][0, 0, %[[IV0_0]], %[[IV1_0]]] [1, 1, 32, 32] [4096, 32, 128, 1] +// CHECK: %[[DMA_CPY_ND_L3_TO_L2_1:.*]] = amdaie.dma_cpy_nd( +// CHECK-SAME: %[[L2_OBJECTFIFO_1]][0, 0, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1] +// CHECK-SAME: %[[L3_OBJECTFIFO]][0, 0, %[[IV0_0]], %[[IV1_32]]] [1, 1, 32, 32] [4096, 32, 128, 1] +// CHECK: %[[DMA_CPY_ND_L3_TO_L2_2:.*]] = amdaie.dma_cpy_nd( +// CHECK-SAME: %[[L2_OBJECTFIFO_2]][0, 0, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1] +// CHECK-SAME: %[[L3_OBJECTFIFO]][0, 0, %[[IV0_32]], %[[IV1_0]]] [1, 1, 32, 32] [4096, 32, 128, 1] +// CHECK: %[[DMA_CPY_ND_L3_TO_L2_3:.*]] = amdaie.dma_cpy_nd( +// CHECK-SAME: %[[L2_OBJECTFIFO_3]][0, 0, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1] +// CHECK-SAME: %[[L3_OBJECTFIFO]][0, 0, %[[IV0_32]], 
%[[IV1_32]]] [1, 1, 32, 32] [4096, 32, 128, 1] +// CHECK: amdaie.logicalobjectfifo.from_memref +// CHECK: amdaie.logicalobjectfifo.from_memref +// CHECK: amdaie.dma_cpy_nd +// CHECK: amdaie.dma_cpy_nd +// CHECK: amdaie.core(%[[TILE_0]] +// CHECK: linalg.fill +// CHECK: amdaie.end +// CHECK: } +// CHECK: %[[L1_OBJECTFIFO_0:.*]] = amdaie.logicalobjectfifo.from_memref %[[L1_ALLOC]], {%[[TILE_1]]} +// CHECK: %[[DMA_CPY_ND_L2_TO_L1_0:.*]] = amdaie.dma_cpy_nd( +// CHECK-SAME: %[[L1_OBJECTFIFO_0]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1] +// CHECK-SAME: %[[L2_OBJECTFIFO_0]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1] +// CHECK: amdaie.core(%[[TILE_1]], in : [%{{.*}}, %{{.*}}, %[[DMA_CPY_ND_L2_TO_L1_0]]], out : +// CHECK: linalg.generic +// CHECK: amdaie.end +// CHECK: } +// CHECK: amdaie.core(%[[TILE_2]] +// CHECK: linalg.fill +// CHECK: amdaie.end +// CHECK: } +// CHECK: %[[L1_OBJECTFIFO_1:.*]] = amdaie.logicalobjectfifo.from_memref %[[L1_ALLOC]], {%[[TILE_0]]} +// CHECK: %[[DMA_CPY_ND_L2_TO_L1_1:.*]] = amdaie.dma_cpy_nd( +// CHECK-SAME: %[[L1_OBJECTFIFO_1]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1] +// CHECK-SAME: %[[L2_OBJECTFIFO_1]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1] +// CHECK: amdaie.core(%[[TILE_0]], in : [%{{.*}}, %{{.*}}, %[[DMA_CPY_ND_L2_TO_L1_1]]], out : +// CHECK: linalg.generic +// CHECK: amdaie.end +// CHECK: } +// CHECK: %[[L1_OBJECTFIFO_2:.*]] = amdaie.logicalobjectfifo.from_memref %[[L1_ALLOC]], {%[[TILE_2]]} +// CHECK: %[[DMA_CPY_ND_L2_TO_L1_2:.*]] = amdaie.dma_cpy_nd( +// CHECK-SAME: %[[L1_OBJECTFIFO_2]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1] +// CHECK-SAME: %[[L2_OBJECTFIFO_2]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1] +// CHECK: amdaie.core(%[[TILE_2]], in : [%{{.*}}, %{{.*}}, %[[DMA_CPY_ND_L2_TO_L1_2]]], out : +// CHECK: linalg.generic +// CHECK: amdaie.end +// CHECK: } +// CHECK: %[[L1_OBJECTFIFO_3:.*]] = 
amdaie.logicalobjectfifo.from_memref %[[L1_ALLOC]], {%[[TILE_3]]} +// CHECK: %[[DMA_CPY_ND_L2_TO_L1_3:.*]] = amdaie.dma_cpy_nd( +// CHECK-SAME: %[[L1_OBJECTFIFO_3]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1] +// CHECK-SAME: %[[L2_OBJECTFIFO_3]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1] +// CHECK: amdaie.core(%[[TILE_3]], in : [%{{.*}}, %{{.*}}, %[[DMA_CPY_ND_L2_TO_L1_3]]], out : +// CHECK: linalg.generic +// CHECK: amdaie.end +// CHECK: } +// CHECK-DAG: memref.dealloc %[[L2_ALLOC_0]] : memref<1x1x32x32xi32, 1 : i32> +// CHECK-DAG: memref.dealloc %[[L2_ALLOC_1]] : memref<1x1x32x32xi32, 1 : i32> +// CHECK-DAG: memref.dealloc %[[L2_ALLOC_2]] : memref<1x1x32x32xi32, 1 : i32> +// CHECK-DAG: memref.dealloc %[[L2_ALLOC_3]] : memref<1x1x32x32xi32, 1 : i32> +// CHECK-NOT: memref.dealloc %{{.*}} : memref<1x1x32x32xi32, 1 : i32> +// CHECK-NOT: memref.dealloc %{{.*}} : memref<1x1x32x32xi32, 1 : i32> +#map = affine_map<(d0) -> (d0 * 64)> +#map1 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)> +#map2 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)> +#map3 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)> +#map4 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)> +module { + func.func @split_l2_buffer_mixed_core_ops(%arg0: !amdaie.logicalobjectfifo>, %arg1: !amdaie.logicalobjectfifo>, %arg2: !amdaie.logicalobjectfifo>, %arg3: !amdaie.logicalobjectfifo>) { + %c3 = arith.constant 3 : index + %c2 = arith.constant 2 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index + %c0_i32 = arith.constant 0 : i32 + %alloc = memref.alloc() : memref<2x1x32x32xi32, 1 : i32> + %alloc_0 = memref.alloc() : memref<1x2x32x32xi32, 1 : i32> + %alloc_1 = memref.alloc() : memref<2x2x32x32xi32, 1 : i32> + %alloc_2 = memref.alloc() : memref<128x128xi32> + %alloc_3 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32> + %tile = amdaie.tile(%c1, 
%c3) + %tile_4 = amdaie.tile(%c0, %c2) + %tile_5 = amdaie.tile(%c1, %c2) + %tile_6 = amdaie.tile(%c0, %c3) + %0 = amdaie.logicalobjectfifo.from_memref %alloc_1, {%tile} : memref<2x2x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo> + %1 = amdaie.logicalobjectfifo.from_memref %alloc_2, {%tile} : memref<128x128xi32> -> !amdaie.logicalobjectfifo> + scf.forall (%arg4, %arg5) in (2, 2) { + %2 = affine.apply #map(%arg5) + %3 = affine.apply #map(%arg4) + %4 = amdaie.dma_cpy_nd(%0[0, 0, 0, 0] [2, 2, 32, 32] [2048, 1024, 32, 1], %1[0, 0, %3, %2] [2, 2, 32, 32] [4096, 32, 128, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %tile_7 = amdaie.tile(%c1, %c3) + %5 = amdaie.logicalobjectfifo.from_memref %alloc, {%tile} : memref<2x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo> + %6 = amdaie.logicalobjectfifo.from_memref %alloc_0, {%tile} : memref<1x2x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo> + %7 = amdaie.dma_cpy_nd(%arg0[0, 0, 0, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1024, 1024, 256, 32, 8, 1], %5[1, 0, 0, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1024, 1024, 8, 128, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %8 = amdaie.dma_cpy_nd(%arg1[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [1024, 1024, 128, 32, 4, 1], %6[0, 1, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [2048, 1024, 4, 256, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %9 = amdaie.dma_cpy_nd(%arg3[1, 1, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1], %arg2[0, 0, 0, 0] [8, 4, 8, 4] [16, 4, 128, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %10 = amdaie.core(%tile_7, in : [%7], out : []) { + %24 = amdaie.logicalobjectfifo.access(%arg0, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x4x8x4x8xi32, 2 : i32> + linalg.fill ins(%c0_i32 : i32) outs(%24 : memref<1x1x4x8x4x8xi32, 2 : i32>) + amdaie.end + } + %11 = amdaie.logicalobjectfifo.from_memref %alloc_3, {%tile_4} : memref<1x1x8x8x4x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo> + %12 = amdaie.dma_cpy_nd(%11[0, 0, 0, 0, 0, 0] 
[1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1], %0[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %13 = amdaie.core(%tile_4, in : [%7, %8, %12], out : [%9]) { + %24 = amdaie.logicalobjectfifo.access(%arg0, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x4x8x4x8xi32, 2 : i32> + %25 = amdaie.logicalobjectfifo.access(%arg1, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x8x4x8x4xi32, 2 : i32> + %26 = amdaie.logicalobjectfifo.access(%arg2, None) : !amdaie.logicalobjectfifo> -> memref<1x1x8x8x4x4xi32, 2 : i32> + linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%24, %25 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%26 : memref<1x1x8x8x4x4xi32, 2 : i32>) { + ^bb0(%in: i32, %in_8: i32, %out: i32): + %29 = arith.muli %in, %in_8 : i32 + %30 = arith.addi %out, %29 : i32 + linalg.yield %30 : i32 + } + %27 = amdaie.logicalobjectfifo.access(%arg2, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x8x8x4x4xi32, 2 : i32> + %28 = amdaie.logicalobjectfifo.access(%arg2, Write) : !amdaie.logicalobjectfifo> -> memref<1x1x8x8x4x4xi32, 2 : i32> + linalg.generic {indexing_maps = [#map4, #map4, #map4], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%26, %27 : memref<1x1x8x8x4x4xi32, 2 : i32>, memref<1x1x8x8x4x4xi32, 2 : i32>) outs(%28 : memref<1x1x8x8x4x4xi32, 2 : i32>) { + ^bb0(%in: i32, %in_8: i32, %out: i32): + %29 = arith.addi %in, %in_8 : i32 + linalg.yield %29 : i32 + } + amdaie.end + } + %14 = amdaie.core(%tile_5, in : [%7], out : []) { + %24 = amdaie.logicalobjectfifo.access(%arg0, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x4x8x4x8xi32, 2 : i32> + linalg.fill ins(%c0_i32 : i32) outs(%24 : memref<1x1x4x8x4x8xi32, 2 : i32>) + amdaie.end + } + %15 = amdaie.logicalobjectfifo.from_memref 
%alloc_3, {%tile} : memref<1x1x8x8x4x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo> + %16 = amdaie.dma_cpy_nd(%15[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1], %0[0, 1, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %17 = amdaie.core(%tile, in : [%7, %8, %16], out : [%9]) { + %24 = amdaie.logicalobjectfifo.access(%arg0, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x4x8x4x8xi32, 2 : i32> + %25 = amdaie.logicalobjectfifo.access(%arg1, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x8x4x8x4xi32, 2 : i32> + %26 = amdaie.logicalobjectfifo.access(%arg2, None) : !amdaie.logicalobjectfifo> -> memref<1x1x8x8x4x4xi32, 2 : i32> + linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%24, %25 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%26 : memref<1x1x8x8x4x4xi32, 2 : i32>) { + ^bb0(%in: i32, %in_8: i32, %out: i32): + %29 = arith.muli %in, %in_8 : i32 + %30 = arith.addi %out, %29 : i32 + linalg.yield %30 : i32 + } + %27 = amdaie.logicalobjectfifo.access(%arg2, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x8x8x4x4xi32, 2 : i32> + %28 = amdaie.logicalobjectfifo.access(%arg2, Write) : !amdaie.logicalobjectfifo> -> memref<1x1x8x8x4x4xi32, 2 : i32> + linalg.generic {indexing_maps = [#map4, #map4, #map4], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%26, %27 : memref<1x1x8x8x4x4xi32, 2 : i32>, memref<1x1x8x8x4x4xi32, 2 : i32>) outs(%28 : memref<1x1x8x8x4x4xi32, 2 : i32>) { + ^bb0(%in: i32, %in_8: i32, %out: i32): + %29 = arith.addi %in, %in_8 : i32 + linalg.yield %29 : i32 + } + amdaie.end + } + %18 = amdaie.logicalobjectfifo.from_memref %alloc_3, {%tile_5} : memref<1x1x8x8x4x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo> + %19 = amdaie.dma_cpy_nd(%18[0, 0, 0, 0, 0, 0] [1, 
1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1], %0[1, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %20 = amdaie.core(%tile_5, in : [%7, %8, %19], out : [%9]) { + %24 = amdaie.logicalobjectfifo.access(%arg0, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x4x8x4x8xi32, 2 : i32> + %25 = amdaie.logicalobjectfifo.access(%arg1, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x8x4x8x4xi32, 2 : i32> + %26 = amdaie.logicalobjectfifo.access(%arg2, None) : !amdaie.logicalobjectfifo> -> memref<1x1x8x8x4x4xi32, 2 : i32> + linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%24, %25 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%26 : memref<1x1x8x8x4x4xi32, 2 : i32>) { + ^bb0(%in: i32, %in_8: i32, %out: i32): + %29 = arith.muli %in, %in_8 : i32 + %30 = arith.addi %out, %29 : i32 + linalg.yield %30 : i32 + } + %27 = amdaie.logicalobjectfifo.access(%arg2, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x8x8x4x4xi32, 2 : i32> + %28 = amdaie.logicalobjectfifo.access(%arg2, Write) : !amdaie.logicalobjectfifo> -> memref<1x1x8x8x4x4xi32, 2 : i32> + linalg.generic {indexing_maps = [#map4, #map4, #map4], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%26, %27 : memref<1x1x8x8x4x4xi32, 2 : i32>, memref<1x1x8x8x4x4xi32, 2 : i32>) outs(%28 : memref<1x1x8x8x4x4xi32, 2 : i32>) { + ^bb0(%in: i32, %in_8: i32, %out: i32): + %29 = arith.addi %in, %in_8 : i32 + linalg.yield %29 : i32 + } + amdaie.end + } + %21 = amdaie.logicalobjectfifo.from_memref %alloc_3, {%tile_6} : memref<1x1x8x8x4x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo> + %22 = amdaie.dma_cpy_nd(%21[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1], %0[1, 1, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1]) : 
(!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %23 = amdaie.core(%tile_6, in : [%7, %8, %22], out : [%9]) { + %24 = amdaie.logicalobjectfifo.access(%arg0, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x4x8x4x8xi32, 2 : i32> + %25 = amdaie.logicalobjectfifo.access(%arg1, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x8x4x8x4xi32, 2 : i32> + %26 = amdaie.logicalobjectfifo.access(%arg2, None) : !amdaie.logicalobjectfifo> -> memref<1x1x8x8x4x4xi32, 2 : i32> + linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%24, %25 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%26 : memref<1x1x8x8x4x4xi32, 2 : i32>) { + ^bb0(%in: i32, %in_8: i32, %out: i32): + %29 = arith.muli %in, %in_8 : i32 + %30 = arith.addi %out, %29 : i32 + linalg.yield %30 : i32 + } + %27 = amdaie.logicalobjectfifo.access(%arg2, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x8x8x4x4xi32, 2 : i32> + %28 = amdaie.logicalobjectfifo.access(%arg2, Write) : !amdaie.logicalobjectfifo> -> memref<1x1x8x8x4x4xi32, 2 : i32> + linalg.generic {indexing_maps = [#map4, #map4, #map4], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%26, %27 : memref<1x1x8x8x4x4xi32, 2 : i32>, memref<1x1x8x8x4x4xi32, 2 : i32>) outs(%28 : memref<1x1x8x8x4x4xi32, 2 : i32>) { + ^bb0(%in: i32, %in_8: i32, %out: i32): + %29 = arith.addi %in, %in_8 : i32 + linalg.yield %29 : i32 + } + amdaie.end + } + } {mapping = [#gpu.block, #gpu.block]} + memref.dealloc %alloc : memref<2x1x32x32xi32, 1 : i32> + memref.dealloc %alloc_3 : memref<1x1x8x8x4x4xi32, 2 : i32> + memref.dealloc %alloc_0 : memref<1x2x32x32xi32, 1 : i32> + memref.dealloc %alloc_1 : memref<2x2x32x32xi32, 1 : i32> + memref.dealloc %alloc_2 : memref<128x128xi32> + return + } +} + +// ----- + +// Test splitting buffer logic even if we don't have affine.apply ops and 
have mere constants +// in L3 source's offset. + +// CHECK: @split_l2_buffer_mixed_core_ops_l3_source_as_constant +// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index +// CHECK-DAG: %[[C3:.*]] = arith.constant 3 : index +// CHECK-DAG: %[[L3_ALLOC:.*]] = memref.alloc() : memref<128x128xi32> +// CHECK-DAG: %[[L2_ALLOC_0:.*]] = memref.alloc() : memref<1x1x32x32xi32, 1 : i32> +// CHECK-DAG: %[[L2_ALLOC_1:.*]] = memref.alloc() : memref<1x1x32x32xi32, 1 : i32> +// CHECK-DAG: %[[L2_ALLOC_2:.*]] = memref.alloc() : memref<1x1x32x32xi32, 1 : i32> +// CHECK-DAG: %[[L2_ALLOC_3:.*]] = memref.alloc() : memref<1x1x32x32xi32, 1 : i32> +// CHECK-NOT: memref.alloc() : memref<1x1x32x32xi32, 1 : i32> +// CHECK-NOT: memref.alloc() : memref<1x1x32x32xi32, 1 : i32> +// CHECK-DAG: %[[L1_ALLOC:.*]] = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32> +// CHECK-DAG: %[[TILE_0:.*]] = amdaie.tile(%[[C1]], %[[C3]]) +// CHECK-DAG: %[[TILE_1:.*]] = amdaie.tile(%[[C0]], %[[C2]]) +// CHECK-DAG: %[[TILE_2:.*]] = amdaie.tile(%[[C1]], %[[C2]]) +// CHECK-DAG: %[[TILE_3:.*]] = amdaie.tile(%[[C0]], %[[C3]]) +// CHECK: %[[L2_OBJECTFIFO_0:.*]] = amdaie.logicalobjectfifo.from_memref %[[L2_ALLOC_0]], {%[[TILE_0]]} : +// CHECK-SAME: memref<1x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo> +// CHECK: %[[L2_OBJECTFIFO_1:.*]] = amdaie.logicalobjectfifo.from_memref %[[L2_ALLOC_1]], {%[[TILE_0]]} : +// CHECK-SAME: memref<1x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo> +// CHECK: %[[L2_OBJECTFIFO_2:.*]] = amdaie.logicalobjectfifo.from_memref %[[L2_ALLOC_2]], {%[[TILE_0]]} : +// CHECK-SAME: memref<1x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo> +// CHECK: %[[L2_OBJECTFIFO_3:.*]] = amdaie.logicalobjectfifo.from_memref %[[L2_ALLOC_3]], {%[[TILE_0]]} : +// CHECK-SAME: memref<1x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo> +// CHECK: %[[L3_OBJECTFIFO:.*]] = amdaie.logicalobjectfifo.from_memref 
%[[L3_ALLOC]], {%[[TILE_0]]} : +// CHECK-SAME: memref<128x128xi32> -> !amdaie.logicalobjectfifo<memref<128x128xi32>> +// CHECK: scf.forall (%[[IV0:.*]], %[[IV1:.*]]) in (2, 2) +// CHECK: %[[DMA_CPY_ND_L3_TO_L2_0:.*]] = amdaie.dma_cpy_nd( +// CHECK-SAME: %[[L2_OBJECTFIFO_0]][0, 0, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1] +// CHECK-SAME: %[[L3_OBJECTFIFO]][0, 0, 3, 2] [1, 1, 32, 32] [4096, 32, 128, 1] +// CHECK: %[[DMA_CPY_ND_L3_TO_L2_1:.*]] = amdaie.dma_cpy_nd( +// CHECK-SAME: %[[L2_OBJECTFIFO_1]][0, 0, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1] +// CHECK-SAME: %[[L3_OBJECTFIFO]][0, 0, 3, 34] [1, 1, 32, 32] [4096, 32, 128, 1] +// CHECK: %[[DMA_CPY_ND_L3_TO_L2_2:.*]] = amdaie.dma_cpy_nd( +// CHECK-SAME: %[[L2_OBJECTFIFO_2]][0, 0, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1] +// CHECK-SAME: %[[L3_OBJECTFIFO]][0, 0, 35, 2] [1, 1, 32, 32] [4096, 32, 128, 1] +// CHECK: %[[DMA_CPY_ND_L3_TO_L2_3:.*]] = amdaie.dma_cpy_nd( +// CHECK-SAME: %[[L2_OBJECTFIFO_3]][0, 0, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1] +// CHECK-SAME: %[[L3_OBJECTFIFO]][0, 0, 35, 34] [1, 1, 32, 32] [4096, 32, 128, 1] +// CHECK: amdaie.logicalobjectfifo.from_memref +// CHECK: amdaie.logicalobjectfifo.from_memref +// CHECK: amdaie.dma_cpy_nd +// CHECK: amdaie.dma_cpy_nd +// CHECK: amdaie.core(%[[TILE_0]] +// CHECK: linalg.fill +// CHECK: amdaie.end +// CHECK: } +// CHECK: %[[L1_OBJECTFIFO_0:.*]] = amdaie.logicalobjectfifo.from_memref %[[L1_ALLOC]], {%[[TILE_1]]} +// CHECK: %[[DMA_CPY_ND_L2_TO_L1_0:.*]] = amdaie.dma_cpy_nd( +// CHECK-SAME: %[[L1_OBJECTFIFO_0]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1] +// CHECK-SAME: %[[L2_OBJECTFIFO_0]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1] +// CHECK: amdaie.core(%[[TILE_1]], in : [%{{.*}}, %{{.*}}, %[[DMA_CPY_ND_L2_TO_L1_0]]], out : +// CHECK: linalg.generic +// CHECK: amdaie.end +// CHECK: } +// CHECK: amdaie.core(%[[TILE_2]] +// CHECK: linalg.fill +// CHECK: amdaie.end +// CHECK: } +// CHECK: %[[L1_OBJECTFIFO_1:.*]] = 
amdaie.logicalobjectfifo.from_memref %[[L1_ALLOC]], {%[[TILE_0]]} +// CHECK: %[[DMA_CPY_ND_L2_TO_L1_1:.*]] = amdaie.dma_cpy_nd( +// CHECK-SAME: %[[L1_OBJECTFIFO_1]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1] +// CHECK-SAME: %[[L2_OBJECTFIFO_1]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1] +// CHECK: amdaie.core(%[[TILE_0]], in : [%{{.*}}, %{{.*}}, %[[DMA_CPY_ND_L2_TO_L1_1]]], out : +// CHECK: linalg.generic +// CHECK: amdaie.end +// CHECK: } +// CHECK: %[[L1_OBJECTFIFO_2:.*]] = amdaie.logicalobjectfifo.from_memref %[[L1_ALLOC]], {%[[TILE_2]]} +// CHECK: %[[DMA_CPY_ND_L2_TO_L1_2:.*]] = amdaie.dma_cpy_nd( +// CHECK-SAME: %[[L1_OBJECTFIFO_2]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1] +// CHECK-SAME: %[[L2_OBJECTFIFO_2]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1] +// CHECK: amdaie.core(%[[TILE_2]], in : [%{{.*}}, %{{.*}}, %[[DMA_CPY_ND_L2_TO_L1_2]]], out : +// CHECK: linalg.generic +// CHECK: amdaie.end +// CHECK: } +// CHECK: %[[L1_OBJECTFIFO_3:.*]] = amdaie.logicalobjectfifo.from_memref %[[L1_ALLOC]], {%[[TILE_3]]} +// CHECK: %[[DMA_CPY_ND_L2_TO_L1_3:.*]] = amdaie.dma_cpy_nd( +// CHECK-SAME: %[[L1_OBJECTFIFO_3]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1] +// CHECK-SAME: %[[L2_OBJECTFIFO_3]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1] +// CHECK: amdaie.core(%[[TILE_3]], in : [%{{.*}}, %{{.*}}, %[[DMA_CPY_ND_L2_TO_L1_3]]], out : +// CHECK: linalg.generic +// CHECK: amdaie.end +// CHECK: } +// CHECK-DAG: memref.dealloc %[[L2_ALLOC_0]] : memref<1x1x32x32xi32, 1 : i32> +// CHECK-DAG: memref.dealloc %[[L2_ALLOC_1]] : memref<1x1x32x32xi32, 1 : i32> +// CHECK-DAG: memref.dealloc %[[L2_ALLOC_2]] : memref<1x1x32x32xi32, 1 : i32> +// CHECK-DAG: memref.dealloc %[[L2_ALLOC_3]] : memref<1x1x32x32xi32, 1 : i32> +// CHECK-NOT: memref.dealloc %{{.*}} : memref<1x1x32x32xi32, 1 : i32> +// CHECK-NOT: memref.dealloc %{{.*}} : memref<1x1x32x32xi32, 1 : i32> 
+#map = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)> +#map1 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)> +#map2 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)> +#map3 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)> +module { + func.func @split_l2_buffer_mixed_core_ops_l3_source_as_constant(%arg0: !amdaie.logicalobjectfifo>, %arg1: !amdaie.logicalobjectfifo>, %arg2: !amdaie.logicalobjectfifo>, %arg3: !amdaie.logicalobjectfifo>) { + %c3 = arith.constant 3 : index + %c2 = arith.constant 2 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index + %c0_i32 = arith.constant 0 : i32 + %alloc = memref.alloc() : memref<2x1x32x32xi32, 1 : i32> + %alloc_0 = memref.alloc() : memref<1x2x32x32xi32, 1 : i32> + %alloc_1 = memref.alloc() : memref<2x2x32x32xi32, 1 : i32> + %alloc_2 = memref.alloc() : memref<128x128xi32> + %alloc_3 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32> + %tile = amdaie.tile(%c1, %c3) + %tile_4 = amdaie.tile(%c0, %c2) + %tile_5 = amdaie.tile(%c1, %c2) + %tile_6 = amdaie.tile(%c0, %c3) + %0 = amdaie.logicalobjectfifo.from_memref %alloc_1, {%tile} : memref<2x2x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo> + %1 = amdaie.logicalobjectfifo.from_memref %alloc_2, {%tile} : memref<128x128xi32> -> !amdaie.logicalobjectfifo> + scf.forall (%arg4, %arg5) in (2, 2) { + %2 = amdaie.dma_cpy_nd(%0[0, 0, 0, 0] [2, 2, 32, 32] [2048, 1024, 32, 1], %1[0, 0, %c3, %c2] [2, 2, 32, 32] [4096, 32, 128, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %tile_7 = amdaie.tile(%c1, %c3) + %3 = amdaie.logicalobjectfifo.from_memref %alloc, {%tile} : memref<2x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo> + %4 = amdaie.logicalobjectfifo.from_memref %alloc_0, {%tile} : memref<1x2x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo> + %5 = amdaie.dma_cpy_nd(%arg0[0, 0, 0, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1024, 1024, 256, 32, 8, 1], %3[1, 0, 
0, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1024, 1024, 8, 128, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %6 = amdaie.dma_cpy_nd(%arg1[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [1024, 1024, 128, 32, 4, 1], %4[0, 1, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [2048, 1024, 4, 256, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %7 = amdaie.dma_cpy_nd(%arg3[1, 1, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1], %arg2[0, 0, 0, 0] [8, 4, 8, 4] [16, 4, 128, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %8 = amdaie.core(%tile_7, in : [%5], out : []) { + %22 = amdaie.logicalobjectfifo.access(%arg0, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x4x8x4x8xi32, 2 : i32> + linalg.fill ins(%c0_i32 : i32) outs(%22 : memref<1x1x4x8x4x8xi32, 2 : i32>) + amdaie.end + } + %9 = amdaie.logicalobjectfifo.from_memref %alloc_3, {%tile_4} : memref<1x1x8x8x4x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo> + %10 = amdaie.dma_cpy_nd(%9[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1], %0[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %11 = amdaie.core(%tile_4, in : [%5, %6, %10], out : [%7]) { + %22 = amdaie.logicalobjectfifo.access(%arg0, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x4x8x4x8xi32, 2 : i32> + %23 = amdaie.logicalobjectfifo.access(%arg1, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x8x4x8x4xi32, 2 : i32> + %24 = amdaie.logicalobjectfifo.access(%arg2, None) : !amdaie.logicalobjectfifo> -> memref<1x1x8x8x4x4xi32, 2 : i32> + linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%22, %23 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%24 : memref<1x1x8x8x4x4xi32, 2 : i32>) { + ^bb0(%in: i32, %in_8: i32, %out: i32): + %27 = arith.muli %in, %in_8 : i32 + %28 = arith.addi %out, %27 : i32 + 
linalg.yield %28 : i32 + } + %25 = amdaie.logicalobjectfifo.access(%arg2, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x8x8x4x4xi32, 2 : i32> + %26 = amdaie.logicalobjectfifo.access(%arg2, Write) : !amdaie.logicalobjectfifo> -> memref<1x1x8x8x4x4xi32, 2 : i32> + linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%24, %25 : memref<1x1x8x8x4x4xi32, 2 : i32>, memref<1x1x8x8x4x4xi32, 2 : i32>) outs(%26 : memref<1x1x8x8x4x4xi32, 2 : i32>) { + ^bb0(%in: i32, %in_8: i32, %out: i32): + %27 = arith.addi %in, %in_8 : i32 + linalg.yield %27 : i32 + } + amdaie.end + } + %12 = amdaie.core(%tile_5, in : [%5], out : []) { + %22 = amdaie.logicalobjectfifo.access(%arg0, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x4x8x4x8xi32, 2 : i32> + linalg.fill ins(%c0_i32 : i32) outs(%22 : memref<1x1x4x8x4x8xi32, 2 : i32>) + amdaie.end + } + %13 = amdaie.logicalobjectfifo.from_memref %alloc_3, {%tile} : memref<1x1x8x8x4x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo> + %14 = amdaie.dma_cpy_nd(%13[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1], %0[0, 1, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %15 = amdaie.core(%tile, in : [%5, %6, %14], out : [%7]) { + %22 = amdaie.logicalobjectfifo.access(%arg0, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x4x8x4x8xi32, 2 : i32> + %23 = amdaie.logicalobjectfifo.access(%arg1, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x8x4x8x4xi32, 2 : i32> + %24 = amdaie.logicalobjectfifo.access(%arg2, None) : !amdaie.logicalobjectfifo> -> memref<1x1x8x8x4x4xi32, 2 : i32> + linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%22, %23 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%24 : 
memref<1x1x8x8x4x4xi32, 2 : i32>) { + ^bb0(%in: i32, %in_8: i32, %out: i32): + %27 = arith.muli %in, %in_8 : i32 + %28 = arith.addi %out, %27 : i32 + linalg.yield %28 : i32 + } + %25 = amdaie.logicalobjectfifo.access(%arg2, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x8x8x4x4xi32, 2 : i32> + %26 = amdaie.logicalobjectfifo.access(%arg2, Write) : !amdaie.logicalobjectfifo> -> memref<1x1x8x8x4x4xi32, 2 : i32> + linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%24, %25 : memref<1x1x8x8x4x4xi32, 2 : i32>, memref<1x1x8x8x4x4xi32, 2 : i32>) outs(%26 : memref<1x1x8x8x4x4xi32, 2 : i32>) { + ^bb0(%in: i32, %in_8: i32, %out: i32): + %27 = arith.addi %in, %in_8 : i32 + linalg.yield %27 : i32 + } + amdaie.end + } + %16 = amdaie.logicalobjectfifo.from_memref %alloc_3, {%tile_5} : memref<1x1x8x8x4x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo> + %17 = amdaie.dma_cpy_nd(%16[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1], %0[1, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %18 = amdaie.core(%tile_5, in : [%5, %6, %17], out : [%7]) { + %22 = amdaie.logicalobjectfifo.access(%arg0, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x4x8x4x8xi32, 2 : i32> + %23 = amdaie.logicalobjectfifo.access(%arg1, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x8x4x8x4xi32, 2 : i32> + %24 = amdaie.logicalobjectfifo.access(%arg2, None) : !amdaie.logicalobjectfifo> -> memref<1x1x8x8x4x4xi32, 2 : i32> + linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%22, %23 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%24 : memref<1x1x8x8x4x4xi32, 2 : i32>) { + ^bb0(%in: i32, %in_8: i32, %out: i32): + %27 = arith.muli %in, %in_8 : i32 + %28 = arith.addi 
%out, %27 : i32 + linalg.yield %28 : i32 + } + %25 = amdaie.logicalobjectfifo.access(%arg2, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x8x8x4x4xi32, 2 : i32> + %26 = amdaie.logicalobjectfifo.access(%arg2, Write) : !amdaie.logicalobjectfifo> -> memref<1x1x8x8x4x4xi32, 2 : i32> + linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%24, %25 : memref<1x1x8x8x4x4xi32, 2 : i32>, memref<1x1x8x8x4x4xi32, 2 : i32>) outs(%26 : memref<1x1x8x8x4x4xi32, 2 : i32>) { + ^bb0(%in: i32, %in_8: i32, %out: i32): + %27 = arith.addi %in, %in_8 : i32 + linalg.yield %27 : i32 + } + amdaie.end + } + %19 = amdaie.logicalobjectfifo.from_memref %alloc_3, {%tile_6} : memref<1x1x8x8x4x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo> + %20 = amdaie.dma_cpy_nd(%19[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1], %0[1, 1, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %21 = amdaie.core(%tile_6, in : [%5, %6, %20], out : [%7]) { + %22 = amdaie.logicalobjectfifo.access(%arg0, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x4x8x4x8xi32, 2 : i32> + %23 = amdaie.logicalobjectfifo.access(%arg1, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x8x4x8x4xi32, 2 : i32> + %24 = amdaie.logicalobjectfifo.access(%arg2, None) : !amdaie.logicalobjectfifo> -> memref<1x1x8x8x4x4xi32, 2 : i32> + linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%22, %23 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%24 : memref<1x1x8x8x4x4xi32, 2 : i32>) { + ^bb0(%in: i32, %in_8: i32, %out: i32): + %27 = arith.muli %in, %in_8 : i32 + %28 = arith.addi %out, %27 : i32 + linalg.yield %28 : i32 + } + %25 = amdaie.logicalobjectfifo.access(%arg2, Read) : !amdaie.logicalobjectfifo> -> 
memref<1x1x8x8x4x4xi32, 2 : i32> + %26 = amdaie.logicalobjectfifo.access(%arg2, Write) : !amdaie.logicalobjectfifo> -> memref<1x1x8x8x4x4xi32, 2 : i32> + linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%24, %25 : memref<1x1x8x8x4x4xi32, 2 : i32>, memref<1x1x8x8x4x4xi32, 2 : i32>) outs(%26 : memref<1x1x8x8x4x4xi32, 2 : i32>) { + ^bb0(%in: i32, %in_8: i32, %out: i32): + %27 = arith.addi %in, %in_8 : i32 + linalg.yield %27 : i32 + } + amdaie.end + } + } {mapping = [#gpu.block, #gpu.block]} + memref.dealloc %alloc : memref<2x1x32x32xi32, 1 : i32> + memref.dealloc %alloc_3 : memref<1x1x8x8x4x4xi32, 2 : i32> + memref.dealloc %alloc_0 : memref<1x2x32x32xi32, 1 : i32> + memref.dealloc %alloc_1 : memref<2x2x32x32xi32, 1 : i32> + memref.dealloc %alloc_2 : memref<128x128xi32> + return + } +} + +// ----- + +// Test splitting of logicalobjectfifo when offset of L3 source is an attribute. + +// CHECK: @attribute_offset_for_l3_source +// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index +// CHECK-DAG: %[[C3:.*]] = arith.constant 3 : index +// CHECK-DAG: %[[L3_ALLOC:.*]] = memref.alloc() : memref<128x128xi32> +// CHECK-DAG: %[[L2_ALLOC_0:.*]] = memref.alloc() : memref<1x1x32x32xi32, 1 : i32> +// CHECK-DAG: %[[L2_ALLOC_1:.*]] = memref.alloc() : memref<1x1x32x32xi32, 1 : i32> +// CHECK-DAG: %[[L1_ALLOC:.*]] = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32> +// CHECK-DAG: %[[TILE_0:.*]] = amdaie.tile(%[[C1]], %[[C3]]) +// CHECK-DAG: %[[TILE_1:.*]] = amdaie.tile(%[[C0]], %[[C2]]) +// CHECK: %[[L2_OBJECTFIFO_0:.*]] = amdaie.logicalobjectfifo.from_memref %[[L2_ALLOC_0]], {%[[TILE_0]]} : +// CHECK-SAME: memref<1x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo> +// CHECK: %[[L2_OBJECTFIFO_1:.*]] = amdaie.logicalobjectfifo.from_memref %[[L2_ALLOC_1]], {%[[TILE_0]]} : +// 
CHECK-SAME: memref<1x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo> +// CHECK: %[[L3_OBJECTFIFO:.*]] = amdaie.logicalobjectfifo.from_memref %[[L3_ALLOC]], {%[[TILE_0]]} : +// CHECK-SAME: memref<128x128xi32> -> !amdaie.logicalobjectfifo> +// CHECK: scf.forall +// CHECK: %[[DMA_CPY_ND_L3_TO_L2_0:.*]] = amdaie.dma_cpy_nd( +// CHECK-SAME: %[[L2_OBJECTFIFO_0]][0, 0, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1] +// CHECK-SAME: %[[L3_OBJECTFIFO]][0, 0, 3, 2] [1, 1, 32, 32] [4096, 32, 128, 1] +// CHECK: %[[DMA_CPY_ND_L3_TO_L2_1:.*]] = amdaie.dma_cpy_nd( +// CHECK-SAME: %[[L2_OBJECTFIFO_1]][0, 0, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1] +// CHECK-SAME: %[[L3_OBJECTFIFO]][0, 0, 35, 34] [1, 1, 32, 32] [4096, 32, 128, 1] +// CHECK: amdaie.logicalobjectfifo.from_memref +// CHECK: amdaie.logicalobjectfifo.from_memref +// CHECK: amdaie.dma_cpy_nd +// CHECK: amdaie.dma_cpy_nd +// CHECK: %[[L1_OBJECTFIFO_0:.*]] = amdaie.logicalobjectfifo.from_memref %[[L1_ALLOC]], {%[[TILE_0]]} +// CHECK: %[[DMA_CPY_ND_L2_TO_L1_0:.*]] = amdaie.dma_cpy_nd( +// CHECK-SAME: %[[L1_OBJECTFIFO_0]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1] +// CHECK-SAME: %[[L2_OBJECTFIFO_0]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1] +// CHECK: amdaie.core(%[[TILE_0]], in : [%{{.*}}, %{{.*}}, %[[DMA_CPY_ND_L2_TO_L1_0]]], out : +// CHECK: linalg.generic +// CHECK: } +// CHECK: %[[L1_OBJECTFIFO_1:.*]] = amdaie.logicalobjectfifo.from_memref %[[L1_ALLOC]], {%[[TILE_1]]} +// CHECK: %[[DMA_CPY_ND_L2_TO_L1_1:.*]] = amdaie.dma_cpy_nd( +// CHECK-SAME: %[[L1_OBJECTFIFO_1]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1] +// CHECK-SAME: %[[L2_OBJECTFIFO_1]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1] +// CHECK: amdaie.core(%[[TILE_1]], in : [%{{.*}}, %{{.*}}, %[[DMA_CPY_ND_L2_TO_L1_1]]], out : +// CHECK: linalg.generic +// CHECK: } +// CHECK: memref.dealloc %[[L2_ALLOC_0]] : memref<1x1x32x32xi32, 1 : i32> +// CHECK: memref.dealloc %[[L2_ALLOC_1]] : 
memref<1x1x32x32xi32, 1 : i32> +#map = affine_map<(d0) -> (d0 * 64)> +#map1 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)> +#map2 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)> +#map3 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)> +#map4 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)> +module { + func.func @attribute_offset_for_l3_source(%arg0: !amdaie.logicalobjectfifo>, %arg1: !amdaie.logicalobjectfifo>, %arg2: !amdaie.logicalobjectfifo>, %arg3: !amdaie.logicalobjectfifo>) { + %c3 = arith.constant 3 : index + %c2 = arith.constant 2 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index + %alloc = memref.alloc() : memref<2x1x32x32xi32, 1 : i32> + %alloc_0 = memref.alloc() : memref<1x2x32x32xi32, 1 : i32> + %alloc_1 = memref.alloc() : memref<2x2x32x32xi32, 1 : i32> + %alloc_2 = memref.alloc() : memref<128x128xi32> + %alloc_3 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32> + %tile = amdaie.tile(%c1, %c3) + %tile_4 = amdaie.tile(%c0, %c2) + %0 = amdaie.logicalobjectfifo.from_memref %alloc_1, {%tile} : memref<2x2x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo> + %1 = amdaie.logicalobjectfifo.from_memref %alloc_2, {%tile} : memref<128x128xi32> -> !amdaie.logicalobjectfifo> + scf.forall (%arg4, %arg5) in (2, 2) { + %4 = amdaie.dma_cpy_nd(%0[0, 0, 0, 0] [2, 2, 32, 32] [2048, 1024, 32, 1], %1[0, 0, 3, 2] [2, 2, 32, 32] [4096, 32, 128, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %tile_5 = amdaie.tile(%c1, %c3) + %5 = amdaie.logicalobjectfifo.from_memref %alloc, {%tile} : memref<2x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo> + %6 = amdaie.logicalobjectfifo.from_memref %alloc_0, {%tile} : memref<1x2x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo> + %7 = amdaie.dma_cpy_nd(%arg0[0, 0, 0, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1024, 1024, 256, 32, 8, 1], %5[1, 0, 0, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1024, 1024, 8, 128, 32, 
1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %8 = amdaie.dma_cpy_nd(%arg1[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [1024, 1024, 128, 32, 4, 1], %6[0, 1, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [2048, 1024, 4, 256, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %9 = amdaie.logicalobjectfifo.from_memref %alloc_3, {%tile} : memref<1x1x8x8x4x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo> + %10 = amdaie.dma_cpy_nd(%9[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1], %0[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %11 = amdaie.dma_cpy_nd(%arg3[1, 1, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1], %arg2[0, 0, 0, 0] [8, 4, 8, 4] [16, 4, 128, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %12 = amdaie.core(%tile_5, in : [%7, %8, %10], out : [%11]) { + %16 = amdaie.logicalobjectfifo.access(%arg0, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x4x8x4x8xi32, 2 : i32> + %17 = amdaie.logicalobjectfifo.access(%arg1, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x8x4x8x4xi32, 2 : i32> + %18 = amdaie.logicalobjectfifo.access(%arg2, None) : !amdaie.logicalobjectfifo> -> memref<1x1x8x8x4x4xi32, 2 : i32> + linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%16, %17 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%18 : memref<1x1x8x8x4x4xi32, 2 : i32>) { + ^bb0(%in: i32, %in_6: i32, %out: i32): + %21 = arith.muli %in, %in_6 : i32 + %22 = arith.addi %out, %21 : i32 + linalg.yield %22 : i32 + } + %19 = amdaie.logicalobjectfifo.access(%arg2, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x8x8x4x4xi32, 2 : i32> + %20 = amdaie.logicalobjectfifo.access(%arg2, Write) : !amdaie.logicalobjectfifo> -> memref<1x1x8x8x4x4xi32, 2 : i32> + linalg.generic {indexing_maps = [#map4, #map4, #map4], 
iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%18, %19 : memref<1x1x8x8x4x4xi32, 2 : i32>, memref<1x1x8x8x4x4xi32, 2 : i32>) outs(%20 : memref<1x1x8x8x4x4xi32, 2 : i32>) { + ^bb0(%in: i32, %in_6: i32, %out: i32): + %21 = arith.addi %in, %in_6 : i32 + linalg.yield %21 : i32 + } + amdaie.end + } + %13 = amdaie.logicalobjectfifo.from_memref %alloc_3, {%tile_4} : memref<1x1x8x8x4x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo> + %14 = amdaie.dma_cpy_nd(%13[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1], %0[1, 1, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %15 = amdaie.core(%tile_4, in : [%7, %8, %14], out : [%11]) { + %16 = amdaie.logicalobjectfifo.access(%arg0, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x4x8x4x8xi32, 2 : i32> + %17 = amdaie.logicalobjectfifo.access(%arg1, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x8x4x8x4xi32, 2 : i32> + %18 = amdaie.logicalobjectfifo.access(%arg2, None) : !amdaie.logicalobjectfifo> -> memref<1x1x8x8x4x4xi32, 2 : i32> + linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%16, %17 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%18 : memref<1x1x8x8x4x4xi32, 2 : i32>) { + ^bb0(%in: i32, %in_6: i32, %out: i32): + %21 = arith.muli %in, %in_6 : i32 + %22 = arith.addi %out, %21 : i32 + linalg.yield %22 : i32 + } + %19 = amdaie.logicalobjectfifo.access(%arg2, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x8x8x4x4xi32, 2 : i32> + %20 = amdaie.logicalobjectfifo.access(%arg2, Write) : !amdaie.logicalobjectfifo> -> memref<1x1x8x8x4x4xi32, 2 : i32> + linalg.generic {indexing_maps = [#map4, #map4, #map4], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%18, %19 : memref<1x1x8x8x4x4xi32, 2 
: i32>, memref<1x1x8x8x4x4xi32, 2 : i32>) outs(%20 : memref<1x1x8x8x4x4xi32, 2 : i32>) { + ^bb0(%in: i32, %in_6: i32, %out: i32): + %21 = arith.addi %in, %in_6 : i32 + linalg.yield %21 : i32 + } + amdaie.end + } + } {mapping = [#gpu.block, #gpu.block]} + memref.dealloc %alloc : memref<2x1x32x32xi32, 1 : i32> + memref.dealloc %alloc_3 : memref<1x1x8x8x4x4xi32, 2 : i32> + memref.dealloc %alloc_0 : memref<1x2x32x32xi32, 1 : i32> + memref.dealloc %alloc_1 : memref<2x2x32x32xi32, 1 : i32> + memref.dealloc %alloc_2 : memref<128x128xi32> + return + } +} + +// ----- + +// Test splitting of logicalobjectfifo when offset of L3 source is a function argument. + +// CHECK-DAG: #map = affine_map<(d0) -> (d0 + 32)> +// CHECK-DAG: #map1 = affine_map<(d0) -> (d0)> +// CHECK: @block_argument_of_funcOp_offset +// CHECK-SAME: %[[ARG0:.*]]: index, %[[ARG1:.*]]: index, +// CHECK-DAG: %[[IV1_32:.*]] = affine.apply #map(%[[ARG1]]) +// CHECK-DAG: %[[IV0_32:.*]] = affine.apply #map(%[[ARG0]]) +// CHECK-DAG: %[[IV1_0:.*]] = affine.apply #map1(%[[ARG1]]) +// CHECK-DAG: %[[IV0_0:.*]] = affine.apply #map1(%[[ARG0]]) +// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index +// CHECK-DAG: %[[C3:.*]] = arith.constant 3 : index +// CHECK-DAG: %[[L3_ALLOC:.*]] = memref.alloc() : memref<128x128xi32> +// CHECK-DAG: %[[L2_ALLOC_0:.*]] = memref.alloc() : memref<1x1x32x32xi32, 1 : i32> +// CHECK-DAG: %[[L2_ALLOC_1:.*]] = memref.alloc() : memref<1x1x32x32xi32, 1 : i32> +// CHECK-DAG: %[[L1_ALLOC:.*]] = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32> +// CHECK-DAG: %[[TILE_0:.*]] = amdaie.tile(%[[C1]], %[[C3]]) +// CHECK-DAG: %[[TILE_1:.*]] = amdaie.tile(%[[C0]], %[[C2]]) +// CHECK: %[[L2_OBJECTFIFO_0:.*]] = amdaie.logicalobjectfifo.from_memref %[[L2_ALLOC_0]], {%[[TILE_0]]} : +// CHECK-SAME: memref<1x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo> +// CHECK: %[[L2_OBJECTFIFO_1:.*]] = 
amdaie.logicalobjectfifo.from_memref %[[L2_ALLOC_1]], {%[[TILE_0]]} : +// CHECK-SAME: memref<1x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x32x32xi32, 1 : i32>> +// CHECK: %[[L3_OBJECTFIFO:.*]] = amdaie.logicalobjectfifo.from_memref %[[L3_ALLOC]], {%[[TILE_0]]} : +// CHECK-SAME: memref<128x128xi32> -> !amdaie.logicalobjectfifo<memref<128x128xi32>> +// CHECK: scf.forall (%[[IV0:.*]], %[[IV1:.*]]) in (2, 2) +// CHECK: %[[DMA_CPY_ND_L3_TO_L2_0:.*]] = amdaie.dma_cpy_nd( +// CHECK-SAME: %[[L2_OBJECTFIFO_0]][0, 0, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1] +// CHECK-SAME: %[[L3_OBJECTFIFO]][0, 0, %[[IV0_0]], %[[IV1_0]]] [1, 1, 32, 32] [4096, 32, 128, 1] +// CHECK: %[[DMA_CPY_ND_L3_TO_L2_1:.*]] = amdaie.dma_cpy_nd( +// CHECK-SAME: %[[L2_OBJECTFIFO_1]][0, 0, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1] +// CHECK-SAME: %[[L3_OBJECTFIFO]][0, 0, %[[IV0_32]], %[[IV1_32]]] [1, 1, 32, 32] [4096, 32, 128, 1] +// CHECK: amdaie.logicalobjectfifo.from_memref +// CHECK: amdaie.logicalobjectfifo.from_memref +// CHECK: amdaie.dma_cpy_nd +// CHECK: amdaie.dma_cpy_nd +// CHECK: %[[L1_OBJECTFIFO_0:.*]] = amdaie.logicalobjectfifo.from_memref %[[L1_ALLOC]], {%[[TILE_0]]} +// CHECK: %[[DMA_CPY_ND_L2_TO_L1_0:.*]] = amdaie.dma_cpy_nd( +// CHECK-SAME: %[[L1_OBJECTFIFO_0]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1] +// CHECK-SAME: %[[L2_OBJECTFIFO_0]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1] +// CHECK: amdaie.core(%[[TILE_0]], in : [%{{.*}}, %{{.*}}, %[[DMA_CPY_ND_L2_TO_L1_0]]], out : +// CHECK: linalg.generic +// CHECK: } +// CHECK: %[[L1_OBJECTFIFO_1:.*]] = amdaie.logicalobjectfifo.from_memref %[[L1_ALLOC]], {%[[TILE_1]]} +// CHECK: %[[DMA_CPY_ND_L2_TO_L1_1:.*]] = amdaie.dma_cpy_nd( +// CHECK-SAME: %[[L1_OBJECTFIFO_1]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1] +// CHECK-SAME: %[[L2_OBJECTFIFO_1]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1] +// CHECK: amdaie.core(%[[TILE_1]], in : [%{{.*}}, %{{.*}}, %[[DMA_CPY_ND_L2_TO_L1_1]]], out : 
+// CHECK: linalg.generic +// CHECK: } +// CHECK: memref.dealloc %[[L2_ALLOC_0]] : memref<1x1x32x32xi32, 1 : i32> +// CHECK: memref.dealloc %[[L2_ALLOC_1]] : memref<1x1x32x32xi32, 1 : i32> +#map = affine_map<(d0) -> (d0 * 64)> +#map1 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)> +#map2 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)> +#map3 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)> +#map4 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)> +module { + func.func @block_argument_of_funcOp_offset(%3: index, %2: index, %arg0: !amdaie.logicalobjectfifo>, %arg1: !amdaie.logicalobjectfifo>, %arg2: !amdaie.logicalobjectfifo>, %arg3: !amdaie.logicalobjectfifo>) { + %c3 = arith.constant 3 : index + %c2 = arith.constant 2 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index + %alloc = memref.alloc() : memref<2x1x32x32xi32, 1 : i32> + %alloc_0 = memref.alloc() : memref<1x2x32x32xi32, 1 : i32> + %alloc_1 = memref.alloc() : memref<2x2x32x32xi32, 1 : i32> + %alloc_2 = memref.alloc() : memref<128x128xi32> + %alloc_3 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32> + %tile = amdaie.tile(%c1, %c3) + %tile_4 = amdaie.tile(%c0, %c2) + %0 = amdaie.logicalobjectfifo.from_memref %alloc_1, {%tile} : memref<2x2x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo> + %1 = amdaie.logicalobjectfifo.from_memref %alloc_2, {%tile} : memref<128x128xi32> -> !amdaie.logicalobjectfifo> + scf.forall (%arg4, %arg5) in (2, 2) { + %4 = amdaie.dma_cpy_nd(%0[0, 0, 0, 0] [2, 2, 32, 32] [2048, 1024, 32, 1], %1[0, 0, %3, %2] [2, 2, 32, 32] [4096, 32, 128, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %tile_5 = amdaie.tile(%c1, %c3) + %5 = amdaie.logicalobjectfifo.from_memref %alloc, {%tile} : memref<2x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo> + %6 = amdaie.logicalobjectfifo.from_memref %alloc_0, {%tile} : memref<1x2x32x32xi32, 1 : i32> -> 
!amdaie.logicalobjectfifo> + %7 = amdaie.dma_cpy_nd(%arg0[0, 0, 0, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1024, 1024, 256, 32, 8, 1], %5[1, 0, 0, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1024, 1024, 8, 128, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %8 = amdaie.dma_cpy_nd(%arg1[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [1024, 1024, 128, 32, 4, 1], %6[0, 1, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [2048, 1024, 4, 256, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %9 = amdaie.logicalobjectfifo.from_memref %alloc_3, {%tile} : memref<1x1x8x8x4x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo> + %10 = amdaie.dma_cpy_nd(%9[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1], %0[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %11 = amdaie.dma_cpy_nd(%arg3[1, 1, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1], %arg2[0, 0, 0, 0] [8, 4, 8, 4] [16, 4, 128, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %12 = amdaie.core(%tile_5, in : [%7, %8, %10], out : [%11]) { + %16 = amdaie.logicalobjectfifo.access(%arg0, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x4x8x4x8xi32, 2 : i32> + %17 = amdaie.logicalobjectfifo.access(%arg1, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x8x4x8x4xi32, 2 : i32> + %18 = amdaie.logicalobjectfifo.access(%arg2, None) : !amdaie.logicalobjectfifo> -> memref<1x1x8x8x4x4xi32, 2 : i32> + linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%16, %17 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%18 : memref<1x1x8x8x4x4xi32, 2 : i32>) { + ^bb0(%in: i32, %in_6: i32, %out: i32): + %21 = arith.muli %in, %in_6 : i32 + %22 = arith.addi %out, %21 : i32 + linalg.yield %22 : i32 + } + %19 = amdaie.logicalobjectfifo.access(%arg2, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x8x8x4x4xi32, 
2 : i32> + %20 = amdaie.logicalobjectfifo.access(%arg2, Write) : !amdaie.logicalobjectfifo> -> memref<1x1x8x8x4x4xi32, 2 : i32> + linalg.generic {indexing_maps = [#map4, #map4, #map4], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%18, %19 : memref<1x1x8x8x4x4xi32, 2 : i32>, memref<1x1x8x8x4x4xi32, 2 : i32>) outs(%20 : memref<1x1x8x8x4x4xi32, 2 : i32>) { + ^bb0(%in: i32, %in_6: i32, %out: i32): + %21 = arith.addi %in, %in_6 : i32 + linalg.yield %21 : i32 + } + amdaie.end + } + %13 = amdaie.logicalobjectfifo.from_memref %alloc_3, {%tile_4} : memref<1x1x8x8x4x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo> + %14 = amdaie.dma_cpy_nd(%13[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1], %0[1, 1, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %15 = amdaie.core(%tile_4, in : [%7, %8, %14], out : [%11]) { + %16 = amdaie.logicalobjectfifo.access(%arg0, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x4x8x4x8xi32, 2 : i32> + %17 = amdaie.logicalobjectfifo.access(%arg1, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x8x4x8x4xi32, 2 : i32> + %18 = amdaie.logicalobjectfifo.access(%arg2, None) : !amdaie.logicalobjectfifo> -> memref<1x1x8x8x4x4xi32, 2 : i32> + linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%16, %17 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%18 : memref<1x1x8x8x4x4xi32, 2 : i32>) { + ^bb0(%in: i32, %in_6: i32, %out: i32): + %21 = arith.muli %in, %in_6 : i32 + %22 = arith.addi %out, %21 : i32 + linalg.yield %22 : i32 + } + %19 = amdaie.logicalobjectfifo.access(%arg2, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x8x8x4x4xi32, 2 : i32> + %20 = amdaie.logicalobjectfifo.access(%arg2, Write) : !amdaie.logicalobjectfifo> -> memref<1x1x8x8x4x4xi32, 2 : i32> + 
linalg.generic {indexing_maps = [#map4, #map4, #map4], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%18, %19 : memref<1x1x8x8x4x4xi32, 2 : i32>, memref<1x1x8x8x4x4xi32, 2 : i32>) outs(%20 : memref<1x1x8x8x4x4xi32, 2 : i32>) { + ^bb0(%in: i32, %in_6: i32, %out: i32): + %21 = arith.addi %in, %in_6 : i32 + linalg.yield %21 : i32 + } + amdaie.end + } + } {mapping = [#gpu.block, #gpu.block]} + memref.dealloc %alloc : memref<2x1x32x32xi32, 1 : i32> + memref.dealloc %alloc_3 : memref<1x1x8x8x4x4xi32, 2 : i32> + memref.dealloc %alloc_0 : memref<1x2x32x32xi32, 1 : i32> + memref.dealloc %alloc_1 : memref<2x2x32x32xi32, 1 : i32> + memref.dealloc %alloc_2 : memref<128x128xi32> + return + } +} + +// ----- + +// Test splitting of logicalobjectfifo when offset of L3 source is a forall induction variable. + +// CHECK-DAG: #map = affine_map<(d0) -> (d0 + 32)> +// CHECK-DAG: #map1 = affine_map<(d0) -> (d0)> +// CHECK: @block_argument_of_scf_forall_offset +// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index +// CHECK-DAG: %[[C3:.*]] = arith.constant 3 : index +// CHECK-DAG: %[[L3_ALLOC:.*]] = memref.alloc() : memref<128x128xi32> +// CHECK-DAG: %[[L2_ALLOC_0:.*]] = memref.alloc() : memref<1x1x32x32xi32, 1 : i32> +// CHECK-DAG: %[[L2_ALLOC_1:.*]] = memref.alloc() : memref<1x1x32x32xi32, 1 : i32> +// CHECK-DAG: %[[L1_ALLOC:.*]] = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32> +// CHECK-DAG: %[[TILE_0:.*]] = amdaie.tile(%[[C1]], %[[C3]]) +// CHECK-DAG: %[[TILE_1:.*]] = amdaie.tile(%[[C0]], %[[C2]]) +// CHECK: %[[L2_OBJECTFIFO_0:.*]] = amdaie.logicalobjectfifo.from_memref %[[L2_ALLOC_0]], {%[[TILE_0]]} : +// CHECK-SAME: memref<1x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo> +// CHECK: %[[L2_OBJECTFIFO_1:.*]] = amdaie.logicalobjectfifo.from_memref %[[L2_ALLOC_1]], {%[[TILE_0]]} : +// CHECK-SAME: memref<1x1x32x32xi32, 1 : i32> 
-> !amdaie.logicalobjectfifo<memref<1x1x32x32xi32, 1 : i32>> +// CHECK: %[[L3_OBJECTFIFO:.*]] = amdaie.logicalobjectfifo.from_memref %[[L3_ALLOC]], {%[[TILE_0]]} : +// CHECK-SAME: memref<128x128xi32> -> !amdaie.logicalobjectfifo<memref<128x128xi32>> +// CHECK: scf.forall (%[[IV0:.*]], %[[IV1:.*]]) in (2, 2) +// CHECK-DAG: %[[IV1_32:.*]] = affine.apply #map(%[[IV1]]) +// CHECK-DAG: %[[IV1_0:.*]] = affine.apply #map1(%[[IV1]]) +// CHECK-DAG: %[[IV0_32:.*]] = affine.apply #map(%[[IV0]]) +// CHECK-DAG: %[[IV0_0:.*]] = affine.apply #map1(%[[IV0]]) +// CHECK: %[[DMA_CPY_ND_L3_TO_L2_0:.*]] = amdaie.dma_cpy_nd( +// CHECK-SAME: %[[L2_OBJECTFIFO_0]][0, 0, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1] +// CHECK-SAME: %[[L3_OBJECTFIFO]][0, 0, %[[IV0_0]], %[[IV1_0]]] [1, 1, 32, 32] [4096, 32, 128, 1] +// CHECK: %[[DMA_CPY_ND_L3_TO_L2_1:.*]] = amdaie.dma_cpy_nd( +// CHECK-SAME: %[[L2_OBJECTFIFO_1]][0, 0, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1] +// CHECK-SAME: %[[L3_OBJECTFIFO]][0, 0, %[[IV0_32]], %[[IV1_32]]] [1, 1, 32, 32] [4096, 32, 128, 1] +// CHECK: amdaie.logicalobjectfifo.from_memref +// CHECK: amdaie.logicalobjectfifo.from_memref +// CHECK: amdaie.dma_cpy_nd +// CHECK: amdaie.dma_cpy_nd +// CHECK: %[[L1_OBJECTFIFO_0:.*]] = amdaie.logicalobjectfifo.from_memref %[[L1_ALLOC]], {%[[TILE_0]]} +// CHECK: %[[DMA_CPY_ND_L2_TO_L1_0:.*]] = amdaie.dma_cpy_nd( +// CHECK-SAME: %[[L1_OBJECTFIFO_0]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1] +// CHECK-SAME: %[[L2_OBJECTFIFO_0]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1] +// CHECK: amdaie.core(%[[TILE_0]], in : [%{{.*}}, %{{.*}}, %[[DMA_CPY_ND_L2_TO_L1_0]]], out : +// CHECK: linalg.generic +// CHECK: } +// CHECK: %[[L1_OBJECTFIFO_1:.*]] = amdaie.logicalobjectfifo.from_memref %[[L1_ALLOC]], {%[[TILE_1]]} +// CHECK: %[[DMA_CPY_ND_L2_TO_L1_1:.*]] = amdaie.dma_cpy_nd( +// CHECK-SAME: %[[L1_OBJECTFIFO_1]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1] +// CHECK-SAME: %[[L2_OBJECTFIFO_1]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] 
[2048, 1024, 4, 128, 32, 1] +// CHECK: amdaie.core(%[[TILE_1]], in : [%{{.*}}, %{{.*}}, %[[DMA_CPY_ND_L2_TO_L1_1]]], out : +// CHECK: linalg.generic +// CHECK: } +// CHECK: memref.dealloc %[[L2_ALLOC_0]] : memref<1x1x32x32xi32, 1 : i32> +// CHECK: memref.dealloc %[[L2_ALLOC_1]] : memref<1x1x32x32xi32, 1 : i32> +#map = affine_map<(d0) -> (d0 * 64)> +#map1 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)> +#map2 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)> +#map3 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)> +#map4 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)> +module { + func.func @block_argument_of_scf_forall_offset(%arg0: !amdaie.logicalobjectfifo>, %arg1: !amdaie.logicalobjectfifo>, %arg2: !amdaie.logicalobjectfifo>, %arg3: !amdaie.logicalobjectfifo>) { + %c3 = arith.constant 3 : index + %c2 = arith.constant 2 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index + %alloc = memref.alloc() : memref<2x1x32x32xi32, 1 : i32> + %alloc_0 = memref.alloc() : memref<1x2x32x32xi32, 1 : i32> + %alloc_1 = memref.alloc() : memref<2x2x32x32xi32, 1 : i32> + %alloc_2 = memref.alloc() : memref<128x128xi32> + %alloc_3 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32> + %tile = amdaie.tile(%c1, %c3) + %tile_4 = amdaie.tile(%c0, %c2) + %0 = amdaie.logicalobjectfifo.from_memref %alloc_1, {%tile} : memref<2x2x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo> + %1 = amdaie.logicalobjectfifo.from_memref %alloc_2, {%tile} : memref<128x128xi32> -> !amdaie.logicalobjectfifo> + scf.forall (%arg4, %arg5) in (2, 2) { + %4 = amdaie.dma_cpy_nd(%0[0, 0, 0, 0] [2, 2, 32, 32] [2048, 1024, 32, 1], %1[0, 0, %arg4, %arg5] [2, 2, 32, 32] [4096, 32, 128, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %tile_5 = amdaie.tile(%c1, %c3) + %5 = amdaie.logicalobjectfifo.from_memref %alloc, {%tile} : memref<2x1x32x32xi32, 1 : i32> -> 
!amdaie.logicalobjectfifo> + %6 = amdaie.logicalobjectfifo.from_memref %alloc_0, {%tile} : memref<1x2x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo> + %7 = amdaie.dma_cpy_nd(%arg0[0, 0, 0, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1024, 1024, 256, 32, 8, 1], %5[1, 0, 0, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1024, 1024, 8, 128, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %8 = amdaie.dma_cpy_nd(%arg1[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [1024, 1024, 128, 32, 4, 1], %6[0, 1, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [2048, 1024, 4, 256, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %9 = amdaie.logicalobjectfifo.from_memref %alloc_3, {%tile} : memref<1x1x8x8x4x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo> + %10 = amdaie.dma_cpy_nd(%9[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1], %0[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %11 = amdaie.dma_cpy_nd(%arg3[1, 1, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1], %arg2[0, 0, 0, 0] [8, 4, 8, 4] [16, 4, 128, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %12 = amdaie.core(%tile_5, in : [%7, %8, %10], out : [%11]) { + %16 = amdaie.logicalobjectfifo.access(%arg0, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x4x8x4x8xi32, 2 : i32> + %17 = amdaie.logicalobjectfifo.access(%arg1, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x8x4x8x4xi32, 2 : i32> + %18 = amdaie.logicalobjectfifo.access(%arg2, None) : !amdaie.logicalobjectfifo> -> memref<1x1x8x8x4x4xi32, 2 : i32> + linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%16, %17 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%18 : memref<1x1x8x8x4x4xi32, 2 : i32>) { + ^bb0(%in: i32, %in_6: i32, %out: i32): + %21 = arith.muli %in, %in_6 : i32 + %22 = arith.addi %out, %21 : i32 + 
linalg.yield %22 : i32 + } + %19 = amdaie.logicalobjectfifo.access(%arg2, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x8x8x4x4xi32, 2 : i32> + %20 = amdaie.logicalobjectfifo.access(%arg2, Write) : !amdaie.logicalobjectfifo> -> memref<1x1x8x8x4x4xi32, 2 : i32> + linalg.generic {indexing_maps = [#map4, #map4, #map4], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%18, %19 : memref<1x1x8x8x4x4xi32, 2 : i32>, memref<1x1x8x8x4x4xi32, 2 : i32>) outs(%20 : memref<1x1x8x8x4x4xi32, 2 : i32>) { + ^bb0(%in: i32, %in_6: i32, %out: i32): + %21 = arith.addi %in, %in_6 : i32 + linalg.yield %21 : i32 + } + amdaie.end + } + %13 = amdaie.logicalobjectfifo.from_memref %alloc_3, {%tile_4} : memref<1x1x8x8x4x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo> + %14 = amdaie.dma_cpy_nd(%13[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1], %0[1, 1, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %15 = amdaie.core(%tile_4, in : [%7, %8, %14], out : [%11]) { + %16 = amdaie.logicalobjectfifo.access(%arg0, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x4x8x4x8xi32, 2 : i32> + %17 = amdaie.logicalobjectfifo.access(%arg1, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x8x4x8x4xi32, 2 : i32> + %18 = amdaie.logicalobjectfifo.access(%arg2, None) : !amdaie.logicalobjectfifo> -> memref<1x1x8x8x4x4xi32, 2 : i32> + linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%16, %17 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%18 : memref<1x1x8x8x4x4xi32, 2 : i32>) { + ^bb0(%in: i32, %in_6: i32, %out: i32): + %21 = arith.muli %in, %in_6 : i32 + %22 = arith.addi %out, %21 : i32 + linalg.yield %22 : i32 + } + %19 = amdaie.logicalobjectfifo.access(%arg2, Read) : !amdaie.logicalobjectfifo> -> 
memref<1x1x8x8x4x4xi32, 2 : i32> + %20 = amdaie.logicalobjectfifo.access(%arg2, Write) : !amdaie.logicalobjectfifo> -> memref<1x1x8x8x4x4xi32, 2 : i32> + linalg.generic {indexing_maps = [#map4, #map4, #map4], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%18, %19 : memref<1x1x8x8x4x4xi32, 2 : i32>, memref<1x1x8x8x4x4xi32, 2 : i32>) outs(%20 : memref<1x1x8x8x4x4xi32, 2 : i32>) { + ^bb0(%in: i32, %in_6: i32, %out: i32): + %21 = arith.addi %in, %in_6 : i32 + linalg.yield %21 : i32 + } + amdaie.end + } + } {mapping = [#gpu.block, #gpu.block]} + memref.dealloc %alloc : memref<2x1x32x32xi32, 1 : i32> + memref.dealloc %alloc_3 : memref<1x1x8x8x4x4xi32, 2 : i32> + memref.dealloc %alloc_0 : memref<1x2x32x32xi32, 1 : i32> + memref.dealloc %alloc_1 : memref<2x2x32x32xi32, 1 : i32> + memref.dealloc %alloc_2 : memref<128x128xi32> + return + } +} + +// ----- + +// Since the L2->L1 DmaOps have different source logicalobjectfifos, splitting won't take place. 
+ +// CHECK-LABEL: @different_logical_objectfifo +// CHECK-NOT: memref<1x1x32x32xi32, 1 : i32> +#map = affine_map<(d0) -> (d0 * 64)> +#map1 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)> +#map2 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)> +#map3 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)> +#map4 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)> +module { + func.func @different_logical_objectfifo(%arg0: !amdaie.logicalobjectfifo>, %arg1: !amdaie.logicalobjectfifo>, %arg2: !amdaie.logicalobjectfifo>, %arg3: !amdaie.logicalobjectfifo>) { + %c3 = arith.constant 3 : index + %c2 = arith.constant 2 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index + %alloc = memref.alloc() : memref<2x1x32x32xi32, 1 : i32> + %alloc_0 = memref.alloc() : memref<1x2x32x32xi32, 1 : i32> + %alloc_1 = memref.alloc() : memref<2x2x32x32xi32, 1 : i32> + %alloc_2 = memref.alloc() : memref<128x128xi32> + %alloc_3 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32> + %tile = amdaie.tile(%c1, %c3) + %tile_4 = amdaie.tile(%c0, %c2) + %0 = amdaie.logicalobjectfifo.from_memref %alloc_1, {%tile} : memref<2x2x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo> + %00 = amdaie.logicalobjectfifo.from_memref %alloc_1, {%tile} : memref<2x2x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo> + %1 = amdaie.logicalobjectfifo.from_memref %alloc_2, {%tile} : memref<128x128xi32> -> !amdaie.logicalobjectfifo> + scf.forall (%arg4, %arg5) in (2, 2) { + %2 = affine.apply #map(%arg5) + %3 = affine.apply #map(%arg4) + %4 = amdaie.dma_cpy_nd(%0[0, 0, 0, 0] [2, 2, 32, 32] [2048, 1024, 32, 1], %1[0, 0, %3, %2] [2, 2, 32, 32] [4096, 32, 128, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %tile_5 = amdaie.tile(%c1, %c3) + %5 = amdaie.logicalobjectfifo.from_memref %alloc, {%tile} : memref<2x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo> + %6 = 
amdaie.logicalobjectfifo.from_memref %alloc_0, {%tile} : memref<1x2x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo> + %7 = amdaie.dma_cpy_nd(%arg0[0, 0, 0, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1024, 1024, 256, 32, 8, 1], %5[1, 0, 0, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1024, 1024, 8, 128, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %8 = amdaie.dma_cpy_nd(%arg1[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [1024, 1024, 128, 32, 4, 1], %6[0, 1, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [2048, 1024, 4, 256, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %9 = amdaie.logicalobjectfifo.from_memref %alloc_3, {%tile} : memref<1x1x8x8x4x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo> + %10 = amdaie.dma_cpy_nd(%9[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1], %0[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %11 = amdaie.dma_cpy_nd(%arg3[1, 1, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1], %arg2[0, 0, 0, 0] [8, 4, 8, 4] [16, 4, 128, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %12 = amdaie.core(%tile_5, in : [%7, %8, %10], out : [%11]) { + %16 = amdaie.logicalobjectfifo.access(%arg0, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x4x8x4x8xi32, 2 : i32> + %17 = amdaie.logicalobjectfifo.access(%arg1, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x8x4x8x4xi32, 2 : i32> + %18 = amdaie.logicalobjectfifo.access(%arg2, None) : !amdaie.logicalobjectfifo> -> memref<1x1x8x8x4x4xi32, 2 : i32> + linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%16, %17 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%18 : memref<1x1x8x8x4x4xi32, 2 : i32>) { + ^bb0(%in: i32, %in_6: i32, %out: i32): + %21 = arith.muli %in, %in_6 : i32 + %22 = arith.addi %out, %21 : i32 + linalg.yield %22 : i32 + } + %19 = 
amdaie.logicalobjectfifo.access(%arg2, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x8x8x4x4xi32, 2 : i32> + %20 = amdaie.logicalobjectfifo.access(%arg2, Write) : !amdaie.logicalobjectfifo> -> memref<1x1x8x8x4x4xi32, 2 : i32> + linalg.generic {indexing_maps = [#map4, #map4, #map4], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%18, %19 : memref<1x1x8x8x4x4xi32, 2 : i32>, memref<1x1x8x8x4x4xi32, 2 : i32>) outs(%20 : memref<1x1x8x8x4x4xi32, 2 : i32>) { + ^bb0(%in: i32, %in_6: i32, %out: i32): + %21 = arith.addi %in, %in_6 : i32 + linalg.yield %21 : i32 + } + amdaie.end + } + %13 = amdaie.logicalobjectfifo.from_memref %alloc_3, {%tile_4} : memref<1x1x8x8x4x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo> + %14 = amdaie.dma_cpy_nd(%13[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1], %00[1, 1, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %15 = amdaie.core(%tile_4, in : [%7, %8, %14], out : [%11]) { + %16 = amdaie.logicalobjectfifo.access(%arg0, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x4x8x4x8xi32, 2 : i32> + %17 = amdaie.logicalobjectfifo.access(%arg1, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x8x4x8x4xi32, 2 : i32> + %18 = amdaie.logicalobjectfifo.access(%arg2, None) : !amdaie.logicalobjectfifo> -> memref<1x1x8x8x4x4xi32, 2 : i32> + linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%16, %17 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%18 : memref<1x1x8x8x4x4xi32, 2 : i32>) { + ^bb0(%in: i32, %in_6: i32, %out: i32): + %21 = arith.muli %in, %in_6 : i32 + %22 = arith.addi %out, %21 : i32 + linalg.yield %22 : i32 + } + %19 = amdaie.logicalobjectfifo.access(%arg2, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x8x8x4x4xi32, 2 : i32> + %20 = 
amdaie.logicalobjectfifo.access(%arg2, Write) : !amdaie.logicalobjectfifo> -> memref<1x1x8x8x4x4xi32, 2 : i32> + linalg.generic {indexing_maps = [#map4, #map4, #map4], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%18, %19 : memref<1x1x8x8x4x4xi32, 2 : i32>, memref<1x1x8x8x4x4xi32, 2 : i32>) outs(%20 : memref<1x1x8x8x4x4xi32, 2 : i32>) { + ^bb0(%in: i32, %in_6: i32, %out: i32): + %21 = arith.addi %in, %in_6 : i32 + linalg.yield %21 : i32 + } + amdaie.end + } + } {mapping = [#gpu.block, #gpu.block]} + memref.dealloc %alloc : memref<2x1x32x32xi32, 1 : i32> + memref.dealloc %alloc_3 : memref<1x1x8x8x4x4xi32, 2 : i32> + memref.dealloc %alloc_0 : memref<1x2x32x32xi32, 1 : i32> + memref.dealloc %alloc_1 : memref<2x2x32x32xi32, 1 : i32> + memref.dealloc %alloc_2 : memref<128x128xi32> + return + } +} + +// ----- + +// L3's source offset is computed as: L2 source offset * L2 target size at the given dim. +// Since the L2 source offset in this test is not a constant, that computation cannot be done and no splitting takes place. 
+ +// CHECK-LABEL: @non_constant_source_l2_offset +// CHECK-NOT: memref<1x1x32x32xi32, 1 : i32> +#map = affine_map<(d0) -> (d0 * 64)> +#map1 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)> +#map2 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)> +#map3 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)> +#map4 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)> +module { + func.func @non_constant_source_l2_offset(%cst_offset: index, %arg0: !amdaie.logicalobjectfifo>, %arg1: !amdaie.logicalobjectfifo>, %arg2: !amdaie.logicalobjectfifo>, %arg3: !amdaie.logicalobjectfifo>) { + %c3 = arith.constant 3 : index + %c2 = arith.constant 2 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index + %alloc = memref.alloc() : memref<2x1x32x32xi32, 1 : i32> + %alloc_0 = memref.alloc() : memref<1x2x32x32xi32, 1 : i32> + %alloc_1 = memref.alloc() : memref<2x2x32x32xi32, 1 : i32> + %alloc_2 = memref.alloc() : memref<128x128xi32> + %alloc_3 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32> + %tile = amdaie.tile(%c1, %c3) + %tile_4 = amdaie.tile(%c0, %c2) + %0 = amdaie.logicalobjectfifo.from_memref %alloc_1, {%tile} : memref<2x2x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo> + %1 = amdaie.logicalobjectfifo.from_memref %alloc_2, {%tile} : memref<128x128xi32> -> !amdaie.logicalobjectfifo> + scf.forall (%arg4, %arg5) in (2, 2) { + %2 = affine.apply #map(%arg5) + %3 = affine.apply #map(%arg4) + %4 = amdaie.dma_cpy_nd(%0[0, 0, 0, 0] [2, 2, 32, 32] [2048, 1024, 32, 1], %1[0, 0, %3, %2] [2, 2, 32, 32] [4096, 32, 128, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %tile_5 = amdaie.tile(%c1, %c3) + %5 = amdaie.logicalobjectfifo.from_memref %alloc, {%tile} : memref<2x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo> + %6 = amdaie.logicalobjectfifo.from_memref %alloc_0, {%tile} : memref<1x2x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo> + %7 = 
amdaie.dma_cpy_nd(%arg0[0, 0, 0, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1024, 1024, 256, 32, 8, 1], %5[1, 0, 0, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1024, 1024, 8, 128, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %8 = amdaie.dma_cpy_nd(%arg1[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [1024, 1024, 128, 32, 4, 1], %6[0, 1, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [2048, 1024, 4, 256, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %9 = amdaie.logicalobjectfifo.from_memref %alloc_3, {%tile} : memref<1x1x8x8x4x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo> + %10 = amdaie.dma_cpy_nd(%9[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1], %0[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %11 = amdaie.dma_cpy_nd(%arg3[1, 1, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1], %arg2[0, 0, 0, 0] [8, 4, 8, 4] [16, 4, 128, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %12 = amdaie.core(%tile_5, in : [%7, %8, %10], out : [%11]) { + %16 = amdaie.logicalobjectfifo.access(%arg0, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x4x8x4x8xi32, 2 : i32> + %17 = amdaie.logicalobjectfifo.access(%arg1, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x8x4x8x4xi32, 2 : i32> + %18 = amdaie.logicalobjectfifo.access(%arg2, None) : !amdaie.logicalobjectfifo> -> memref<1x1x8x8x4x4xi32, 2 : i32> + linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%16, %17 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%18 : memref<1x1x8x8x4x4xi32, 2 : i32>) { + ^bb0(%in: i32, %in_6: i32, %out: i32): + %21 = arith.muli %in, %in_6 : i32 + %22 = arith.addi %out, %21 : i32 + linalg.yield %22 : i32 + } + %19 = amdaie.logicalobjectfifo.access(%arg2, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x8x8x4x4xi32, 2 : i32> + %20 = 
amdaie.logicalobjectfifo.access(%arg2, Write) : !amdaie.logicalobjectfifo> -> memref<1x1x8x8x4x4xi32, 2 : i32> + linalg.generic {indexing_maps = [#map4, #map4, #map4], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%18, %19 : memref<1x1x8x8x4x4xi32, 2 : i32>, memref<1x1x8x8x4x4xi32, 2 : i32>) outs(%20 : memref<1x1x8x8x4x4xi32, 2 : i32>) { + ^bb0(%in: i32, %in_6: i32, %out: i32): + %21 = arith.addi %in, %in_6 : i32 + linalg.yield %21 : i32 + } + amdaie.end + } + %13 = amdaie.logicalobjectfifo.from_memref %alloc_3, {%tile_4} : memref<1x1x8x8x4x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo> + %14 = amdaie.dma_cpy_nd(%13[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1], %0[1, %cst_offset, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %15 = amdaie.core(%tile_4, in : [%7, %8, %14], out : [%11]) { + %16 = amdaie.logicalobjectfifo.access(%arg0, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x4x8x4x8xi32, 2 : i32> + %17 = amdaie.logicalobjectfifo.access(%arg1, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x8x4x8x4xi32, 2 : i32> + %18 = amdaie.logicalobjectfifo.access(%arg2, None) : !amdaie.logicalobjectfifo> -> memref<1x1x8x8x4x4xi32, 2 : i32> + linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%16, %17 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%18 : memref<1x1x8x8x4x4xi32, 2 : i32>) { + ^bb0(%in: i32, %in_6: i32, %out: i32): + %21 = arith.muli %in, %in_6 : i32 + %22 = arith.addi %out, %21 : i32 + linalg.yield %22 : i32 + } + %19 = amdaie.logicalobjectfifo.access(%arg2, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x8x8x4x4xi32, 2 : i32> + %20 = amdaie.logicalobjectfifo.access(%arg2, Write) : !amdaie.logicalobjectfifo> -> memref<1x1x8x8x4x4xi32, 2 : i32> + 
linalg.generic {indexing_maps = [#map4, #map4, #map4], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%18, %19 : memref<1x1x8x8x4x4xi32, 2 : i32>, memref<1x1x8x8x4x4xi32, 2 : i32>) outs(%20 : memref<1x1x8x8x4x4xi32, 2 : i32>) { + ^bb0(%in: i32, %in_6: i32, %out: i32): + %21 = arith.addi %in, %in_6 : i32 + linalg.yield %21 : i32 + } + amdaie.end + } + } {mapping = [#gpu.block, #gpu.block]} + memref.dealloc %alloc : memref<2x1x32x32xi32, 1 : i32> + memref.dealloc %alloc_3 : memref<1x1x8x8x4x4xi32, 2 : i32> + memref.dealloc %alloc_0 : memref<1x2x32x32xi32, 1 : i32> + memref.dealloc %alloc_1 : memref<2x2x32x32xi32, 1 : i32> + memref.dealloc %alloc_2 : memref<128x128xi32> + return + } +} + +// ----- + +// L3's source offset is computed as: L2 source offset * L2 target size at the given dim. +// Since the L2 target size in this test is not a constant, that computation cannot be done and no splitting takes place. + +// CHECK-LABEL: @non_constant_target_l2_size +// CHECK-NOT: memref<1x1x32x32xi32, 1 : i32> +#map = affine_map<(d0) -> (d0 * 64)> +#map1 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)> +#map2 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)> +#map3 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)> +#map4 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)> +module { + func.func @non_constant_target_l2_size(%cst_offset: index, %arg0: !amdaie.logicalobjectfifo>, %arg1: !amdaie.logicalobjectfifo>, %arg2: !amdaie.logicalobjectfifo>, %arg3: !amdaie.logicalobjectfifo>) { + %c3 = arith.constant 3 : index + %c2 = arith.constant 2 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index + %alloc = memref.alloc() : memref<2x1x32x32xi32, 1 : i32> + %alloc_0 = memref.alloc() : memref<1x2x32x32xi32, 1 : i32> + %alloc_1 = memref.alloc() : memref<2x2x32x32xi32, 1 : i32> + %alloc_2 = memref.alloc() : memref<128x128xi32> + 
%alloc_3 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32> + %tile = amdaie.tile(%c1, %c3) + %tile_4 = amdaie.tile(%c0, %c2) + %0 = amdaie.logicalobjectfifo.from_memref %alloc_1, {%tile} : memref<2x2x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo> + %1 = amdaie.logicalobjectfifo.from_memref %alloc_2, {%tile} : memref<128x128xi32> -> !amdaie.logicalobjectfifo> + scf.forall (%arg4, %arg5) in (2, 2) { + %2 = affine.apply #map(%arg5) + %3 = affine.apply #map(%arg4) + %4 = amdaie.dma_cpy_nd(%0[0, 0, 0, 0] [2, 2, %cst_offset, 32] [2048, 1024, 32, 1], %1[0, 0, %3, %2] [2, 2, 32, 32] [4096, 32, 128, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %tile_5 = amdaie.tile(%c1, %c3) + %5 = amdaie.logicalobjectfifo.from_memref %alloc, {%tile} : memref<2x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo> + %6 = amdaie.logicalobjectfifo.from_memref %alloc_0, {%tile} : memref<1x2x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo> + %7 = amdaie.dma_cpy_nd(%arg0[0, 0, 0, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1024, 1024, 256, 32, 8, 1], %5[1, 0, 0, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1024, 1024, 8, 128, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %8 = amdaie.dma_cpy_nd(%arg1[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [1024, 1024, 128, 32, 4, 1], %6[0, 1, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [2048, 1024, 4, 256, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %9 = amdaie.logicalobjectfifo.from_memref %alloc_3, {%tile} : memref<1x1x8x8x4x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo> + %10 = amdaie.dma_cpy_nd(%9[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1], %0[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %11 = amdaie.dma_cpy_nd(%arg3[1, 1, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1], %arg2[0, 0, 0, 0] [8, 4, 8, 4] [16, 4, 128, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %12 = amdaie.core(%tile_5, in : [%7, %8, %10], out : [%11]) { 
+ %16 = amdaie.logicalobjectfifo.access(%arg0, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x4x8x4x8xi32, 2 : i32> + %17 = amdaie.logicalobjectfifo.access(%arg1, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x8x4x8x4xi32, 2 : i32> + %18 = amdaie.logicalobjectfifo.access(%arg2, None) : !amdaie.logicalobjectfifo> -> memref<1x1x8x8x4x4xi32, 2 : i32> + linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%16, %17 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%18 : memref<1x1x8x8x4x4xi32, 2 : i32>) { + ^bb0(%in: i32, %in_6: i32, %out: i32): + %21 = arith.muli %in, %in_6 : i32 + %22 = arith.addi %out, %21 : i32 + linalg.yield %22 : i32 + } + %19 = amdaie.logicalobjectfifo.access(%arg2, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x8x8x4x4xi32, 2 : i32> + %20 = amdaie.logicalobjectfifo.access(%arg2, Write) : !amdaie.logicalobjectfifo> -> memref<1x1x8x8x4x4xi32, 2 : i32> + linalg.generic {indexing_maps = [#map4, #map4, #map4], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%18, %19 : memref<1x1x8x8x4x4xi32, 2 : i32>, memref<1x1x8x8x4x4xi32, 2 : i32>) outs(%20 : memref<1x1x8x8x4x4xi32, 2 : i32>) { + ^bb0(%in: i32, %in_6: i32, %out: i32): + %21 = arith.addi %in, %in_6 : i32 + linalg.yield %21 : i32 + } + amdaie.end + } + %13 = amdaie.logicalobjectfifo.from_memref %alloc_3, {%tile_4} : memref<1x1x8x8x4x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo> + %14 = amdaie.dma_cpy_nd(%13[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1], %0[1, 1, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %15 = amdaie.core(%tile_4, in : [%7, %8, %14], out : [%11]) { + %16 = amdaie.logicalobjectfifo.access(%arg0, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x4x8x4x8xi32, 2 : i32> + %17 = 
amdaie.logicalobjectfifo.access(%arg1, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x8x4x8x4xi32, 2 : i32> + %18 = amdaie.logicalobjectfifo.access(%arg2, None) : !amdaie.logicalobjectfifo> -> memref<1x1x8x8x4x4xi32, 2 : i32> + linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%16, %17 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%18 : memref<1x1x8x8x4x4xi32, 2 : i32>) { + ^bb0(%in: i32, %in_6: i32, %out: i32): + %21 = arith.muli %in, %in_6 : i32 + %22 = arith.addi %out, %21 : i32 + linalg.yield %22 : i32 + } + %19 = amdaie.logicalobjectfifo.access(%arg2, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x8x8x4x4xi32, 2 : i32> + %20 = amdaie.logicalobjectfifo.access(%arg2, Write) : !amdaie.logicalobjectfifo> -> memref<1x1x8x8x4x4xi32, 2 : i32> + linalg.generic {indexing_maps = [#map4, #map4, #map4], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%18, %19 : memref<1x1x8x8x4x4xi32, 2 : i32>, memref<1x1x8x8x4x4xi32, 2 : i32>) outs(%20 : memref<1x1x8x8x4x4xi32, 2 : i32>) { + ^bb0(%in: i32, %in_6: i32, %out: i32): + %21 = arith.addi %in, %in_6 : i32 + linalg.yield %21 : i32 + } + amdaie.end + } + } {mapping = [#gpu.block, #gpu.block]} + memref.dealloc %alloc : memref<2x1x32x32xi32, 1 : i32> + memref.dealloc %alloc_3 : memref<1x1x8x8x4x4xi32, 2 : i32> + memref.dealloc %alloc_0 : memref<1x2x32x32xi32, 1 : i32> + memref.dealloc %alloc_1 : memref<2x2x32x32xi32, 1 : i32> + memref.dealloc %alloc_2 : memref<128x128xi32> + return + } +} + +// ----- + +#map = affine_map<(d0) -> (d0 * 64)> +#map1 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)> +#map2 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)> +#map3 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)> +#map4 = affine_map<(d0, 
d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)> +module { + func.func @unhadled_offset_expression(%arg0: !amdaie.logicalobjectfifo>, %arg1: !amdaie.logicalobjectfifo>, %arg2: !amdaie.logicalobjectfifo>, %arg3: !amdaie.logicalobjectfifo>) { + %c3 = arith.constant 3 : index + %c2 = arith.constant 2 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index + %2 = arith.addi %c0, %c2 : index + %3 = arith.addi %c0, %c3 : index + %alloc = memref.alloc() : memref<2x1x32x32xi32, 1 : i32> + %alloc_0 = memref.alloc() : memref<1x2x32x32xi32, 1 : i32> + %alloc_1 = memref.alloc() : memref<2x2x32x32xi32, 1 : i32> + %alloc_2 = memref.alloc() : memref<128x128xi32> + %alloc_3 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32> + %tile = amdaie.tile(%c1, %c3) + %tile_4 = amdaie.tile(%c0, %c2) + %0 = amdaie.logicalobjectfifo.from_memref %alloc_1, {%tile} : memref<2x2x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo> + %1 = amdaie.logicalobjectfifo.from_memref %alloc_2, {%tile} : memref<128x128xi32> -> !amdaie.logicalobjectfifo> + scf.forall (%arg4, %arg5) in (2, 2) { + // expected-error @below {{Unhandled expression for source offset at dim 2}} + %4 = amdaie.dma_cpy_nd(%0[0, 0, 0, 0] [2, 2, 32, 32] [2048, 1024, 32, 1], %1[0, 0, %3, %2] [2, 2, 32, 32] [4096, 32, 128, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %tile_5 = amdaie.tile(%c1, %c3) + %5 = amdaie.logicalobjectfifo.from_memref %alloc, {%tile} : memref<2x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo> + %6 = amdaie.logicalobjectfifo.from_memref %alloc_0, {%tile} : memref<1x2x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo> + %7 = amdaie.dma_cpy_nd(%arg0[0, 0, 0, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1024, 1024, 256, 32, 8, 1], %5[1, 0, 0, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1024, 1024, 8, 128, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %8 = amdaie.dma_cpy_nd(%arg1[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [1024, 1024, 128, 32, 4, 1], %6[0, 1, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [2048, 
1024, 4, 256, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %9 = amdaie.logicalobjectfifo.from_memref %alloc_3, {%tile} : memref<1x1x8x8x4x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo> + %10 = amdaie.dma_cpy_nd(%9[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1], %0[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %11 = amdaie.dma_cpy_nd(%arg3[1, 1, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1], %arg2[0, 0, 0, 0] [8, 4, 8, 4] [16, 4, 128, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %12 = amdaie.core(%tile_5, in : [%7, %8, %10], out : [%11]) { + %16 = amdaie.logicalobjectfifo.access(%arg0, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x4x8x4x8xi32, 2 : i32> + %17 = amdaie.logicalobjectfifo.access(%arg1, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x8x4x8x4xi32, 2 : i32> + %18 = amdaie.logicalobjectfifo.access(%arg2, None) : !amdaie.logicalobjectfifo> -> memref<1x1x8x8x4x4xi32, 2 : i32> + linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%16, %17 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%18 : memref<1x1x8x8x4x4xi32, 2 : i32>) { + ^bb0(%in: i32, %in_6: i32, %out: i32): + %21 = arith.muli %in, %in_6 : i32 + %22 = arith.addi %out, %21 : i32 + linalg.yield %22 : i32 + } + %19 = amdaie.logicalobjectfifo.access(%arg2, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x8x8x4x4xi32, 2 : i32> + %20 = amdaie.logicalobjectfifo.access(%arg2, Write) : !amdaie.logicalobjectfifo> -> memref<1x1x8x8x4x4xi32, 2 : i32> + linalg.generic {indexing_maps = [#map4, #map4, #map4], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%18, %19 : memref<1x1x8x8x4x4xi32, 2 : i32>, memref<1x1x8x8x4x4xi32, 2 : i32>) outs(%20 : memref<1x1x8x8x4x4xi32, 2 
: i32>) { + ^bb0(%in: i32, %in_6: i32, %out: i32): + %21 = arith.addi %in, %in_6 : i32 + linalg.yield %21 : i32 + } + amdaie.end + } + %13 = amdaie.logicalobjectfifo.from_memref %alloc_3, {%tile_4} : memref<1x1x8x8x4x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo> + %14 = amdaie.dma_cpy_nd(%13[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1], %0[1, 1, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %15 = amdaie.core(%tile_4, in : [%7, %8, %14], out : [%11]) { + %16 = amdaie.logicalobjectfifo.access(%arg0, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x4x8x4x8xi32, 2 : i32> + %17 = amdaie.logicalobjectfifo.access(%arg1, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x8x4x8x4xi32, 2 : i32> + %18 = amdaie.logicalobjectfifo.access(%arg2, None) : !amdaie.logicalobjectfifo> -> memref<1x1x8x8x4x4xi32, 2 : i32> + linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%16, %17 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%18 : memref<1x1x8x8x4x4xi32, 2 : i32>) { + ^bb0(%in: i32, %in_6: i32, %out: i32): + %21 = arith.muli %in, %in_6 : i32 + %22 = arith.addi %out, %21 : i32 + linalg.yield %22 : i32 + } + %19 = amdaie.logicalobjectfifo.access(%arg2, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x8x8x4x4xi32, 2 : i32> + %20 = amdaie.logicalobjectfifo.access(%arg2, Write) : !amdaie.logicalobjectfifo> -> memref<1x1x8x8x4x4xi32, 2 : i32> + linalg.generic {indexing_maps = [#map4, #map4, #map4], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%18, %19 : memref<1x1x8x8x4x4xi32, 2 : i32>, memref<1x1x8x8x4x4xi32, 2 : i32>) outs(%20 : memref<1x1x8x8x4x4xi32, 2 : i32>) { + ^bb0(%in: i32, %in_6: i32, %out: i32): + %21 = arith.addi %in, %in_6 : i32 + linalg.yield %21 : i32 + } + amdaie.end + 
} + } {mapping = [#gpu.block, #gpu.block]} + memref.dealloc %alloc : memref<2x1x32x32xi32, 1 : i32> + memref.dealloc %alloc_3 : memref<1x1x8x8x4x4xi32, 2 : i32> + memref.dealloc %alloc_0 : memref<1x2x32x32xi32, 1 : i32> + memref.dealloc %alloc_1 : memref<2x2x32x32xi32, 1 : i32> + memref.dealloc %alloc_2 : memref<128x128xi32> + return + } +}