diff --git a/build_tools/build_test_cpp.ps1 b/build_tools/build_test_cpp.ps1
index f190dc192..294fee6cf 100644
--- a/build_tools/build_test_cpp.ps1
+++ b/build_tools/build_test_cpp.ps1
@@ -58,7 +58,7 @@ echo "Building IREE"
 $CMAKE_ARGS = @(
   "-GNinja"
-  "-DCMAKE_BUILD_TYPE=Release"
+  "-DCMAKE_BUILD_TYPE=Debug"
   "-DCMAKE_INSTALL_PREFIX=$install_dir"
   "-DCMAKE_INSTALL_LIBDIR=lib"
   "-DCMAKE_EXE_LINKER_FLAGS_INIT=-fuse-ld=lld"
diff --git a/build_tools/build_test_cpp.sh b/build_tools/build_test_cpp.sh
index 1b0b1ac28..7bd9e96b4 100644
--- a/build_tools/build_test_cpp.sh
+++ b/build_tools/build_test_cpp.sh
@@ -63,7 +63,7 @@ echo '{
 cd $iree_dir
 CMAKE_ARGS="\
 -GNinja \
--DCMAKE_BUILD_TYPE=Release \
+-DCMAKE_BUILD_TYPE=Debug \
 -DCMAKE_INSTALL_PREFIX=$install_dir \
 -DCMAKE_INSTALL_LIBDIR=lib \
 -DIREE_ERROR_ON_MISSING_SUBMODULES=OFF \
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIECombineLogicalObjFifosForConnectionReuse.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIECombineLogicalObjFifosForConnectionReuse.cpp
new file mode 100644
index 000000000..d7c00b260
--- /dev/null
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIECombineLogicalObjFifosForConnectionReuse.cpp
@@ -0,0 +1,52 @@
+// Copyright 2024 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#include "iree-amd-aie/IR/AMDAIEOps.h"
+#include "iree-amd-aie/Transforms/AMDAIELogicalObjFifoSplittingUtils.h"
+#include "iree-amd-aie/Transforms/Passes.h"
+#include "mlir/IR/Iterators.h"
+#include "mlir/Pass/Pass.h"
+
+#define DEBUG_TYPE \
+  "iree-amdaie-combine-logical-objectfifos-for-connection-reuse"
+
+namespace mlir::iree_compiler::AMDAIE {
+
+namespace {
+
+class AMDAIECombineLogicalObjFifosForConnectionReusePass
+    : public impl::AMDAIECombineLogicalObjFifosForConnectionReuseBase<
+          AMDAIECombineLogicalObjFifosForConnectionReusePass> {
+ public:
+  using AMDAIECombineLogicalObjFifosForConnectionReuseBase::
+      AMDAIECombineLogicalObjFifosForConnectionReuseBase;
+
+  void getDependentDialects(DialectRegistry &registry) const override {
+    registry.insert<AMDAIEDialect>();
+  }
+  void runOnOperation() override;
+};
+
+void AMDAIECombineLogicalObjFifosForConnectionReusePass::runOnOperation() {
+  ModuleOp moduleOp = getOperation();
+  MLIRContext *context = &getContext();
+  IRRewriter rewriter(context);
+
+  SmallVector<AMDAIE::DmaCpyNdOp> l2ToL1DmaOps =
+      fetchDmaCpyNdOpsToSplitOrCombine(moduleOp);
+
+  if (failed(combineLogicalObjectFifos(rewriter, l2ToL1DmaOps, context))) {
+    return signalPassFailure();
+  }
+}
+
+}  // namespace
+
+std::unique_ptr<Pass>
+createAMDAIECombineLogicalObjFifosForConnectionReusePass() {
+  return std::make_unique<
+      AMDAIECombineLogicalObjFifosForConnectionReusePass>();
+}
+
+}  // namespace mlir::iree_compiler::AMDAIE
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDmaUtils.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDmaUtils.cpp
index dcba79e2b..2264cbba4 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDmaUtils.cpp
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDmaUtils.cpp
@@ -342,4 +342,17 @@ LogicalResult moveNpuDmaSyncUsersAfterAncestorInSameBlock(
   return success();
 }
 
+/// Utility to fetch the unique CoreOp associated with an L2->L1 Dma op.
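+/// Walks the users of the given DMA op and collects its CoreOp users; the
+/// current implementation asserts that exactly one such user exists, so the
+/// returned optional is always populated when the assertion holds.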
+std::optional<CoreOp> fetchUniqueCoreOp(DmaCpyNdOp &l2ToL1DmaOp) {
+  SmallVector<CoreOp> coreOps;
+  for (Operation *userOp : l2ToL1DmaOp->getUsers()) {
+    if (auto coreOp = dyn_cast<CoreOp>(userOp)) {
+      coreOps.push_back(coreOp);
+    }
+  }
+  assert(coreOps.size() == 1 &&
+         "L2->L1 Dma op expected to have a unique Core op");
+  return coreOps[0];
+}
+
 }  // namespace mlir::iree_compiler::AMDAIE
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDmaUtils.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDmaUtils.h
index f24ed3196..0ae49d249 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDmaUtils.h
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDmaUtils.h
@@ -370,6 +370,9 @@ struct DmaDimConfig {
 LogicalResult moveNpuDmaSyncUsersAfterAncestorInSameBlock(
     RewriterBase &rewriter, Operation *parentOp);
 
+/// Utility to fetch the unique CoreOp associated with an L2->L1 Dma op.
+std::optional<CoreOp> fetchUniqueCoreOp(DmaCpyNdOp &l2ToL1DmaOp);
+
 }  // namespace mlir::iree_compiler::AMDAIE
 
 #endif
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELogicalObjFifoSplittingUtils.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELogicalObjFifoSplittingUtils.cpp
index 420920a6e..7fb818f42 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELogicalObjFifoSplittingUtils.cpp
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELogicalObjFifoSplittingUtils.cpp
@@ -470,4 +470,491 @@ LogicalResult splitLogicalObjectFifos(
   return success();
 }
 
+/// Utility to fetch the "bias" of an offset: the offset itself if it is
+/// constant, else the constant summand of the defining affine.apply's map
+/// (0 if there is none).
+static int64_t fetchOffsetBias(OpFoldResult offsetOpFoldResult) {
+  std::optional<int64_t> offset = getConstantIntValue(offsetOpFoldResult);
+  if (offset) return offset.value();
+  auto offsetVal = cast<Value>(offsetOpFoldResult);
+  auto affineApplyOp =
+      dyn_cast_if_present<affine::AffineApplyOp>(offsetVal.getDefiningOp());
+  if (!affineApplyOp) return 0;
+  AffineMap affineMap = affineApplyOp.getAffineMap();
+  RetrieveScaleAndBias retriever;
+  assert(!failed(retriever.visit(affineMap.getResult(0))) &&
+         "failed to retrieve scale and bias");
+  int64_t bias = 0;
+  if (retriever.bias) {
+    bias = retriever.bias.value();
+  }
+  return bias;
+}
+
+static LogicalResult combineL3ToL2AccessPatterns(
+    RewriterBase &rewriter, const SmallVector<OpFoldResult> &offsetsA,
+    const SmallVector<OpFoldResult> &sizesA,
+    const SmallVector<OpFoldResult> &stridesA,
+    const SmallVector<OpFoldResult> &offsetsB,
+    const SmallVector<OpFoldResult> &sizesB,
+    const SmallVector<OpFoldResult> &stridesB,
+    SmallVector<OpFoldResult> &newOffsets, SmallVector<OpFoldResult> &newSizes,
+    SmallVector<OpFoldResult> &newStrides,
+    SmallVector<unsigned> &combiningDims,
+    SmallVector<unsigned> &nonCombiningDims) {
+  if (offsetsA.empty() && offsetsB.empty()) return success();
+
+  int64_t newSize = 1;
+  for (auto iter : llvm::enumerate(llvm::zip(offsetsA, offsetsB))) {
+    if (iter.index() < combiningDims.size()) continue;
+    const OpFoldResult &offsetA = std::get<0>(iter.value());
+    const OpFoldResult &offsetB = std::get<1>(iter.value());
+    if (offsetA != offsetB) {
+      // Need to check the difference in bias here.
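+      // For example, with offsets `d0 * 64` (bias 0) and `d0 * 64 + 32`
+      // (bias 32) and a constant size of 32, biasB - biasA == sizeA, i.e.
+      // access pattern B starts exactly where A ends, so the two patterns
+      // are contiguous and combinable along this dimension.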
+      int64_t biasA = fetchOffsetBias(offsetA);
+      int64_t biasB = fetchOffsetBias(offsetB);
+      std::optional<int64_t> sizeA = getConstantIntValue(sizesA[iter.index()]);
+      assert(sizeA && "expected a constant integer value for size");
+      if (sizeA != biasB - biasA) return failure();
+      newSize++;
+    }
+  }
+  newSizes[combiningDims.size() - 1] = rewriter.getI64IntegerAttr(newSize);
+  return success();
+}
+
+static FailureOr<LogicalObjectFifoFromMemrefOp> combineL3ToL2Pair(
+    IRRewriter &rewriter, DmaCpyNdOp dmaOpA, DmaCpyNdOp dmaOpB,
+    SmallVector<unsigned> &combiningDims,
+    SmallVector<unsigned> &nonCombiningDims) {
+  OpBuilder::InsertionGuard guard(rewriter);
+  SmallVector<OpFoldResult> sourceOffsetsA = dmaOpA.getSourceMixedOffsets();
+  SmallVector<OpFoldResult> sourceSizesA = dmaOpA.getSourceMixedSizes();
+  SmallVector<OpFoldResult> sourceStridesA = dmaOpA.getSourceMixedStrides();
+  SmallVector<OpFoldResult> sourceOffsetsB = dmaOpB.getSourceMixedOffsets();
+  SmallVector<OpFoldResult> sourceSizesB = dmaOpB.getSourceMixedSizes();
+  SmallVector<OpFoldResult> sourceStridesB = dmaOpB.getSourceMixedStrides();
+
+  SmallVector<OpFoldResult> targetOffsetsA = dmaOpA.getTargetMixedOffsets();
+  SmallVector<OpFoldResult> targetSizesA = dmaOpA.getTargetMixedSizes();
+  SmallVector<OpFoldResult> targetStridesA = dmaOpA.getTargetMixedStrides();
+  SmallVector<OpFoldResult> targetOffsetsB = dmaOpB.getTargetMixedOffsets();
+  SmallVector<OpFoldResult> targetSizesB = dmaOpB.getTargetMixedSizes();
+  SmallVector<OpFoldResult> targetStridesB = dmaOpB.getTargetMixedStrides();
+
+  SmallVector<OpFoldResult> newSourceOffsets = sourceOffsetsA;
+  SmallVector<OpFoldResult> newSourceSizes = sourceSizesA;
+  SmallVector<OpFoldResult> newSourceStrides = sourceStridesA;
+  if (failed(combineL3ToL2AccessPatterns(
+          rewriter, sourceOffsetsA, sourceSizesA, sourceStridesA,
+          sourceOffsetsB, sourceSizesB, sourceStridesB, newSourceOffsets,
+          newSourceSizes, newSourceStrides, combiningDims,
+          nonCombiningDims))) {
+    dmaOpA->emitOpError()
+        << "L3->L2 pair cannot be combined because offsets are not contiguous";
+    return failure();
+  }
+
+  SmallVector<OpFoldResult> newTargetOffsets = targetOffsetsA;
+  SmallVector<OpFoldResult> newTargetSizes = newSourceSizes;
+  SmallVector<OpFoldResult> newTargetStrides = targetStridesA;
+  // Now we need to create a new L2 buffer based on `newTargetSizes`.
+  LogicalObjectFifoFromMemrefOp oldL2ObjectFifo = dmaOpA.getTargetObjectFifo();
+  AMDAIE::LogicalObjectFifoFromMemrefOp newL2ObjectFifo =
+      createNewLogicalObjectFifo(rewriter, oldL2ObjectFifo, newTargetSizes);
+
+  // Create the combined L3->L2 Dma op.
+  rewriter.setInsertionPoint(dmaOpA);
+  auto combinedL3ToL2DmaOp = rewriter.create<AMDAIE::DmaCpyNdOp>(
+      dmaOpA.getLoc(), newL2ObjectFifo, llvm::ArrayRef(newTargetOffsets),
+      llvm::ArrayRef(newTargetSizes), llvm::ArrayRef(newTargetStrides),
+      dmaOpA.getSource(), llvm::ArrayRef(newSourceOffsets),
+      llvm::ArrayRef(newSourceSizes), llvm::ArrayRef(newSourceStrides));
+  // Replace the uses of the 2nd L3->L2 Dma op with the new combined L3->L2
+  // Dma op and erase the 1st one.
+  rewriter.replaceOp(dmaOpB, combinedL3ToL2DmaOp);
+  rewriter.eraseOp(dmaOpA);
+  return newL2ObjectFifo;
+}
+
+/// Utility comparator function that compares two DmaCpyNd ops `a` and `b`.
+/// Returns true if `a`'s source offset is "less" than `b`'s, in the
+/// lexicographic sense: for N-dimensional offsets A and B,
+///   A < B if there exists an index i in [0, N-1] such that
+///       A[i] < B[i] AND A[0..i-1] == B[0..i-1].
+/// For example, A = [0, 0, 32, 0] is "less" than B = [0, 0, 32, 64].
+static bool compareL3ToL2DmaPairOffsets(DmaCpyNdOp &a, DmaCpyNdOp &b) {
+  SmallVector<OpFoldResult> sourceOffsetsA = a.getSourceMixedOffsets();
+  SmallVector<OpFoldResult> sourceSizesA = a.getSourceMixedSizes();
+  SmallVector<OpFoldResult> sourceOffsetsB = b.getSourceMixedOffsets();
+  SmallVector<OpFoldResult> sourceSizesB = b.getSourceMixedSizes();
+  // Assertion checks on the sizes are performed before invoking this
+  // function.
+  for (int64_t i = 0, n = sourceOffsetsA.size(); i < n; i++) {
+    std::optional<int64_t> offsetA = getConstantIntValue(sourceOffsetsA[i]);
+    std::optional<int64_t> offsetB = getConstantIntValue(sourceOffsetsB[i]);
+    if (offsetA && offsetB) {
+      if (offsetA < offsetB) return true;
+      if (offsetA > offsetB) return false;
+      continue;
+    }
+    if (!offsetA && !offsetB) {
+      auto offsetValA = cast<Value>(sourceOffsetsA[i]);
+      auto offsetValB = cast<Value>(sourceOffsetsB[i]);
+      auto affineApplyOpA = dyn_cast_if_present<affine::AffineApplyOp>(
+          offsetValA.getDefiningOp());
+      auto affineApplyOpB = dyn_cast_if_present<affine::AffineApplyOp>(
+          offsetValB.getDefiningOp());
+      // TODO(avarma): This should be handled better. The overall possibility
+      // here already makes this complex enough.
+      assert(affineApplyOpA && "expected affine.apply op");
+      assert(affineApplyOpB && "expected affine.apply op");
+      for (auto &&[valA, valB] :
+           llvm::zip_equal(affineApplyOpA.getMapOperands(),
+                           affineApplyOpB.getMapOperands())) {
+        assert((valA == valB) &&
+               "different base values being operated on between the L3->L2 "
+               "Dma op pair");
+      }
+      AffineMap affineMapA = affineApplyOpA.getAffineMap();
+      AffineMap affineMapB = affineApplyOpB.getAffineMap();
+      RetrieveScaleAndBias retrieverA, retrieverB;
+      assert(!failed(retrieverA.visit(affineMapA.getResult(0))) &&
+             "failed to retrieve scale and bias");
+      assert(!failed(retrieverB.visit(affineMapB.getResult(0))) &&
+             "failed to retrieve scale and bias");
+      int64_t biasA = 0, biasB = 0;
+      if (retrieverA.bias) {
+        biasA = retrieverA.bias.value();
+      }
+      if (retrieverB.bias) {
+        biasB = retrieverB.bias.value();
+      }
+      // TODO(avarma): We should check the scale value as well.
+      if (biasA < biasB) return true;
+      if (biasA > biasB) return false;
+      continue;
+    }
+    assert(false &&
+           "unexpected combination of offset values amongst the L3->L2 Dma "
+           "pair");
+  }
+  return false;
+}
+
+static bool areAccessPatternsCompatibleForCombining(
+    AMDAIE::DmaCpyNdOp &l3ToL2DmaOpA, AMDAIE::DmaCpyNdOp &l3ToL2DmaOpB) {
+  // Sources' access pattern check.
+  SmallVector<OpFoldResult> sourceOffsetsA =
+      l3ToL2DmaOpA.getSourceMixedOffsets();
+  SmallVector<OpFoldResult> sourceSizesA = l3ToL2DmaOpA.getSourceMixedSizes();
+  SmallVector<OpFoldResult> sourceStridesA =
+      l3ToL2DmaOpA.getSourceMixedStrides();
+  SmallVector<OpFoldResult> sourceOffsetsB =
+      l3ToL2DmaOpB.getSourceMixedOffsets();
+  SmallVector<OpFoldResult> sourceSizesB = l3ToL2DmaOpB.getSourceMixedSizes();
+  SmallVector<OpFoldResult> sourceStridesB =
+      l3ToL2DmaOpB.getSourceMixedStrides();
+  if (sourceOffsetsA.size() != sourceOffsetsB.size() ||
+      sourceSizesA.size() != sourceSizesB.size() ||
+      sourceStridesA.size() != sourceStridesB.size() ||
+      sourceOffsetsA.size() != sourceSizesA.size() ||
+      sourceOffsetsA.size() != sourceStridesB.size()) {
+    return false;
+  }
+  // Targets' access pattern check.
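+  // The target sides (the L2 buffers) must match exactly: the pair is only
+  // combinable when both ops write the same window of identically-shaped L2
+  // buffers.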
+  SmallVector<OpFoldResult> targetOffsetsA =
+      l3ToL2DmaOpA.getTargetMixedOffsets();
+  SmallVector<OpFoldResult> targetSizesA = l3ToL2DmaOpA.getTargetMixedSizes();
+  SmallVector<OpFoldResult> targetStridesA =
+      l3ToL2DmaOpA.getTargetMixedStrides();
+  SmallVector<OpFoldResult> targetOffsetsB =
+      l3ToL2DmaOpB.getTargetMixedOffsets();
+  SmallVector<OpFoldResult> targetSizesB = l3ToL2DmaOpB.getTargetMixedSizes();
+  SmallVector<OpFoldResult> targetStridesB =
+      l3ToL2DmaOpB.getTargetMixedStrides();
+  if (targetOffsetsA.size() != targetOffsetsB.size() ||
+      targetSizesA.size() != targetSizesB.size() ||
+      targetStridesA.size() != targetStridesB.size() ||
+      targetOffsetsA.size() != targetSizesA.size() ||
+      targetOffsetsA.size() != targetStridesB.size()) {
+    return false;
+  }
+  // Check that the targets' access pattern values are the same.
+  auto isSameValue = [](SmallVector<OpFoldResult> &accessPatternA,
+                        SmallVector<OpFoldResult> &accessPatternB) -> bool {
+    for (auto [a, b] : llvm::zip_equal(accessPatternA, accessPatternB)) {
+      if (a != b) return false;
+    }
+    return true;
+  };
+  if (isSameValue(targetOffsetsA, targetOffsetsB) &&
+      isSameValue(targetSizesA, targetSizesB) &&
+      isSameValue(targetStridesA, targetStridesB)) {
+    return true;
+  }
+
+  return false;
+}
+
+static LogicalResult fetchCombiningDimensions(
+    SmallVector<AMDAIE::DmaCpyNdOp> &l3ToL2DmaOps,
+    SmallVector<unsigned> &combiningDims,
+    SmallVector<unsigned> &nonCombiningDims) {
+  // Fetch combining/non-combining dimensions. Currently we infer the
+  // combining dimensions as a contiguous leading sequence of dimensions with
+  // offset 0 and size 1.
+  int64_t maxCombiningDimIndex = 0;
+  for (unsigned i = 0, n = l3ToL2DmaOps.size(); i < n; i++) {
+    SmallVector<OpFoldResult> sourceOffsets =
+        l3ToL2DmaOps[i].getSourceMixedOffsets();
+    SmallVector<OpFoldResult> sourceSizes =
+        l3ToL2DmaOps[i].getSourceMixedSizes();
+    unsigned j = 0, m = sourceOffsets.size();
+    // Traverse the i-th L3->L2 Dma op's source offsets/sizes to find the
+    // leading sequence of dimensions with offset 0 and size 1.
+    while (j < m) {
+      std::optional<int64_t> constantOffset =
+          getConstantIntValue(sourceOffsets[j]);
+      if (!constantOffset || constantOffset.value() != 0) {
+        break;
+      }
+      std::optional<int64_t> constantSize =
+          getConstantIntValue(sourceSizes[j]);
+      if (!constantSize || constantSize.value() != 1) {
+        break;
+      }
+      j++;
+    }
+    if (i == 0) {
+      maxCombiningDimIndex = j;
+    } else if (maxCombiningDimIndex != j) {
+      LLVM_DEBUG(llvm::dbgs()
+                 << "incompatible combining dimensions across L3->L2\n");
+      return failure();
+    }
+  }
+  combiningDims.assign(maxCombiningDimIndex, 0);
+  std::iota(combiningDims.begin(), combiningDims.end(), 0);
+  nonCombiningDims.assign(maxCombiningDimIndex, 0);
+  std::iota(nonCombiningDims.begin(), nonCombiningDims.end(),
+            combiningDims.size());
+  return success();
+}
+
+/// Given a vector of L2->L1 Dma ops, combine the corresponding L3->L2 Dma ops
+/// and reuse the L2/L1 buffers.
+/// TODO(avarma): Assign combined tiles while forming the L2/L1 buffers which
+/// we'll reuse.
+LogicalResult combineLogicalObjectFifos(
+    IRRewriter &rewriter, SmallVector<AMDAIE::DmaCpyNdOp> &l2ToL1DmaOps,
+    MLIRContext *context) {
+  if (l2ToL1DmaOps.size() == 0) return success();
+
+  // Fetch the L3->L2 Dma op corresponding to the first L2 buffer as target.
+  SmallVector<AMDAIE::DmaCpyNdOp> l3ToL2DmaOps;
+  FailureOr<AMDAIE::DmaCpyNdOp> maybeL3ToL2DmaOp =
+      fetchL3ToL2DmaCpyNdOp(l2ToL1DmaOps[0]);
+  if (failed(maybeL3ToL2DmaOp)) return failure();
+  l3ToL2DmaOps.push_back(maybeL3ToL2DmaOp.value());
+
+  // Check that all L3 buffers associated with the different L3->L2 Dma ops
+  // are the same.
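+  // While doing so, also collect the L3->L2 Dma op for each remaining L2->L1
+  // Dma op and bail out early on any incompatibility.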
+  for (unsigned i = 1, n = l2ToL1DmaOps.size(); i < n; i++) {
+    maybeL3ToL2DmaOp = fetchL3ToL2DmaCpyNdOp(l2ToL1DmaOps[i]);
+    if (failed(maybeL3ToL2DmaOp)) return failure();
+    l3ToL2DmaOps.push_back(maybeL3ToL2DmaOp.value());
+    if (l3ToL2DmaOps[0].getSourceObjectFifo() !=
+        l3ToL2DmaOps[i].getSourceObjectFifo()) {
+      LLVM_DEBUG(llvm::dbgs()
+                 << "Found different L3 objectFifo for " << l3ToL2DmaOps[0]
+                 << " and " << l3ToL2DmaOps[i] << "\n");
+      return failure();
+    }
+    if (!areAccessPatternsCompatibleForCombining(l3ToL2DmaOps[0],
+                                                 l3ToL2DmaOps[i])) {
+      LLVM_DEBUG(
+          llvm::dbgs()
+          << "access patterns failed compatibility checks for combining\n");
+      return failure();
+    }
+  }
+
+  if (l2ToL1DmaOps.size() != l3ToL2DmaOps.size()) {
+    LLVM_DEBUG(
+        llvm::dbgs()
+        << "expected 1:1 correspondence between L3->L2 and L2->L1 Dma ops\n");
+    return failure();
+  }
+
+  SmallVector<unsigned> combiningDims, nonCombiningDims;
+  if (failed(fetchCombiningDimensions(l3ToL2DmaOps, combiningDims,
+                                      nonCombiningDims))) {
+    return failure();
+  }
+
+  // Sort the L3->L2 Dma ops based on their (overlapping) offsets, via an
+  // insertion sort, and reorder the corresponding L2->L1 Dma ops in lockstep.
+  for (int64_t i = 1, n = l3ToL2DmaOps.size(); i < n; i++) {
+    DmaCpyNdOp currL3ToL2DmaOp = l3ToL2DmaOps[i];
+    DmaCpyNdOp currL2ToL1DmaOp = l2ToL1DmaOps[i];
+    int64_t j = i - 1;
+    while (j >= 0 &&
+           compareL3ToL2DmaPairOffsets(currL3ToL2DmaOp, l3ToL2DmaOps[j])) {
+      l3ToL2DmaOps[j + 1] = l3ToL2DmaOps[j];
+      l2ToL1DmaOps[j + 1] = l2ToL1DmaOps[j];
+      j--;
+    }
+    l3ToL2DmaOps[j + 1] = currL3ToL2DmaOp;
+    l2ToL1DmaOps[j + 1] = currL2ToL1DmaOp;
+  }
+
+  // Currently we have 4 cores, so there are two pairs of DmaCpyNd ops to
+  // combine.
+  // TODO(avarma): Revisit this later when we want to target more cores.
+  if (l3ToL2DmaOps.size() % 2 != 0) {
+    LLVM_DEBUG(llvm::dbgs()
+               << "found an odd number of L3->L2 ops for combining\n");
+    return failure();
+  }
+
+  auto createL2ToL1ForReuse =
+      [](IRRewriter &rewriter, DmaCpyNdOp &l2ToL1DmaOp,
+         LogicalObjectFifoFromMemrefOp &reuseL1Buffer,
+         LogicalObjectFifoFromMemrefOp &reuseL2Buffer,
+         SmallVector<OpFoldResult> &newL2SourceOffsets) -> DmaCpyNdOp {
+    OpBuilder::InsertionGuard guard(rewriter);
+    rewriter.setInsertionPoint(l2ToL1DmaOp);
+    auto newL2ToL1DmaOp = rewriter.create<AMDAIE::DmaCpyNdOp>(
+        l2ToL1DmaOp.getLoc(), reuseL1Buffer,
+        l2ToL1DmaOp.getTargetMixedOffsets(), l2ToL1DmaOp.getTargetMixedSizes(),
+        l2ToL1DmaOp.getTargetMixedStrides(), reuseL2Buffer,
+        llvm::ArrayRef(newL2SourceOffsets), l2ToL1DmaOp.getSourceMixedSizes(),
+        l2ToL1DmaOp.getSourceMixedStrides());
+    rewriter.replaceOp(l2ToL1DmaOp, newL2ToL1DmaOp);
+    return newL2ToL1DmaOp;
+  };
+  // At this point we have an L3->L2 DmaCpyNd chain sorted by increasing
+  // offsets; refer to `compareL3ToL2DmaPairOffsets`'s doc comment. Now we
+  // pick up pairs of such DmaCpyNd ops from the chain, e.g. pair[0,1], then
+  // pair[2,3], etc. For each such pair[i, i+1] we attempt to combine the
+  // logical objectFifos as per the following algorithm :-
+  // a. Combine the i-th and i+1-th L3->L2 DmaCpyNd ops.
+  // b. Form a reusable L1 buffer by assigning the cumulative tiles of the
+  //    intended core ops.
+  // c. Since step a creates a new L2 buffer (with the combined shape), the
+  //    corresponding two L2->L1 Dma ops need to be recreated. NOTE: both new
+  //    L2->L1 Dma ops will reuse the same L1 buffer as well.
+  // d. Pick the unique core ops corresponding to the i-th and i+1-th L2->L1
+  //    Dma ops and do the following :-
+  //    1. For the i-th CoreOp, insert an AccessOp from the same L1 buffer
+  //       towards the end.
+  //    2. For the i+1-th CoreOp, insert an AccessOp from the same L1 buffer
+  //       right before the corresponding AccessOp within the same CoreOp.
+  for (unsigned i = 0, n = l3ToL2DmaOps.size(); i < n; i += 2) {
+    // Step 1. Combine the picked L3->L2 DmaCpyNd pair.
+    FailureOr<LogicalObjectFifoFromMemrefOp> maybeNewL2ObjectFifo =
+        combineL3ToL2Pair(rewriter, l3ToL2DmaOps[i], l3ToL2DmaOps[i + 1],
+                          combiningDims, nonCombiningDims);
+    if (failed(maybeNewL2ObjectFifo)) return failure();
+    LogicalObjectFifoFromMemrefOp newL2ObjectFifo =
+        maybeNewL2ObjectFifo.value();
+
+    LogicalObjectFifoFromMemrefOp oldFirstL1ObjFifoOp =
+        l2ToL1DmaOps[i].getTargetObjectFifo();
+    LogicalObjectFifoFromMemrefOp oldSecondL1ObjFifoOp =
+        l2ToL1DmaOps[i + 1].getTargetObjectFifo();
+    // Step 2. Form the reusable L1 buffer by assigning the cumulative tiles
+    // of the intended core ops.
+    LogicalObjectFifoFromMemrefOp reuseL1LogicalObjectFifoOp =
+        l2ToL1DmaOps[i].getTargetObjectFifo();
+    SmallVector<Value> tiles;
+    auto addNewTileFrom = [&](CoreOp coreOp) -> LogicalResult {
+      OpBuilder::InsertionGuard guard(rewriter);
+      TileOp tileOp = coreOp.getTileOp();
+      std::optional<int64_t> column = getConstantIntValue(tileOp.getCol());
+      std::optional<int64_t> row = getConstantIntValue(tileOp.getRow());
+      if (!column || !row) {
+        return coreOp.emitOpError() << "has non-constant tile location";
+      }
+      rewriter.setInsertionPoint(reuseL1LogicalObjectFifoOp);
+      auto colIndex = rewriter.create<arith::ConstantIndexOp>(
+          rewriter.getUnknownLoc(), *column);
+      auto rowIndex = rewriter.create<arith::ConstantIndexOp>(
+          rewriter.getUnknownLoc(), *row);
+      tileOp = rewriter.create<TileOp>(rewriter.getUnknownLoc(), colIndex,
+                                       rowIndex);
+      tiles.push_back(tileOp.getResult());
+      return success();
+    };
+    std::optional<CoreOp> maybeFirstCoreOp =
+        fetchUniqueCoreOp(l2ToL1DmaOps[i]);
+    if (!maybeFirstCoreOp) return failure();
+    CoreOp firstCoreOp = maybeFirstCoreOp.value();
+    std::optional<CoreOp> maybeSecondCoreOp =
+        fetchUniqueCoreOp(l2ToL1DmaOps[i + 1]);
+    if (!maybeSecondCoreOp) return failure();
+    CoreOp secondCoreOp = maybeSecondCoreOp.value();
+    if (failed(addNewTileFrom(firstCoreOp)) ||
+        failed(addNewTileFrom(secondCoreOp))) {
+      return failure();
+    }
+    llvm::sort(tiles.begin(), tiles.end(),
+               AMDAIE::TileOp::tileValueColumnAndRowComparator);
+    rewriter.setInsertionPoint(reuseL1LogicalObjectFifoOp);
+    reuseL1LogicalObjectFifoOp =
+        rewriter.create<AMDAIE::LogicalObjectFifoFromMemrefOp>(
+            reuseL1LogicalObjectFifoOp.getLoc(),
+            cast<LogicalObjectFifoType>(
+                reuseL1LogicalObjectFifoOp.getOutput().getType()),
+            reuseL1LogicalObjectFifoOp.getMemref(), tiles);
+
+    // Step 3. We now need to create two new L2->L1 Dma ops since the size
+    // has changed, and for that we first need to find the new offsets for L2
+    // as the source.
+    // TODO: For now the offsets are hardcoded; later they should be derived
+    // from the combining/non-combining dimensions.
+    // Offset = 0, 0.
+    SmallVector<OpFoldResult> newL2AsSourceOffsets =
+        l2ToL1DmaOps[i].getSourceMixedOffsets();
+    createL2ToL1ForReuse(rewriter, l2ToL1DmaOps[i], reuseL1LogicalObjectFifoOp,
+                         newL2ObjectFifo, newL2AsSourceOffsets);
+    // Offset = 0, 1. NOTE: here we use the same L1 logical objectFifo as the
+    // first L2->L1 Dma op.
+    newL2AsSourceOffsets = l2ToL1DmaOps[i + 1].getSourceMixedOffsets();
+    newL2AsSourceOffsets[1] = rewriter.getIndexAttr(1);
+    createL2ToL1ForReuse(rewriter, l2ToL1DmaOps[i + 1],
+                         reuseL1LogicalObjectFifoOp, newL2ObjectFifo,
+                         newL2AsSourceOffsets);
+
+    // Step 4.
+    // Pick the CoreOps associated 1:1 with the two L2->L1 Dma ops.
+    // For the first CoreOp we insert the Read access towards the end; the
+    // exact position doesn't matter for now, so we insert it right after the
+    // original access op (before amdaie.end).
+    firstCoreOp.walk([&](AMDAIE::LogicalObjectFifoAccessOp accessOp) {
+      if (accessOp.getInput() == oldFirstL1ObjFifoOp.getOutput()) {
+        OpBuilder::InsertionGuard guard(rewriter);
+        rewriter.setInsertionPointAfter(accessOp);
+        rewriter.create<AMDAIE::LogicalObjectFifoAccessOp>(
+            rewriter.getUnknownLoc(), reuseL1LogicalObjectFifoOp.getOutput(),
+            accessOp.getAccessType());
+      }
+    });
+    rewriter.replaceOp(oldFirstL1ObjFifoOp, reuseL1LogicalObjectFifoOp);
+    // For the second CoreOp we insert the `Read` access right before the
+    // first read from the corresponding L1 logical objectFifo.
+    secondCoreOp.walk([&](AMDAIE::LogicalObjectFifoAccessOp accessOp) {
+      if (accessOp.getInput() == oldSecondL1ObjFifoOp.getOutput()) {
+        OpBuilder::InsertionGuard guard(rewriter);
+        rewriter.setInsertionPoint(accessOp);
+        rewriter.create<AMDAIE::LogicalObjectFifoAccessOp>(
+            rewriter.getUnknownLoc(), reuseL1LogicalObjectFifoOp.getOutput(),
+            accessOp.getAccessType());
+        // Insert a second access op and replace the original one with it,
+        // since THIS is the access that will actually be used.
+        auto secondAccessOp =
+            rewriter.create<AMDAIE::LogicalObjectFifoAccessOp>(
+                rewriter.getUnknownLoc(),
+                reuseL1LogicalObjectFifoOp.getOutput(),
+                accessOp.getAccessType());
+        rewriter.replaceOp(accessOp, secondAccessOp);
+      }
+    });
+  }
+
+  return success();
+}
+
 }  // namespace mlir::iree_compiler::AMDAIE
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELogicalObjFifoSplittingUtils.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELogicalObjFifoSplittingUtils.h
index f9339b2ac..f0ed234f1 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELogicalObjFifoSplittingUtils.h
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELogicalObjFifoSplittingUtils.h
@@ -19,6 +19,11 @@ LogicalResult splitLogicalObjectFifos(
     IRRewriter &rewriter, SmallVector<AMDAIE::DmaCpyNdOp> &l2ToL1DmaOps,
     MLIRContext *context);
 
+/// Utility to combine logical objectFifos given a vector of L2->L1 Dma ops.
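+/// For each pair of compatible L3->L2 Dma ops feeding the given L2->L1 Dma
+/// ops, the two L2 buffers are merged into one (widened along the combining
+/// dimension) and the L2->L1 Dma ops and core access ops are rewritten to
+/// reuse the shared L2/L1 buffers.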
+LogicalResult combineLogicalObjectFifos(
+    IRRewriter &rewriter, SmallVector<AMDAIE::DmaCpyNdOp> &l2ToL1DmaOps,
+    MLIRContext *context);
+
 }  // namespace mlir::iree_compiler::AMDAIE
 
 #endif
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/CMakeLists.txt b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/CMakeLists.txt
index 2979c71ef..ca7fc9bd5 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/CMakeLists.txt
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/CMakeLists.txt
@@ -53,6 +53,7 @@ iree_cc_library(
     "AMDAIECanonicalizeDma.cpp"
     "AMDAIECanonicalizeNpuDmaCpyNd.cpp"
     "AMDAIECanonicalizeDoublyStridedOp.cpp"
+    "AMDAIECombineLogicalObjFifosForConnectionReuse.cpp"
    "AMDAIECombineStridedOps.cpp"
     "AMDAIEControlCodeLoopUnroll.cpp"
     "AMDAIEConvertCoreForallToFor.cpp"
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h
index 8912db52d..46793bc34 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h
@@ -31,6 +31,7 @@ namespace mlir::iree_compiler::AMDAIE {
 #define GEN_PASS_DEF_AMDAIECANONICALIZEDOUBLYSTRIDEDOP
 #define GEN_PASS_DEF_AMDAIECANONICALIZENPUDMACPYND
 #define GEN_PASS_DEF_AMDAIECLEANUP
+#define GEN_PASS_DEF_AMDAIECOMBINELOGICALOBJFIFOSFORCONNECTIONREUSE
 #define GEN_PASS_DEF_AMDAIECOMBINESTRIDEDOPS
 #define GEN_PASS_DEF_AMDAIECONTROLCODELOOPUNROLL
 #define GEN_PASS_DEF_AMDAIECONVERTCOREFORALLTOFOR
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h
index fe5670067..585af50b4 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h
@@ -87,6 +87,10 @@ std::unique_ptr<Pass> createAMDAIECanonicalizeNpuDmaCpyNdPass();
 std::unique_ptr<Pass> createAMDAIECanonicalizeDoublyStridedOpPass(
     AMDAIECanonicalizeDoublyStridedOpOptions options = {});
 
+/// Create a pass to combine logical objectFifos for connection reuse.
+std::unique_ptr<Pass>
+createAMDAIECombineLogicalObjFifosForConnectionReusePass();
+
 /// Pass to unroll the loops within the control code regions.
 std::unique_ptr<Pass> createAMDAIEControlCodeLoopUnrollPass();
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td
index 73ceee040..19f3a2608 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td
@@ -123,6 +123,12 @@ def AMDAIECleanup :
       "mlir::iree_compiler::AMDAIE::createAMDAIECleanupPass()";
 }
 
+def AMDAIECombineLogicalObjFifosForConnectionReuse :
+    Pass<"iree-amdaie-combine-logical-objectfifos-for-connection-reuse", "ModuleOp"> {
+  let summary = "Pass to combine L2 buffers to share inputs of Matmul and Elementwise operations.";
+  let constructor = "mlir::iree_compiler::AMDAIE::createAMDAIECombineLogicalObjFifosForConnectionReusePass()";
+}
+
 def AMDAIECombineStridedOps :
     Pass<"iree-amdaie-combine-strided-ops", ""> {
   let summary = "Combine strided ops in same block if access patterns are compatible.";
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt
index ba4380860..261a8068c 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt
@@ -19,6 +19,7 @@ iree_lit_test_suite(
     "canonicalize_dma.mlir"
     "canonicalize_doubly_strided_op.mlir"
     "canonicalize_npu_dma_cpy_nd.mlir"
+    "combine_logicalobjfifos_for_connection_reuse.mlir"
     "combine_strided_ops.mlir"
     "controlcode_loop_unrolling.mlir"
     "convert_core_forall_to_for.mlir"
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/combine_logicalobjfifos_for_connection_reuse.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/combine_logicalobjfifos_for_connection_reuse.mlir
new file mode 100644
index 000000000..a493efdce
--- /dev/null
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/combine_logicalobjfifos_for_connection_reuse.mlir
@@ -0,0 +1,219 @@
+// RUN: iree-opt --pass-pipeline="builtin.module(iree-amdaie-combine-logical-objectfifos-for-connection-reuse,cse)" --split-input-file --verify-diagnostics %s | FileCheck %s
+
+// CHECK-DAG: #map = affine_map<(d0) -> (d0 * 64)>
+// CHECK-DAG: #map1 = affine_map<(d0) -> (d0 * 64 + 32)>
+// CHECK:     @combine_logical_objFifos
+// CHECK-DAG:   %[[C0:.*]] = arith.constant 0 : index
+// CHECK-DAG:   %[[C1:.*]] = arith.constant 1 : index
+// CHECK-DAG:   %[[C2:.*]] = arith.constant 2 : index
+// CHECK-DAG:   %[[C3:.*]] = arith.constant 3 : index
+// CHECK:       memref.alloc() : memref<1x2x32x32xi32, 1 : i32>
+// CHECK:       %[[L2_ALLOC_0:.*]] = memref.alloc() : memref<1x2x32x32xi32, 1 : i32>
+// CHECK:       %[[L2_ALLOC_1:.*]] = memref.alloc() : memref<1x2x32x32xi32, 1 : i32>
+// CHECK:       %[[L3_ALLOC:.*]] = memref.alloc() : memref<128x128xi32>
+// CHECK-DAG:   %[[L1_ALLOC:.*]] = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
+// CHECK-DAG:   %[[TILE_0:.*]] = amdaie.tile(%[[C1]], %[[C3]])
+// CHECK-DAG:   %[[TILE_1:.*]] = amdaie.tile(%[[C0]], %[[C2]])
+// CHECK-DAG:   %[[TILE_2:.*]] = amdaie.tile(%[[C1]], %[[C2]])
+// CHECK-DAG:   %[[TILE_3:.*]] = amdaie.tile(%[[C0]], %[[C3]])
+// CHECK:       %[[L2_OBJECTFIFO_0:.*]] = amdaie.logicalobjectfifo.from_memref %[[L2_ALLOC_0]], {%[[TILE_0]]} :
+// CHECK-SAME:    memref<1x2x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x2x32x32xi32, 1 : i32>>
+// CHECK:       %[[L2_OBJECTFIFO_1:.*]] = amdaie.logicalobjectfifo.from_memref %[[L2_ALLOC_1]], {%[[TILE_0]]} :
+// CHECK-SAME:    memref<1x2x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x2x32x32xi32, 1 : i32>>
+// CHECK:       %[[L3_OBJECTFIFO:.*]] = amdaie.logicalobjectfifo.from_memref %[[L3_ALLOC]], {%[[TILE_0]]} :
+// CHECK-SAME:    memref<128x128xi32> -> !amdaie.logicalobjectfifo<memref<128x128xi32>>
+// CHECK:       scf.forall (%[[IV0:.*]], %[[IV1:.*]]) in (2, 2)
+// CHECK-DAG:     %[[IV1_0:.*]] = affine.apply #map(%[[IV1]])
+// CHECK-DAG:     %[[IV0_0:.*]] = affine.apply #map(%[[IV0]])
+// CHECK-DAG:     %[[IV0_32:.*]] = affine.apply #map1(%[[IV0]])
+// CHECK:         %[[DMA_CPY_ND_L3_TO_L2_0:.*]] = amdaie.dma_cpy_nd(
+// CHECK-SAME:      %[[L2_OBJECTFIFO_0]][0, 0, 0, 0] [1, 2, 32, 32] [2048, 1024, 32, 1]
+// CHECK-SAME:      %[[L3_OBJECTFIFO]][0, 0, %[[IV0_0]], %[[IV1_0]]] [1, 2, 32, 32] [4096, 32, 128, 1]
+// CHECK:         %[[DMA_CPY_ND_L3_TO_L2_1:.*]] = amdaie.dma_cpy_nd(
+// CHECK-SAME:      %[[L2_OBJECTFIFO_1]][0, 0, 0, 0] [1, 2, 32, 32] [2048, 1024, 32, 1]
+// CHECK-SAME:      %[[L3_OBJECTFIFO]][0, 0, %[[IV0_32]], %[[IV1_0]]] [1, 2, 32, 32] [4096, 32, 128, 1]
+// CHECK:         %[[L1_OBJECTFIFO_0:.*]] = amdaie.logicalobjectfifo.from_memref %[[L1_ALLOC]], {%[[TILE_1]], %[[TILE_0]]}
+// CHECK:         %[[DMA_CPY_ND_L2_TO_L1_0:.*]] = amdaie.dma_cpy_nd(
+// CHECK-SAME:      %[[L1_OBJECTFIFO_0]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1]
+// CHECK-SAME:      %[[L2_OBJECTFIFO_0]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1]
+// CHECK:         amdaie.core(%[[TILE_1]], in : [%{{.*}}, %{{.*}}, %[[DMA_CPY_ND_L2_TO_L1_0]]], out :
+// CHECK:           linalg.generic
+// CHECK:           %[[FIRST_READ:.*]] = amdaie.logicalobjectfifo.access(%[[L1_OBJECTFIFO_0]], Read)
+// CHECK:           amdaie.logicalobjectfifo.access(%[[L1_OBJECTFIFO_0]], Read)
+// CHECK:           linalg.generic
+// CHECK-SAME:        %[[FIRST_READ]]
+// CHECK:           amdaie.end
+// CHECK:         }
+// CHECK:         %[[DMA_CPY_ND_L2_TO_L1_1:.*]] = amdaie.dma_cpy_nd(
+// CHECK-SAME:      %[[L1_OBJECTFIFO_0]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1]
+// CHECK-SAME:      %[[L2_OBJECTFIFO_0]][0, 1, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1]
+// CHECK:         amdaie.core(%[[TILE_0]], in : [%{{.*}}, %{{.*}}, %[[DMA_CPY_ND_L2_TO_L1_1]]], out :
+// CHECK:           linalg.generic
+// CHECK:           amdaie.logicalobjectfifo.access(%[[L1_OBJECTFIFO_0]], Read)
+// CHECK:           %[[SECOND_READ:.*]] = amdaie.logicalobjectfifo.access(%[[L1_OBJECTFIFO_0]], Read)
+// CHECK:           linalg.generic
+// CHECK-SAME:        %[[SECOND_READ]]
+// CHECK:           amdaie.end
+// CHECK:         }
+// CHECK:         %[[L1_OBJECTFIFO_1:.*]] = amdaie.logicalobjectfifo.from_memref %[[L1_ALLOC]], {%[[TILE_3]], %[[TILE_2]]}
+// CHECK:         %[[DMA_CPY_ND_L2_TO_L1_2:.*]] = amdaie.dma_cpy_nd(
+// CHECK-SAME:      %[[L1_OBJECTFIFO_1]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1]
+// CHECK-SAME:      %[[L2_OBJECTFIFO_1]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1]
+// CHECK:         amdaie.core(%[[TILE_2]], in : [%{{.*}}, %{{.*}}, %[[DMA_CPY_ND_L2_TO_L1_2]]], out :
+// CHECK:           linalg.generic
+// CHECK:           %[[FIRST_READ:.*]] = amdaie.logicalobjectfifo.access(%[[L1_OBJECTFIFO_1]], Read)
+// CHECK:           amdaie.logicalobjectfifo.access(%[[L1_OBJECTFIFO_1]], Read)
+// CHECK:           linalg.generic
+// CHECK-SAME:        %[[FIRST_READ]]
+// CHECK:           amdaie.end
+// CHECK:         }
+// CHECK:         %[[DMA_CPY_ND_L2_TO_L1_3:.*]] = amdaie.dma_cpy_nd(
+// CHECK-SAME:      %[[L1_OBJECTFIFO_1]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1]
+// CHECK-SAME:      %[[L2_OBJECTFIFO_1]][0, 1, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1]
+// CHECK:         amdaie.core(%[[TILE_3]], in : [%{{.*}}, %{{.*}}, %[[DMA_CPY_ND_L2_TO_L1_3]]], out :
+// CHECK:           linalg.generic
+// CHECK:           amdaie.logicalobjectfifo.access(%[[L1_OBJECTFIFO_1]], Read)
+// CHECK:           %[[SECOND_READ:.*]] = amdaie.logicalobjectfifo.access(%[[L1_OBJECTFIFO_1]], Read)
+// CHECK:           linalg.generic
+// CHECK-SAME:        %[[SECOND_READ]]
+// CHECK:           amdaie.end
+// CHECK:         }
+#map = affine_map<(d0) -> (d0 * 64)>
+#map1 = affine_map<(d0) -> (d0 * 64 + 32)>
+#map2 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>
+#map3 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>
+#map4 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>
+#map5 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>
+module {
+  func.func @combine_logical_objFifos(%arg0: !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>, %arg1: !amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xi32, 2 : i32>>, %arg2: !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>, %arg3: !amdaie.logicalobjectfifo<memref<2x2x32x32xi32, 1 : i32>>) {
+    %c3 = arith.constant 3 : index
+    %c2 = arith.constant 2 : index
+    %c1 = arith.constant 1 : index
+    %c0 = arith.constant 0 : index
+    %alloc = memref.alloc() : memref<2x1x32x32xi32, 1 : i32>
+    %alloc_0 = memref.alloc() : memref<1x2x32x32xi32, 1 : i32>
+    %alloc_1 = memref.alloc() : memref<1x1x32x32xi32, 1 : i32>
+    %alloc_2 = memref.alloc() : memref<1x1x32x32xi32, 1 : i32>
+    %alloc_3 = memref.alloc() : memref<1x1x32x32xi32, 1 : i32>
+    %alloc_4 = memref.alloc() : memref<1x1x32x32xi32, 1 : i32>
+    %alloc_5 = memref.alloc() : memref<128x128xi32>
+    %alloc_6 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
+    %tile = amdaie.tile(%c1, %c3)
+    %tile_7 = amdaie.tile(%c0, %c2)
+    %tile_8 = amdaie.tile(%c1, %c2)
+    %tile_9 = amdaie.tile(%c0, %c3)
+    %0 = amdaie.logicalobjectfifo.from_memref %alloc_1, {%tile} : memref<1x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x32x32xi32, 1 : i32>>
+    %1 = amdaie.logicalobjectfifo.from_memref %alloc_2, {%tile} : memref<1x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x32x32xi32, 1 : i32>>
+    %2 = amdaie.logicalobjectfifo.from_memref %alloc_3, {%tile} : memref<1x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x32x32xi32, 1 : i32>>
+    %3 = amdaie.logicalobjectfifo.from_memref %alloc_4, {%tile} : memref<1x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x32x32xi32, 1 : i32>>
+    %4 = amdaie.logicalobjectfifo.from_memref %alloc_5, {%tile} : memref<128x128xi32> -> !amdaie.logicalobjectfifo<memref<128x128xi32>>
+    scf.forall (%arg4, %arg5) in (2, 2) {
+      %5 = affine.apply #map(%arg5)
+      %6 = affine.apply #map1(%arg5)
+      %7 = affine.apply #map(%arg4)
+      %8 = affine.apply #map1(%arg4)
+      %9 = amdaie.dma_cpy_nd(%0[0, 0, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1], %4[0, 0, %7, %5] [1, 1, 32, 32] [4096, 32, 128, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x32x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<128x128xi32>>)
+      %10 = amdaie.dma_cpy_nd(%1[0, 0, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1], %4[0, 0, %7, %6] [1, 1, 32, 32] [4096, 32, 128, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x32x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<128x128xi32>>)
+      %11 = amdaie.dma_cpy_nd(%2[0, 0, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1], %4[0, 0, %8, %5] [1, 1, 32, 32] [4096, 32, 128, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x32x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<128x128xi32>>)
+      %12 = amdaie.dma_cpy_nd(%3[0, 0, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1], %4[0, 0, %8, %6] [1, 1, 32, 32] [4096, 32, 128, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x32x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<128x128xi32>>)
+      %13 = amdaie.logicalobjectfifo.from_memref %alloc, {%tile} : memref<2x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x1x32x32xi32, 1 : i32>>
+      %14 = amdaie.logicalobjectfifo.from_memref %alloc_0, {%tile} : memref<1x2x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x2x32x32xi32, 1 : i32>>
+      %15 = amdaie.dma_cpy_nd(%arg0[0, 0, 0, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1024, 1024, 256, 32, 8, 1], %13[1, 0, 0, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1024, 1024, 8, 128, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<2x1x32x32xi32, 1 : i32>>)
+      %16 = amdaie.dma_cpy_nd(%arg1[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [1024, 1024, 128, 32, 4, 1], %14[0, 1, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [2048, 1024, 4, 256, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<1x2x32x32xi32, 1 : i32>>)
+      %17 = amdaie.dma_cpy_nd(%arg3[1, 1, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1], %arg2[0, 0, 0, 0] [8, 4, 8, 4] [16, 4, 128, 1]) : (!amdaie.logicalobjectfifo<memref<2x2x32x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>)
+      %18 = amdaie.logicalobjectfifo.from_memref %alloc_6, {%tile_7} : memref<1x1x8x8x4x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>
+      %19 = amdaie.dma_cpy_nd(%18[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1], %0[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<1x1x32x32xi32, 1 : i32>>)
+      %20 = amdaie.core(%tile_7, in : [%15, %16, %19], out : [%17]) {
+        %30 = amdaie.logicalobjectfifo.access(%arg0, Read) : !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>> -> memref<1x1x4x8x4x8xi32, 2 : i32>
+        %31 = amdaie.logicalobjectfifo.access(%arg1, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xi32, 2 : i32>> -> memref<1x1x8x4x8x4xi32, 2 : i32>
+        %32 = amdaie.logicalobjectfifo.access(%arg2, None) : !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>> -> memref<1x1x8x8x4x4xi32, 2 : i32>
+        linalg.generic {indexing_maps = [#map2, #map3, #map4], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%30, %31 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%32 : memref<1x1x8x8x4x4xi32, 2 : i32>) {
+        ^bb0(%in: i32, %in_10: i32, %out: i32):
+          %35 = arith.muli %in, %in_10 : i32
+          %36 = arith.addi %out, %35 : i32
+          linalg.yield %36 : i32
+        }
+        %33 = amdaie.logicalobjectfifo.access(%18, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>> -> memref<1x1x8x8x4x4xi32, 2 : i32>
+        %34 = amdaie.logicalobjectfifo.access(%arg2, Write) : !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>> -> memref<1x1x8x8x4x4xi32, 2 : i32>
+        linalg.generic {indexing_maps = [#map5, #map5, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%32, %33 : memref<1x1x8x8x4x4xi32, 2 : i32>, memref<1x1x8x8x4x4xi32, 2 : i32>) outs(%34 : memref<1x1x8x8x4x4xi32, 2 : i32>) {
+        ^bb0(%in: i32, %in_10: i32, %out: i32):
+          %35 = arith.addi %in, %in_10 : i32
+          linalg.yield %35 : i32
+        }
+        amdaie.end
+      }
+      %21 = amdaie.logicalobjectfifo.from_memref %alloc_6, {%tile} : memref<1x1x8x8x4x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>
+      %22 = amdaie.dma_cpy_nd(%21[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1], %1[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<1x1x32x32xi32, 1 : i32>>)
+      %23 = amdaie.core(%tile, in : [%15, %16, %22], out : [%17]) {
+        %30 = amdaie.logicalobjectfifo.access(%arg0, Read) : !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>> -> memref<1x1x4x8x4x8xi32, 2 : i32>
+        %31 = amdaie.logicalobjectfifo.access(%arg1, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xi32, 2 : i32>> -> memref<1x1x8x4x8x4xi32, 2 : i32>
+        %32 = amdaie.logicalobjectfifo.access(%arg2, None) : !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>> -> memref<1x1x8x8x4x4xi32, 2 : i32>
+        linalg.generic {indexing_maps = [#map2, #map3, #map4], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%30, %31 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%32 : memref<1x1x8x8x4x4xi32, 2 : i32>) {
+        ^bb0(%in: i32, %in_10: i32, %out: i32):
+          %35 = arith.muli %in, %in_10 : i32
+          %36 = arith.addi %out, %35 : i32
+          linalg.yield %36 : i32
+        }
+        %33 = amdaie.logicalobjectfifo.access(%21, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>> -> memref<1x1x8x8x4x4xi32, 2 : i32>
+        %34 = amdaie.logicalobjectfifo.access(%arg2, Write) : !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>> -> memref<1x1x8x8x4x4xi32, 2 : i32>
+        linalg.generic {indexing_maps = [#map5, #map5, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%32, %33 : memref<1x1x8x8x4x4xi32, 2 : i32>, memref<1x1x8x8x4x4xi32, 2 : i32>) outs(%34 : memref<1x1x8x8x4x4xi32, 2 : i32>) {
+        ^bb0(%in: i32, %in_10: i32, %out: i32):
+          %35 = arith.addi %in, %in_10 : i32
+          linalg.yield %35 : i32
+        }
+        amdaie.end
+      }
+      %24 = amdaie.logicalobjectfifo.from_memref %alloc_6, {%tile_8} : memref<1x1x8x8x4x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>
+      %25 = amdaie.dma_cpy_nd(%24[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1], %2[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<1x1x32x32xi32, 1 : i32>>)
+      %26 = amdaie.core(%tile_8, in : [%15, %16, %25], out : [%17]) {
+        %30 = amdaie.logicalobjectfifo.access(%arg0, Read) : !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>> -> memref<1x1x4x8x4x8xi32, 2 : i32>
+        %31 = amdaie.logicalobjectfifo.access(%arg1, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xi32, 2 : i32>> -> memref<1x1x8x4x8x4xi32, 2 : i32>
+        %32 = amdaie.logicalobjectfifo.access(%arg2, None) : !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>> -> memref<1x1x8x8x4x4xi32, 2 : i32>
+        linalg.generic {indexing_maps = [#map2, #map3, #map4], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%30, %31 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%32 : memref<1x1x8x8x4x4xi32, 2 : i32>) {
+        ^bb0(%in: i32, %in_10: i32, %out: i32):
+          %35 = arith.muli %in, %in_10 : i32
+          %36 = arith.addi %out, %35 : i32
+          linalg.yield %36 : i32
+        }
+        %33 = amdaie.logicalobjectfifo.access(%24, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>> -> memref<1x1x8x8x4x4xi32, 2 : i32>
+        %34 = amdaie.logicalobjectfifo.access(%arg2, Write) : !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>> -> memref<1x1x8x8x4x4xi32, 2 : i32>
+        linalg.generic {indexing_maps = [#map5, #map5, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%32, %33 : memref<1x1x8x8x4x4xi32, 2 : i32>, memref<1x1x8x8x4x4xi32, 2 : i32>) outs(%34 : memref<1x1x8x8x4x4xi32, 2 : i32>) {
+        ^bb0(%in: i32, %in_10: i32, %out: i32):
+          %35 = arith.addi %in, %in_10 : i32
+          linalg.yield %35 : i32
+        }
+        amdaie.end
+      }
+      %27 = amdaie.logicalobjectfifo.from_memref %alloc_6, {%tile_9} : memref<1x1x8x8x4x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>
+      %28 = amdaie.dma_cpy_nd(%27[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1], %3[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<1x1x32x32xi32, 1 : i32>>)
+      %29 = amdaie.core(%tile_9, in : [%15, %16, %28], out : [%17]) {
+        %30 = amdaie.logicalobjectfifo.access(%arg0, Read) : !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>> -> memref<1x1x4x8x4x8xi32, 2 : i32>
+        %31 = amdaie.logicalobjectfifo.access(%arg1, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xi32, 2 : i32>> -> memref<1x1x8x4x8x4xi32, 2 : i32>
+        %32 = amdaie.logicalobjectfifo.access(%arg2, None) : !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>> -> memref<1x1x8x8x4x4xi32, 2 : i32>
+        linalg.generic {indexing_maps = [#map2, #map3, #map4], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%30, %31 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%32 : memref<1x1x8x8x4x4xi32, 2 : i32>) {
+        ^bb0(%in: i32, %in_10: i32, %out: i32):
+          %35 = arith.muli %in, %in_10 : i32
+          %36 = arith.addi %out, %35 : i32
+          linalg.yield %36 : i32
+        }
+        %33 = amdaie.logicalobjectfifo.access(%27, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>> -> memref<1x1x8x8x4x4xi32, 2 : i32>
+        %34 = amdaie.logicalobjectfifo.access(%arg2, Write) : !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>> -> memref<1x1x8x8x4x4xi32, 2 : i32>
+        linalg.generic {indexing_maps = [#map5, #map5, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%32, %33 : memref<1x1x8x8x4x4xi32, 2 : i32>, memref<1x1x8x8x4x4xi32, 2 : i32>) outs(%34 : memref<1x1x8x8x4x4xi32, 2 : i32>) {
+        ^bb0(%in: i32, %in_10: i32, %out: i32):
+          %35 = arith.addi %in, %in_10 : i32
+          linalg.yield %35 : i32
+        }
+        amdaie.end
+      }
+    } {mapping = [#gpu.block<y>, #gpu.block<x>]}
+    memref.dealloc %alloc : memref<2x1x32x32xi32, 1 : i32>
+    memref.dealloc %alloc_6 : memref<1x1x8x8x4x4xi32, 2 : i32>
+    memref.dealloc %alloc_0 : memref<1x2x32x32xi32, 1 : i32>
+    memref.dealloc %alloc_5 : memref<128x128xi32>
+    memref.dealloc %alloc_1 : memref<1x1x32x32xi32, 1 : i32>
+    memref.dealloc %alloc_2 : memref<1x1x32x32xi32, 1 : i32>
+    memref.dealloc %alloc_3 : memref<1x1x32x32xi32, 1 : i32>
+    memref.dealloc %alloc_4 : memref<1x1x32x32xi32, 1 : i32>
+    return
+  }
+}