From be4f6f9dc53f22d4794d0ae11e4ba0dd49f85ab2 Mon Sep 17 00:00:00 2001 From: Abhishek Varma Date: Mon, 9 Sep 2024 12:27:53 +0000 Subject: [PATCH] Final before PR review begins --- .../AMDAIELogicalObjFifoSplittingUtils.cpp | 360 +++++++++--------- .../Transforms/test/CMakeLists.txt | 1 + ..._logicalobjfifos_for_connection_reuse.mlir | 219 +++++++++++ 3 files changed, 410 insertions(+), 170 deletions(-) create mode 100644 compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/combine_logicalobjfifos_for_connection_reuse.mlir diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELogicalObjFifoSplittingUtils.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELogicalObjFifoSplittingUtils.cpp index 42293ff66..de00f04f8 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELogicalObjFifoSplittingUtils.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELogicalObjFifoSplittingUtils.cpp @@ -476,7 +476,25 @@ LogicalResult splitLogicalObjectFifos( return success(); } -static LogicalResult _TODOcombineAccessPatterns( +static int64_t fetchOffsetBias(OpFoldResult offsetOpFoldResult) { + std::optional offset = getConstantIntValue(offsetOpFoldResult); + if (offset) return offset.value(); + auto offsetVal = cast(offsetOpFoldResult); + auto affineApplyOp = + dyn_cast_if_present(offsetVal.getDefiningOp()); + if (!affineApplyOp) return 0; + AffineMap affineMap = affineApplyOp.getAffineMap(); + RetrieveScaleAndBias retriever; + assert(!failed(retriever.visit(affineMap.getResult(0))) && + "failed to retrieve scale and bias"); + int64_t bias = 0; + if (retriever.bias) { + bias = retriever.bias.value(); + } + return bias; +} + +static LogicalResult combineL3ToL2AccessPatterns( RewriterBase &rewriter, const SmallVector &offsetsA, const SmallVector &sizesA, const SmallVector &stridesA, @@ -484,32 +502,80 @@ static LogicalResult _TODOcombineAccessPatterns( const SmallVector &sizesB, const SmallVector 
&stridesB, SmallVector &newOffsets, SmallVector &newSizes, - SmallVector &newStrides) { - // TODO: Move these checks later in a separate func. - assert(offsetsA.size() == offsetsB.size() && - "expected same number of source offsets and target offsets"); - assert(offsetsA.size() == sizesA.size() && - "expected same number of source offsets and sizes"); - assert(offsetsA.size() == stridesA.size() && - "expected same number of source offsets and strides"); - assert(offsetsB.size() == sizesB.size() && - "expected same number of target offsets and sizes"); - assert(offsetsB.size() == stridesB.size() && - "expected same number of target offsets and strides"); - + SmallVector &newStrides, SmallVector &splitDims, + SmallVector &nonSplitDims) { if (offsetsA.empty() && offsetsB.empty()) return success(); + int64_t newSize = 1; for (auto iter : llvm::enumerate(llvm::zip(offsetsA, offsetsB))) { + if (iter.index() < splitDims.size()) continue; const OpFoldResult &offsetA = std::get<0>(iter.value()); const OpFoldResult &offsetB = std::get<1>(iter.value()); if (offsetA != offsetB) { // Need to check the difference in bias here. 
+ int64_t biasA = fetchOffsetBias(offsetA); + int64_t biasB = fetchOffsetBias(offsetB); + std::optional sizeA = getConstantIntValue(sizesA[iter.index()]); + assert(sizeA && "expected a constant integer value for size"); + assert((sizeA == biasB - biasA) && + "L3->L2 pair cannot be combined because offset is not contiguous"); + newSize++; } } - newSizes[1] = rewriter.getI64IntegerAttr(2); + newSizes[splitDims.size() - 1] = rewriter.getI64IntegerAttr(newSize); return success(); } +static FailureOr combineL3ToL2Pair( + IRRewriter &rewriter, DmaCpyNdOp dmaOpA, DmaCpyNdOp dmaOpB, + SmallVector &splitDims, SmallVector &nonSplitDims) { + OpBuilder::InsertionGuard guard(rewriter); + SmallVector sourceOffsetsA = dmaOpA.getSourceMixedOffsets(); + SmallVector sourceSizesA = dmaOpA.getSourceMixedSizes(); + SmallVector sourceStridesA = dmaOpA.getSourceMixedStrides(); + SmallVector sourceOffsetsB = dmaOpB.getSourceMixedOffsets(); + SmallVector sourceSizesB = dmaOpB.getSourceMixedSizes(); + SmallVector sourceStridesB = dmaOpB.getSourceMixedStrides(); + + SmallVector targetOffsetsA = dmaOpA.getTargetMixedOffsets(); + SmallVector targetSizesA = dmaOpA.getTargetMixedSizes(); + SmallVector targetStridesA = dmaOpA.getTargetMixedStrides(); + SmallVector targetOffsetsB = dmaOpB.getTargetMixedOffsets(); + SmallVector targetSizesB = dmaOpB.getTargetMixedSizes(); + SmallVector targetStridesB = dmaOpB.getTargetMixedStrides(); + + SmallVector newSourceOffsets = sourceOffsetsA; + SmallVector newSourceSizes = sourceSizesA; + SmallVector newSourceStrides = sourceStridesA; + if (failed(combineL3ToL2AccessPatterns( + rewriter, sourceOffsetsA, sourceSizesA, sourceStridesA, + sourceOffsetsB, sourceSizesB, sourceStridesB, newSourceOffsets, + newSourceSizes, newSourceStrides, splitDims, nonSplitDims))) { + return failure(); + } + + SmallVector newTargetOffsets = targetOffsetsA; + SmallVector newTargetSizes = newSourceSizes; + SmallVector newTargetStrides = targetStridesA; + // Now we need to create a 
new L2 buffer based on `newTargetSizes`. + LogicalObjectFifoFromMemrefOp oldL2ObjectFifo = dmaOpA.getTargetObjectFifo(); + AMDAIE::LogicalObjectFifoFromMemrefOp newL2ObjectFifo = + createNewLogicalObjectFifo(rewriter, oldL2ObjectFifo, newTargetSizes); + + // Create combined L3->L2 Dma. + rewriter.setInsertionPoint(dmaOpA); + auto combinedL3ToL2DmaOp = rewriter.create( + dmaOpA.getLoc(), newL2ObjectFifo, llvm::ArrayRef(newTargetOffsets), + llvm::ArrayRef(newTargetSizes), llvm::ArrayRef(newTargetStrides), + dmaOpA.getSource(), llvm::ArrayRef(newSourceOffsets), + llvm::ArrayRef(newSourceSizes), llvm::ArrayRef(newSourceStrides)); + // Replace the uses of 2nd L3->L2 Dma with the new combined L3->L2 Dma + // and erase the 1st L3->L2 Dma. + rewriter.replaceOp(dmaOpB, combinedL3ToL2DmaOp); + rewriter.eraseOp(dmaOpA); + return newL2ObjectFifo; +} + /// Utility to fetch a unique CoreOp associated with a L2->L1 Dma op. static CoreOp fetchUniqueCoreOp(DmaCpyNdOp &l2ToL1DmaOp) { SmallVector coreOps; @@ -580,6 +646,32 @@ static bool compareL3ToL2DmaPair(DmaCpyNdOp &a, DmaCpyNdOp &b) { return false; } +static LogicalResult checkIfSameDimensionalityAccessPatterns( + AMDAIE::DmaCpyNdOp &l3ToL2DmaOpA, AMDAIE::DmaCpyNdOp &l3ToL2DmaOpB) { + SmallVector sourceOffsetsA = + l3ToL2DmaOpA.getSourceMixedOffsets(); + SmallVector sourceSizesA = l3ToL2DmaOpA.getSourceMixedSizes(); + SmallVector sourceStridesA = + l3ToL2DmaOpA.getSourceMixedStrides(); + SmallVector sourceOffsetsB = + l3ToL2DmaOpB.getSourceMixedOffsets(); + SmallVector sourceSizesB = l3ToL2DmaOpB.getSourceMixedSizes(); + SmallVector sourceStridesB = + l3ToL2DmaOpB.getSourceMixedStrides(); + if (sourceOffsetsA.size() != sourceOffsetsB.size() || + sourceSizesA.size() != sourceSizesB.size() || + sourceStridesA.size() != sourceStridesB.size() || + sourceOffsetsA.size() != sourceSizesA.size() || + sourceOffsetsA.size() != sourceStridesB.size()) { + return failure(); + } + return success(); +} + +/// Given a vector of L2->L1 Dma Ops, 
combine the corresponding L3->L2 Dma Ops +/// and reuse the L2/L1 buffers. +/// TODO(avarma): Assign combined tiles while forming L2/L1 buffers which we'll +/// reuse. LogicalResult combineLogicalObjectFifos( IRRewriter &rewriter, SmallVector &l2ToL1DmaOps, MLIRContext *context) { @@ -605,6 +697,12 @@ LogicalResult combineLogicalObjectFifos( << " and " << l3ToL2DmaOps[i] << "\n"); return failure(); } + if (failed(checkIfSameDimensionalityAccessPatterns(l3ToL2DmaOps[0], + l3ToL2DmaOps[i]))) { + LLVM_DEBUG(llvm::dbgs() + << "Found different dimensionality of access patterns\n"); + return failure(); + } } if (l2ToL1DmaOps.size() != l3ToL2DmaOps.size()) { @@ -647,9 +745,9 @@ LogicalResult combineLogicalObjectFifos( return failure(); } } - SmallVector splitDims(maxSplitDimIndex + 1); + SmallVector splitDims(maxSplitDimIndex); std::iota(splitDims.begin(), splitDims.end(), 0); - SmallVector nonSplitDims(maxSplitDimIndex + 1); + SmallVector nonSplitDims(maxSplitDimIndex); std::iota(nonSplitDims.begin(), nonSplitDims.end(), splitDims.size()); // At this point it's nice to perhaps just sort the L3->L2 Dma ops based on @@ -668,171 +766,93 @@ LogicalResult combineLogicalObjectFifos( l2ToL1DmaOps[j + 1] = currL2ToL1DmaOp; } - for (auto x : l3ToL2DmaOps) { - llvm::outs() << "===> " << x << "\n"; - llvm::outs().flush(); - } - // For now pick the first two L3->L2 Dma op and try to combine them. Later - // we'll implement the selector. - //////////////////////////////////////////////// - ////////////// PICK logic TODO ///////////////// - //////////////////////////////////////////////// // Currently we have 4 cores so there are two pairs of DmaCpyNds to combine. // TODO(avarma): Revisit this later when we want to target more no. of cores. 
- if (l3ToL2DmaOps.size() != 4) { + if (l3ToL2DmaOps.size() % 2 == 0) { LLVM_DEBUG(llvm::dbgs() - << "currently only 4 L3->L2 ops are supported for combining\n"); + << "found uneven L3->L2 ops for combining\n"); return failure(); } + + auto createL2ToL1ForReuse = + [](IRRewriter &rewriter, DmaCpyNdOp &l2ToL1DmaOp, + LogicalObjectFifoFromMemrefOp &reuseL1Buffer, + LogicalObjectFifoFromMemrefOp &reuseL2Buffer, + SmallVector &newL2SourceOffsets) -> DmaCpyNdOp { + OpBuilder::InsertionGuard guard(rewriter); + rewriter.setInsertionPoint(l2ToL1DmaOp); + auto newL2ToL1DmaOp = rewriter.create( + l2ToL1DmaOp.getLoc(), reuseL1Buffer, + l2ToL1DmaOp.getTargetMixedOffsets(), l2ToL1DmaOp.getTargetMixedSizes(), + l2ToL1DmaOp.getTargetMixedStrides(), reuseL2Buffer, + llvm::ArrayRef(newL2SourceOffsets), l2ToL1DmaOp.getSourceMixedSizes(), + l2ToL1DmaOp.getSourceMixedStrides()); + rewriter.replaceOp(l2ToL1DmaOp, newL2ToL1DmaOp); + return newL2ToL1DmaOp; + }; for (unsigned i = 0, n = l3ToL2DmaOps.size(); i < n; i += 2) { - auto op = l3ToL2DmaOps[i]; - auto nextStridedOp = l3ToL2DmaOps[i + 1]; - //////////////////////////////////////////////// - /////// COMBINE the picked L3->L2 pair ///////// - //////////////////////////////////////////////// - { + // Step 1. Combine the picked L3->L2 DmaCpyNd pair. + FailureOr maybeNewL2ObjectFifo = + combineL3ToL2Pair(rewriter, l3ToL2DmaOps[i], l3ToL2DmaOps[i + 1], + splitDims, nonSplitDims); + if (failed(maybeNewL2ObjectFifo)) return failure(); + LogicalObjectFifoFromMemrefOp newL2ObjectFifo = + maybeNewL2ObjectFifo.value(); + + // Step 2. We now need to create two L2->L1 ops since the size has + // changed. But for this we first need to find the new offset for L2 as + // source. + // TODO: For now I'm hardcoding the offsets but later it'd just depend + // on split/non-split dimensions. 
+ // Offset = 0,0 + LogicalObjectFifoFromMemrefOp reuseL1LogicalObjectFifoOp = + l2ToL1DmaOps[i].getTargetObjectFifo(); + SmallVector newL2AsSourceOffsets = + l2ToL1DmaOps[i].getSourceMixedOffsets(); + DmaCpyNdOp newFirstL2ToL1DmaOp = createL2ToL1ForReuse( + rewriter, l2ToL1DmaOps[i], reuseL1LogicalObjectFifoOp, newL2ObjectFifo, + newL2AsSourceOffsets); + // Offset = 0, 1. NOTE here we'd use the same L1 logical objectFifo as + // the first L2->L1 Dma. + newL2AsSourceOffsets = l2ToL1DmaOps[i + 1].getSourceMixedOffsets(); + newL2AsSourceOffsets[1] = rewriter.getIndexAttr(1); + DmaCpyNdOp newSecondL2ToL1DmaOp = createL2ToL1ForReuse( + rewriter, l2ToL1DmaOps[i + 1], reuseL1LogicalObjectFifoOp, + newL2ObjectFifo, newL2AsSourceOffsets); + + // Step 3. PICK the CoreOps associated with the 1:1 L2->L1. + // For the first Core op we'll insert Read at the end. It doesn't matter + // for now so we're gonna insert it right before amdaie.end op. + CoreOp firstCoreOp = fetchUniqueCoreOp(newFirstL2ToL1DmaOp); + firstCoreOp.walk([&](AMDAIE::EndOp endOp) { OpBuilder::InsertionGuard guard(rewriter); - SmallVector sourceOffsetsA = op.getSourceMixedOffsets(); - SmallVector sourceSizesA = op.getSourceMixedSizes(); - SmallVector sourceStridesA = op.getSourceMixedStrides(); - SmallVector sourceOffsetsB = - nextStridedOp.getSourceMixedOffsets(); - SmallVector sourceSizesB = - nextStridedOp.getSourceMixedSizes(); - SmallVector sourceStridesB = - nextStridedOp.getSourceMixedStrides(); - bool areSourcesCombinable = true; - - SmallVector targetOffsetsA = op.getTargetMixedOffsets(); - SmallVector targetSizesA = op.getTargetMixedSizes(); - SmallVector targetStridesA = op.getTargetMixedStrides(); - SmallVector targetOffsetsB = - nextStridedOp.getTargetMixedOffsets(); - SmallVector targetSizesB = - nextStridedOp.getTargetMixedSizes(); - SmallVector targetStridesB = - nextStridedOp.getTargetMixedStrides(); - bool areTargetsCombinable = true; - - if (areSourcesCombinable && areTargetsCombinable) { - 
SmallVector newSourceOffsets = sourceOffsetsA; - SmallVector newSourceSizes = sourceSizesA; - SmallVector newSourceStrides = sourceStridesA; - if (failed(_TODOcombineAccessPatterns( - rewriter, sourceOffsetsA, sourceSizesA, sourceStridesA, - sourceOffsetsB, sourceSizesB, sourceStridesB, newSourceOffsets, - newSourceSizes, newSourceStrides))) { - return failure(); - } - llvm::outs() << "Combined sources\n"; - llvm::outs().flush(); - - SmallVector newTargetOffsets = targetOffsetsA; - SmallVector newTargetSizes = targetSizesA; - SmallVector newTargetStrides = targetStridesA; - if (failed(_TODOcombineAccessPatterns( - rewriter, targetOffsetsA, targetSizesA, targetStridesA, - targetOffsetsB, targetSizesB, targetStridesB, newTargetOffsets, - newTargetSizes, newTargetStrides))) { - return failure(); - } - llvm::outs() << "Combined target\n"; - llvm::outs().flush(); - // Now we need to create a new L2 buffer based on `newTargetSizes`. - LogicalObjectFifoFromMemrefOp oldL2ObjectFifo = - op.getTargetObjectFifo(); - AMDAIE::LogicalObjectFifoFromMemrefOp newL2ObjectFifo = - createNewLogicalObjectFifo(rewriter, oldL2ObjectFifo, - newTargetSizes); - - // Create combined L3->L2 Dma. - rewriter.setInsertionPoint(op); - auto combinedL3ToL2DmaOp = rewriter.create( - op.getLoc(), newL2ObjectFifo, llvm::ArrayRef(newTargetOffsets), - llvm::ArrayRef(newTargetSizes), llvm::ArrayRef(newTargetStrides), - op.getSource(), llvm::ArrayRef(newSourceOffsets), - llvm::ArrayRef(newSourceSizes), llvm::ArrayRef(newSourceStrides)); - // Replace the uses of 2nd L3->L2 Dma with the new combined L3->L2 Dma - // and erase the 1st L3->L2 Dma. - rewriter.replaceOp(nextStridedOp, combinedL3ToL2DmaOp); - rewriter.eraseOp(op); - - // We now have need to create two L2->L1 ops since the size has changed. - // But for this we first need to find the new offset for L2 as source. - // TODO: For now I'm hardcoding the offsets but later it'd just depend - // on - // split/non-split dimensions. 
- // Offset = 0,0 - auto firstL2ToL1DmaOp = l2ToL1DmaOps[i]; - rewriter.setInsertionPoint(firstL2ToL1DmaOp); - LogicalObjectFifoFromMemrefOp reuseL1LogicalObjectFifoOp = - firstL2ToL1DmaOp.getTargetObjectFifo(); - SmallVector newL2AsSourceOffsets = - firstL2ToL1DmaOp.getSourceMixedOffsets(); - auto newFirstL2ToL1DmaOp = rewriter.create( - firstL2ToL1DmaOp.getLoc(), reuseL1LogicalObjectFifoOp, - firstL2ToL1DmaOp.getTargetMixedOffsets(), - firstL2ToL1DmaOp.getTargetMixedSizes(), - firstL2ToL1DmaOp.getTargetMixedStrides(), newL2ObjectFifo, - llvm::ArrayRef(newL2AsSourceOffsets), - firstL2ToL1DmaOp.getSourceMixedSizes(), - firstL2ToL1DmaOp.getSourceMixedStrides()); - rewriter.replaceOp(firstL2ToL1DmaOp, newFirstL2ToL1DmaOp); - // Offset = 0, 1. NOTE here we'd use the same L1 logical objectFifo as - // the first L2->L1 Dma. - auto secondL2ToL1DmaOp = l2ToL1DmaOps[i + 1]; - rewriter.setInsertionPoint(secondL2ToL1DmaOp); - newL2AsSourceOffsets = secondL2ToL1DmaOp.getSourceMixedOffsets(); - newL2AsSourceOffsets[1] = rewriter.getIndexAttr(1); - auto newSecondL2ToL1DmaOp = rewriter.create( - secondL2ToL1DmaOp.getLoc(), reuseL1LogicalObjectFifoOp, - secondL2ToL1DmaOp.getTargetMixedOffsets(), - secondL2ToL1DmaOp.getTargetMixedSizes(), - secondL2ToL1DmaOp.getTargetMixedStrides(), newL2ObjectFifo, - llvm::ArrayRef(newL2AsSourceOffsets), - secondL2ToL1DmaOp.getSourceMixedSizes(), - secondL2ToL1DmaOp.getSourceMixedStrides()); - rewriter.replaceOp(secondL2ToL1DmaOp, newSecondL2ToL1DmaOp); - - ///////////////////////////////////////////////////////// - //// PICK the CoreOps associated with the 1:1 L2->L1 //// - ///////////////////////////////////////////////////////// - // For the first Core op we'll insert Read at the end. It doesn't matter - // for now so we're gonna insert it right before amdaie.end op. 
- CoreOp firstCoreOp = fetchUniqueCoreOp(newFirstL2ToL1DmaOp); - firstCoreOp.walk([&](AMDAIE::EndOp endOp) { - OpBuilder::InsertionGuard guard(rewriter); - // Hardcoding to `AMDAIE::MemoryAccess::Read`. - rewriter.setInsertionPoint(endOp); - rewriter.create( - rewriter.getUnknownLoc(), reuseL1LogicalObjectFifoOp.getOutput(), - AMDAIE::MemoryAccess::Read); - }); - // For the seconf Core op we'll insert Read right before the first read - // from the corresponding L1 logicalobjectFifo. - CoreOp secondCoreOp = fetchUniqueCoreOp(newSecondL2ToL1DmaOp); - secondCoreOp.walk([&](AMDAIE::LogicalObjectFifoAccessOp accessOp) { - if (accessOp.getInput() == - l2ToL1DmaOps[i + 1].getTargetObjectFifo()) { - OpBuilder::InsertionGuard guard(rewriter); - // Hardcoding to `AMDAIE::MemoryAccess::Read`. - rewriter.setInsertionPoint(accessOp); + // Hardcoding to `AMDAIE::MemoryAccess::Read`. + rewriter.setInsertionPoint(endOp); + rewriter.create( + rewriter.getUnknownLoc(), reuseL1LogicalObjectFifoOp.getOutput(), + AMDAIE::MemoryAccess::Read); + }); + // For the second Core op we'll insert `Read` right before the first read + // from the corresponding L1 logicalobjectFifo. + CoreOp secondCoreOp = fetchUniqueCoreOp(newSecondL2ToL1DmaOp); + secondCoreOp.walk([&](AMDAIE::LogicalObjectFifoAccessOp accessOp) { + if (accessOp.getInput() == l2ToL1DmaOps[i + 1].getTargetObjectFifo()) { + OpBuilder::InsertionGuard guard(rewriter); + // Hardcoding to `AMDAIE::MemoryAccess::Read`. + rewriter.setInsertionPoint(accessOp); + rewriter.create( + rewriter.getUnknownLoc(), reuseL1LogicalObjectFifoOp.getOutput(), + AMDAIE::MemoryAccess::Read); + // Need to insert the second one because THIS is what will actually + // be used. + auto secondAccessOp = rewriter.create( rewriter.getUnknownLoc(), reuseL1LogicalObjectFifoOp.getOutput(), AMDAIE::MemoryAccess::Read); - // Need to insert the second one because THIS is what will actually - // be used. 
- auto secondAccessOp = - rewriter.create( - rewriter.getUnknownLoc(), - reuseL1LogicalObjectFifoOp.getOutput(), - AMDAIE::MemoryAccess::Read); - rewriter.replaceOp(accessOp, secondAccessOp); - } - }); + rewriter.replaceOp(accessOp, secondAccessOp); } - } + }); } return success(); diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt index ba4380860..261a8068c 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt @@ -19,6 +19,7 @@ iree_lit_test_suite( "canonicalize_dma.mlir" "canonicalize_doubly_strided_op.mlir" "canonicalize_npu_dma_cpy_nd.mlir" + "combine_logicalobjfifos_for_connection_reuse.mlir" "combine_strided_ops.mlir" "controlcode_loop_unrolling.mlir" "convert_core_forall_to_for.mlir" diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/combine_logicalobjfifos_for_connection_reuse.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/combine_logicalobjfifos_for_connection_reuse.mlir new file mode 100644 index 000000000..aee2023e3 --- /dev/null +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/combine_logicalobjfifos_for_connection_reuse.mlir @@ -0,0 +1,219 @@ +// RUN: iree-opt --pass-pipeline="builtin.module(iree-amdaie-combine-logical-objectfifos-for-connection-reuse,cse)" --split-input-file --verify-diagnostics %s | FileCheck %s + +// CHECK-DAG: #map = affine_map<(d0) -> (d0 * 64)> +// CHECK-DAG: #map1 = affine_map<(d0) -> (d0 * 64 + 32)> +// CHECK: @combine_logical_objFifos +// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index +// CHECK-DAG: %[[C3:.*]] = arith.constant 3 : index +// CHECK: memref.alloc() : memref<1x2x32x32xi32, 1 : i32> +// CHECK: %[[L2_ALLOC_0:.*]] = 
memref.alloc() : memref<1x2x32x32xi32, 1 : i32> +// CHECK: %[[L2_ALLOC_1:.*]] = memref.alloc() : memref<1x2x32x32xi32, 1 : i32> +// CHECK: %[[L3_ALLOC:.*]] = memref.alloc() : memref<128x128xi32> +// CHECK-DAG: %[[L1_ALLOC:.*]] = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32> +// CHECK-DAG: %[[TILE_0:.*]] = amdaie.tile(%[[C1]], %[[C3]]) +// CHECK-DAG: %[[TILE_1:.*]] = amdaie.tile(%[[C0]], %[[C2]]) +// CHECK-DAG: %[[TILE_2:.*]] = amdaie.tile(%[[C1]], %[[C2]]) +// CHECK-DAG: %[[TILE_3:.*]] = amdaie.tile(%[[C0]], %[[C3]]) +// CHECK: %[[L2_OBJECTFIFO_0:.*]] = amdaie.logicalobjectfifo.from_memref %[[L2_ALLOC_0]], {%[[TILE_0]]} : +// CHECK-SAME: memref<1x2x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo> +// CHECK: %[[L2_OBJECTFIFO_1:.*]] = amdaie.logicalobjectfifo.from_memref %[[L2_ALLOC_1]], {%[[TILE_0]]} : +// CHECK-SAME: memref<1x2x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo> +// CHECK: %[[L3_OBJECTFIFO:.*]] = amdaie.logicalobjectfifo.from_memref %[[L3_ALLOC]], {%[[TILE_0]]} : +// CHECK-SAME: memref<128x128xi32> -> !amdaie.logicalobjectfifo> +// CHECK: scf.forall (%[[IV0:.*]], %[[IV1:.*]]) in (2, 2) +// CHECK-DAG: %[[IV1_0:.*]] = affine.apply #map(%[[IV1]]) +// CHECK-DAG: %[[IV0_0:.*]] = affine.apply #map(%[[IV0]]) +// CHECK-DAG: %[[IV0_32:.*]] = affine.apply #map1(%[[IV0]]) +// CHECK: %[[DMA_CPY_ND_L3_TO_L2_0:.*]] = amdaie.dma_cpy_nd( +// CHECK-SAME: %[[L2_OBJECTFIFO_0]][0, 0, 0, 0] [1, 2, 32, 32] [2048, 1024, 32, 1] +// CHECK-SAME: %[[L3_OBJECTFIFO]][0, 0, %[[IV0_0]], %[[IV1_0]]] [1, 2, 32, 32] [4096, 32, 128, 1] +// CHECK: %[[DMA_CPY_ND_L3_TO_L2_1:.*]] = amdaie.dma_cpy_nd( +// CHECK-SAME: %[[L2_OBJECTFIFO_1]][0, 0, 0, 0] [1, 2, 32, 32] [2048, 1024, 32, 1] +// CHECK-SAME: %[[L3_OBJECTFIFO]][0, 0, %[[IV0_32]], %[[IV1_0]]] [1, 2, 32, 32] [4096, 32, 128, 1] +// CHECK: %[[L1_OBJECTFIFO_0:.*]] = amdaie.logicalobjectfifo.from_memref %[[L1_ALLOC]], {%[[TILE_1]]} +// CHECK: %[[DMA_CPY_ND_L2_TO_L1_0:.*]] = amdaie.dma_cpy_nd( +// CHECK-SAME: %[[L1_OBJECTFIFO_0]][0, 
0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1] +// CHECK-SAME: %[[L2_OBJECTFIFO_0]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1] +// CHECK: amdaie.core(%[[TILE_1]], in : [%{{.*}}, %{{.*}}, %[[DMA_CPY_ND_L2_TO_L1_0]]], out : +// CHECK: linalg.generic +// CHECK: %[[FIRST_READ:.*]] = amdaie.logicalobjectfifo.access(%[[L1_OBJECTFIFO_0]], Read) +// CHECK: linalg.generic +// CHECK-SAME: %[[FIRST_READ]] +// CHECK: amdaie.logicalobjectfifo.access(%[[L1_OBJECTFIFO_0]], Read) +// CHECK: amdaie.end +// CHECK: } +// CHECK: %[[DMA_CPY_ND_L2_TO_L1_1:.*]] = amdaie.dma_cpy_nd( +// CHECK-SAME: %[[L1_OBJECTFIFO_0]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1] +// CHECK-SAME: %[[L2_OBJECTFIFO_0]][0, 1, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1] +// CHECK: amdaie.core(%[[TILE_0]], in : [%{{.*}}, %{{.*}}, %[[DMA_CPY_ND_L2_TO_L1_1]]], out : +// CHECK: linalg.generic +// CHECK: amdaie.logicalobjectfifo.access(%[[L1_OBJECTFIFO_0]], Read) +// CHECK: %[[SECOND_READ:.*]] = amdaie.logicalobjectfifo.access(%[[L1_OBJECTFIFO_0]], Read) +// CHECK: linalg.generic +// CHECK-SAME: %[[SECOND_READ]] +// CHECK: amdaie.end +// CHECK: } +// CHECK: %[[L1_OBJECTFIFO_1:.*]] = amdaie.logicalobjectfifo.from_memref %[[L1_ALLOC]], {%[[TILE_2]]} +// CHECK: %[[DMA_CPY_ND_L2_TO_L1_2:.*]] = amdaie.dma_cpy_nd( +// CHECK-SAME: %[[L1_OBJECTFIFO_1]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1] +// CHECK-SAME: %[[L2_OBJECTFIFO_1]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1] +// CHECK: amdaie.core(%[[TILE_2]], in : [%{{.*}}, %{{.*}}, %[[DMA_CPY_ND_L2_TO_L1_2]]], out : +// CHECK: linalg.generic +// CHECK: %[[FIRST_READ:.*]] = amdaie.logicalobjectfifo.access(%[[L1_OBJECTFIFO_1]], Read) +// CHECK: linalg.generic +// CHECK-SAME: %[[FIRST_READ]] +// CHECK: amdaie.logicalobjectfifo.access(%[[L1_OBJECTFIFO_1]], Read) +// CHECK: amdaie.end +// CHECK: } +// CHECK: %[[DMA_CPY_ND_L2_TO_L1_3:.*]] = amdaie.dma_cpy_nd( +// 
CHECK-SAME: %[[L1_OBJECTFIFO_1]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1] +// CHECK-SAME: %[[L2_OBJECTFIFO_1]][0, 1, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1] +// CHECK: amdaie.core(%[[TILE_3]], in : [%{{.*}}, %{{.*}}, %[[DMA_CPY_ND_L2_TO_L1_3]]], out : +// CHECK: linalg.generic +// CHECK: amdaie.logicalobjectfifo.access(%[[L1_OBJECTFIFO_1]], Read) +// CHECK: %[[SECOND_READ:.*]] = amdaie.logicalobjectfifo.access(%[[L1_OBJECTFIFO_1]], Read) +// CHECK: linalg.generic +// CHECK-SAME: %[[SECOND_READ]] +// CHECK: amdaie.end +// CHECK: } +#map = affine_map<(d0) -> (d0 * 64)> +#map1 = affine_map<(d0) -> (d0 * 64 + 32)> +#map2 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)> +#map3 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)> +#map4 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)> +#map5 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)> +module { + func.func @combine_logical_objFifos(%arg0: !amdaie.logicalobjectfifo>, %arg1: !amdaie.logicalobjectfifo>, %arg2: !amdaie.logicalobjectfifo>, %arg3: !amdaie.logicalobjectfifo>) { + %c3 = arith.constant 3 : index + %c2 = arith.constant 2 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index + %alloc = memref.alloc() : memref<2x1x32x32xi32, 1 : i32> + %alloc_0 = memref.alloc() : memref<1x2x32x32xi32, 1 : i32> + %alloc_1 = memref.alloc() : memref<1x1x32x32xi32, 1 : i32> + %alloc_2 = memref.alloc() : memref<1x1x32x32xi32, 1 : i32> + %alloc_3 = memref.alloc() : memref<1x1x32x32xi32, 1 : i32> + %alloc_4 = memref.alloc() : memref<1x1x32x32xi32, 1 : i32> + %alloc_5 = memref.alloc() : memref<128x128xi32> + %alloc_6 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32> + %tile = amdaie.tile(%c1, %c3) + %tile_7 = amdaie.tile(%c0, %c2) + %tile_8 = amdaie.tile(%c1, %c2) + %tile_9 = amdaie.tile(%c0, %c3) + %0 = amdaie.logicalobjectfifo.from_memref %alloc_1, {%tile} : 
memref<1x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo> + %1 = amdaie.logicalobjectfifo.from_memref %alloc_2, {%tile} : memref<1x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo> + %2 = amdaie.logicalobjectfifo.from_memref %alloc_3, {%tile} : memref<1x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo> + %3 = amdaie.logicalobjectfifo.from_memref %alloc_4, {%tile} : memref<1x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo> + %4 = amdaie.logicalobjectfifo.from_memref %alloc_5, {%tile} : memref<128x128xi32> -> !amdaie.logicalobjectfifo> + scf.forall (%arg4, %arg5) in (2, 2) { + %5 = affine.apply #map(%arg5) + %6 = affine.apply #map1(%arg5) + %7 = affine.apply #map(%arg4) + %8 = affine.apply #map1(%arg4) + %9 = amdaie.dma_cpy_nd(%0[0, 0, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1], %4[0, 0, %7, %5] [1, 1, 32, 32] [4096, 32, 128, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %10 = amdaie.dma_cpy_nd(%1[0, 0, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1], %4[0, 0, %7, %6] [1, 1, 32, 32] [4096, 32, 128, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %11 = amdaie.dma_cpy_nd(%2[0, 0, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1], %4[0, 0, %8, %5] [1, 1, 32, 32] [4096, 32, 128, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %12 = amdaie.dma_cpy_nd(%3[0, 0, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1], %4[0, 0, %8, %6] [1, 1, 32, 32] [4096, 32, 128, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %13 = amdaie.logicalobjectfifo.from_memref %alloc, {%tile} : memref<2x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo> + %14 = amdaie.logicalobjectfifo.from_memref %alloc_0, {%tile} : memref<1x2x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo> + %15 = amdaie.dma_cpy_nd(%arg0[0, 0, 0, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1024, 1024, 256, 32, 8, 1], %13[1, 0, 0, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1024, 1024, 8, 128, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %16 = amdaie.dma_cpy_nd(%arg1[0, 0, 0, 0, 0, 0] 
[1, 1, 8, 4, 8, 4] [1024, 1024, 128, 32, 4, 1], %14[0, 1, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [2048, 1024, 4, 256, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %17 = amdaie.dma_cpy_nd(%arg3[1, 1, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1], %arg2[0, 0, 0, 0] [8, 4, 8, 4] [16, 4, 128, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %18 = amdaie.logicalobjectfifo.from_memref %alloc_6, {%tile_7} : memref<1x1x8x8x4x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo> + %19 = amdaie.dma_cpy_nd(%18[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1], %0[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %20 = amdaie.core(%tile_7, in : [%15, %16, %19], out : [%17]) { + %30 = amdaie.logicalobjectfifo.access(%arg0, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x4x8x4x8xi32, 2 : i32> + %31 = amdaie.logicalobjectfifo.access(%arg1, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x8x4x8x4xi32, 2 : i32> + %32 = amdaie.logicalobjectfifo.access(%arg2, None) : !amdaie.logicalobjectfifo> -> memref<1x1x8x8x4x4xi32, 2 : i32> + linalg.generic {indexing_maps = [#map2, #map3, #map4], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%30, %31 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%32 : memref<1x1x8x8x4x4xi32, 2 : i32>) { + ^bb0(%in: i32, %in_10: i32, %out: i32): + %35 = arith.muli %in, %in_10 : i32 + %36 = arith.addi %out, %35 : i32 + linalg.yield %36 : i32 + } + %33 = amdaie.logicalobjectfifo.access(%18, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x8x8x4x4xi32, 2 : i32> + %34 = amdaie.logicalobjectfifo.access(%arg2, Write) : !amdaie.logicalobjectfifo> -> memref<1x1x8x8x4x4xi32, 2 : i32> + linalg.generic {indexing_maps = [#map5, #map5, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%32, %33 : 
memref<1x1x8x8x4x4xi32, 2 : i32>, memref<1x1x8x8x4x4xi32, 2 : i32>) outs(%34 : memref<1x1x8x8x4x4xi32, 2 : i32>) { + ^bb0(%in: i32, %in_10: i32, %out: i32): + %35 = arith.addi %in, %in_10 : i32 + linalg.yield %35 : i32 + } + amdaie.end + } + %21 = amdaie.logicalobjectfifo.from_memref %alloc_6, {%tile} : memref<1x1x8x8x4x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo> + %22 = amdaie.dma_cpy_nd(%21[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1], %1[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %23 = amdaie.core(%tile, in : [%15, %16, %22], out : [%17]) { + %30 = amdaie.logicalobjectfifo.access(%arg0, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x4x8x4x8xi32, 2 : i32> + %31 = amdaie.logicalobjectfifo.access(%arg1, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x8x4x8x4xi32, 2 : i32> + %32 = amdaie.logicalobjectfifo.access(%arg2, None) : !amdaie.logicalobjectfifo> -> memref<1x1x8x8x4x4xi32, 2 : i32> + linalg.generic {indexing_maps = [#map2, #map3, #map4], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%30, %31 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%32 : memref<1x1x8x8x4x4xi32, 2 : i32>) { + ^bb0(%in: i32, %in_10: i32, %out: i32): + %35 = arith.muli %in, %in_10 : i32 + %36 = arith.addi %out, %35 : i32 + linalg.yield %36 : i32 + } + %33 = amdaie.logicalobjectfifo.access(%21, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x8x8x4x4xi32, 2 : i32> + %34 = amdaie.logicalobjectfifo.access(%arg2, Write) : !amdaie.logicalobjectfifo> -> memref<1x1x8x8x4x4xi32, 2 : i32> + linalg.generic {indexing_maps = [#map5, #map5, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%32, %33 : memref<1x1x8x8x4x4xi32, 2 : i32>, memref<1x1x8x8x4x4xi32, 2 : i32>) outs(%34 : memref<1x1x8x8x4x4xi32, 2 : i32>) { + ^bb0(%in: i32, 
%in_10: i32, %out: i32): + %35 = arith.addi %in, %in_10 : i32 + linalg.yield %35 : i32 + } + amdaie.end + } + %24 = amdaie.logicalobjectfifo.from_memref %alloc_6, {%tile_8} : memref<1x1x8x8x4x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo> + %25 = amdaie.dma_cpy_nd(%24[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1], %2[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %26 = amdaie.core(%tile_8, in : [%15, %16, %25], out : [%17]) { + %30 = amdaie.logicalobjectfifo.access(%arg0, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x4x8x4x8xi32, 2 : i32> + %31 = amdaie.logicalobjectfifo.access(%arg1, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x8x4x8x4xi32, 2 : i32> + %32 = amdaie.logicalobjectfifo.access(%arg2, None) : !amdaie.logicalobjectfifo> -> memref<1x1x8x8x4x4xi32, 2 : i32> + linalg.generic {indexing_maps = [#map2, #map3, #map4], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%30, %31 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%32 : memref<1x1x8x8x4x4xi32, 2 : i32>) { + ^bb0(%in: i32, %in_10: i32, %out: i32): + %35 = arith.muli %in, %in_10 : i32 + %36 = arith.addi %out, %35 : i32 + linalg.yield %36 : i32 + } + %33 = amdaie.logicalobjectfifo.access(%24, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x8x8x4x4xi32, 2 : i32> + %34 = amdaie.logicalobjectfifo.access(%arg2, Write) : !amdaie.logicalobjectfifo> -> memref<1x1x8x8x4x4xi32, 2 : i32> + linalg.generic {indexing_maps = [#map5, #map5, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%32, %33 : memref<1x1x8x8x4x4xi32, 2 : i32>, memref<1x1x8x8x4x4xi32, 2 : i32>) outs(%34 : memref<1x1x8x8x4x4xi32, 2 : i32>) { + ^bb0(%in: i32, %in_10: i32, %out: i32): + %35 = arith.addi %in, %in_10 : i32 + linalg.yield %35 : i32 + } + amdaie.end + } + %27 = 
amdaie.logicalobjectfifo.from_memref %alloc_6, {%tile_9} : memref<1x1x8x8x4x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo> + %28 = amdaie.dma_cpy_nd(%27[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1], %3[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %29 = amdaie.core(%tile_9, in : [%15, %16, %28], out : [%17]) { + %30 = amdaie.logicalobjectfifo.access(%arg0, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x4x8x4x8xi32, 2 : i32> + %31 = amdaie.logicalobjectfifo.access(%arg1, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x8x4x8x4xi32, 2 : i32> + %32 = amdaie.logicalobjectfifo.access(%arg2, None) : !amdaie.logicalobjectfifo> -> memref<1x1x8x8x4x4xi32, 2 : i32> + linalg.generic {indexing_maps = [#map2, #map3, #map4], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%30, %31 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%32 : memref<1x1x8x8x4x4xi32, 2 : i32>) { + ^bb0(%in: i32, %in_10: i32, %out: i32): + %35 = arith.muli %in, %in_10 : i32 + %36 = arith.addi %out, %35 : i32 + linalg.yield %36 : i32 + } + %33 = amdaie.logicalobjectfifo.access(%27, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x8x8x4x4xi32, 2 : i32> + %34 = amdaie.logicalobjectfifo.access(%arg2, Write) : !amdaie.logicalobjectfifo> -> memref<1x1x8x8x4x4xi32, 2 : i32> + linalg.generic {indexing_maps = [#map5, #map5, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%32, %33 : memref<1x1x8x8x4x4xi32, 2 : i32>, memref<1x1x8x8x4x4xi32, 2 : i32>) outs(%34 : memref<1x1x8x8x4x4xi32, 2 : i32>) { + ^bb0(%in: i32, %in_10: i32, %out: i32): + %35 = arith.addi %in, %in_10 : i32 + linalg.yield %35 : i32 + } + amdaie.end + } + } {mapping = [#gpu.block, #gpu.block]} + memref.dealloc %alloc : memref<2x1x32x32xi32, 1 : i32> + memref.dealloc %alloc_6 : 
memref<1x1x8x8x4x4xi32, 2 : i32> + memref.dealloc %alloc_0 : memref<1x2x32x32xi32, 1 : i32> + memref.dealloc %alloc_5 : memref<128x128xi32> + memref.dealloc %alloc_1 : memref<1x1x32x32xi32, 1 : i32> + memref.dealloc %alloc_2 : memref<1x1x32x32xi32, 1 : i32> + memref.dealloc %alloc_3 : memref<1x1x32x32xi32, 1 : i32> + memref.dealloc %alloc_4 : memref<1x1x32x32xi32, 1 : i32> + return + } +}