From be4f6f9dc53f22d4794d0ae11e4ba0dd49f85ab2 Mon Sep 17 00:00:00 2001 From: Abhishek Varma Date: Mon, 9 Sep 2024 12:27:53 +0000 Subject: [PATCH] Final before PR review begins --- .../AMDAIELogicalObjFifoSplittingUtils.cpp | 360 +++++++++--------- .../Transforms/test/CMakeLists.txt | 1 + ..._logicalobjfifos_for_connection_reuse.mlir | 219 +++++++++++ 3 files changed, 410 insertions(+), 170 deletions(-) create mode 100644 compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/combine_logicalobjfifos_for_connection_reuse.mlir diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELogicalObjFifoSplittingUtils.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELogicalObjFifoSplittingUtils.cpp index 42293ff66..de00f04f8 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELogicalObjFifoSplittingUtils.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELogicalObjFifoSplittingUtils.cpp @@ -476,7 +476,25 @@ LogicalResult splitLogicalObjectFifos( return success(); } -static LogicalResult _TODOcombineAccessPatterns( +static int64_t fetchOffsetBias(OpFoldResult offsetOpFoldResult) { + std::optional offset = getConstantIntValue(offsetOpFoldResult); + if (offset) return offset.value(); + auto offsetVal = cast(offsetOpFoldResult); + auto affineApplyOp = + dyn_cast_if_present(offsetVal.getDefiningOp()); + if (!affineApplyOp) return 0; + AffineMap affineMap = affineApplyOp.getAffineMap(); + RetrieveScaleAndBias retriever; + assert(!failed(retriever.visit(affineMap.getResult(0))) && + "failed to retrieve scale and bias"); + int64_t bias = 0; + if (retriever.bias) { + bias = retriever.bias.value(); + } + return bias; +} + +static LogicalResult combineL3ToL2AccessPatterns( RewriterBase &rewriter, const SmallVector &offsetsA, const SmallVector &sizesA, const SmallVector &stridesA, @@ -484,32 +502,80 @@ static LogicalResult _TODOcombineAccessPatterns( const SmallVector &sizesB, const SmallVector 
&stridesB, SmallVector &newOffsets, SmallVector &newSizes, - SmallVector &newStrides) { - // TODO: Move these checks later in a separate func. - assert(offsetsA.size() == offsetsB.size() && - "expected same number of source offsets and target offsets"); - assert(offsetsA.size() == sizesA.size() && - "expected same number of source offsets and sizes"); - assert(offsetsA.size() == stridesA.size() && - "expected same number of source offsets and strides"); - assert(offsetsB.size() == sizesB.size() && - "expected same number of target offsets and sizes"); - assert(offsetsB.size() == stridesB.size() && - "expected same number of target offsets and strides"); - + SmallVector &newStrides, SmallVector &splitDims, + SmallVector &nonSplitDims) { if (offsetsA.empty() && offsetsB.empty()) return success(); + int64_t newSize = 1; for (auto iter : llvm::enumerate(llvm::zip(offsetsA, offsetsB))) { + if (iter.index() < splitDims.size()) continue; const OpFoldResult &offsetA = std::get<0>(iter.value()); const OpFoldResult &offsetB = std::get<1>(iter.value()); if (offsetA != offsetB) { // Need to check the difference in bias here. 
+ int64_t biasA = fetchOffsetBias(offsetA); + int64_t biasB = fetchOffsetBias(offsetB); + std::optional sizeA = getConstantIntValue(sizesA[iter.index()]); + assert(sizeA && "expected a constant integer value for size"); + assert((sizeA == biasB - biasA) && + "L3->L2 pair cannot be combined because offset is not contiguous"); + newSize++; } } - newSizes[1] = rewriter.getI64IntegerAttr(2); + newSizes[splitDims.size() - 1] = rewriter.getI64IntegerAttr(newSize); return success(); } +static FailureOr combineL3ToL2Pair( + IRRewriter &rewriter, DmaCpyNdOp dmaOpA, DmaCpyNdOp dmaOpB, + SmallVector &splitDims, SmallVector &nonSplitDims) { + OpBuilder::InsertionGuard guard(rewriter); + SmallVector sourceOffsetsA = dmaOpA.getSourceMixedOffsets(); + SmallVector sourceSizesA = dmaOpA.getSourceMixedSizes(); + SmallVector sourceStridesA = dmaOpA.getSourceMixedStrides(); + SmallVector sourceOffsetsB = dmaOpB.getSourceMixedOffsets(); + SmallVector sourceSizesB = dmaOpB.getSourceMixedSizes(); + SmallVector sourceStridesB = dmaOpB.getSourceMixedStrides(); + + SmallVector targetOffsetsA = dmaOpA.getTargetMixedOffsets(); + SmallVector targetSizesA = dmaOpA.getTargetMixedSizes(); + SmallVector targetStridesA = dmaOpA.getTargetMixedStrides(); + SmallVector targetOffsetsB = dmaOpB.getTargetMixedOffsets(); + SmallVector targetSizesB = dmaOpB.getTargetMixedSizes(); + SmallVector targetStridesB = dmaOpB.getTargetMixedStrides(); + + SmallVector newSourceOffsets = sourceOffsetsA; + SmallVector newSourceSizes = sourceSizesA; + SmallVector newSourceStrides = sourceStridesA; + if (failed(combineL3ToL2AccessPatterns( + rewriter, sourceOffsetsA, sourceSizesA, sourceStridesA, + sourceOffsetsB, sourceSizesB, sourceStridesB, newSourceOffsets, + newSourceSizes, newSourceStrides, splitDims, nonSplitDims))) { + return failure(); + } + + SmallVector newTargetOffsets = targetOffsetsA; + SmallVector newTargetSizes = newSourceSizes; + SmallVector newTargetStrides = targetStridesA; + // Now we need to create a 
new L2 buffer based on `newTargetSizes`. + LogicalObjectFifoFromMemrefOp oldL2ObjectFifo = dmaOpA.getTargetObjectFifo(); + AMDAIE::LogicalObjectFifoFromMemrefOp newL2ObjectFifo = + createNewLogicalObjectFifo(rewriter, oldL2ObjectFifo, newTargetSizes); + + // Create combined L3->L2 Dma. + rewriter.setInsertionPoint(dmaOpA); + auto combinedL3ToL2DmaOp = rewriter.create( + dmaOpA.getLoc(), newL2ObjectFifo, llvm::ArrayRef(newTargetOffsets), + llvm::ArrayRef(newTargetSizes), llvm::ArrayRef(newTargetStrides), + dmaOpA.getSource(), llvm::ArrayRef(newSourceOffsets), + llvm::ArrayRef(newSourceSizes), llvm::ArrayRef(newSourceStrides)); + // Replace the uses of 2nd L3->L2 Dma with the new combined L3->L2 Dma + // and erase the 1st L3->L2 Dma. + rewriter.replaceOp(dmaOpB, combinedL3ToL2DmaOp); + rewriter.eraseOp(dmaOpA); + return newL2ObjectFifo; +} + /// Utility to fetch a unique CoreOp associated with a L2->L1 Dma op. static CoreOp fetchUniqueCoreOp(DmaCpyNdOp &l2ToL1DmaOp) { SmallVector coreOps; @@ -580,6 +646,32 @@ static bool compareL3ToL2DmaPair(DmaCpyNdOp &a, DmaCpyNdOp &b) { return false; } +static LogicalResult checkIfSameDimensionalityAccessPatterns( + AMDAIE::DmaCpyNdOp &l3ToL2DmaOpA, AMDAIE::DmaCpyNdOp &l3ToL2DmaOpB) { + SmallVector sourceOffsetsA = + l3ToL2DmaOpA.getSourceMixedOffsets(); + SmallVector sourceSizesA = l3ToL2DmaOpA.getSourceMixedSizes(); + SmallVector sourceStridesA = + l3ToL2DmaOpA.getSourceMixedStrides(); + SmallVector sourceOffsetsB = + l3ToL2DmaOpB.getSourceMixedOffsets(); + SmallVector sourceSizesB = l3ToL2DmaOpB.getSourceMixedSizes(); + SmallVector sourceStridesB = + l3ToL2DmaOpB.getSourceMixedStrides(); + if (sourceOffsetsA.size() != sourceOffsetsB.size() || + sourceSizesA.size() != sourceSizesB.size() || + sourceStridesA.size() != sourceStridesB.size() || + sourceOffsetsA.size() != sourceSizesA.size() || + sourceOffsetsA.size() != sourceStridesB.size()) { + return failure(); + } + return success(); +} + +/// Given a vector of L2->L1 Dma Ops, 
combine the corresponding L3->L2 Dma Ops +/// and reuse the L2/L1 buffers. +/// TODO(avarma): Assign combined tiles while forming L2/L1 buffers which we'll +/// reuse. LogicalResult combineLogicalObjectFifos( IRRewriter &rewriter, SmallVector &l2ToL1DmaOps, MLIRContext *context) { @@ -605,6 +697,12 @@ LogicalResult combineLogicalObjectFifos( << " and " << l3ToL2DmaOps[i] << "\n"); return failure(); } + if (failed(checkIfSameDimensionalityAccessPatterns(l3ToL2DmaOps[0], + l3ToL2DmaOps[i]))) { + LLVM_DEBUG(llvm::dbgs() + << "Found different dimensionality of access patterns\n"); + return failure(); + } } if (l2ToL1DmaOps.size() != l3ToL2DmaOps.size()) { @@ -647,9 +745,9 @@ LogicalResult combineLogicalObjectFifos( return failure(); } } - SmallVector splitDims(maxSplitDimIndex + 1); + SmallVector splitDims(maxSplitDimIndex); std::iota(splitDims.begin(), splitDims.end(), 0); - SmallVector nonSplitDims(maxSplitDimIndex + 1); + SmallVector nonSplitDims(maxSplitDimIndex); std::iota(nonSplitDims.begin(), nonSplitDims.end(), splitDims.size()); // At this point it's nice to perhaps just sort the L3->L2 Dma ops based on @@ -668,171 +766,93 @@ LogicalResult combineLogicalObjectFifos( l2ToL1DmaOps[j + 1] = currL2ToL1DmaOp; } - for (auto x : l3ToL2DmaOps) { - llvm::outs() << "===> " << x << "\n"; - llvm::outs().flush(); - } - // For now pick the first two L3->L2 Dma op and try to combine them. Later - // we'll implement the selector. - //////////////////////////////////////////////// - ////////////// PICK logic TODO ///////////////// - //////////////////////////////////////////////// // Currently we have 4 cores so there are two pairs of DmaCpyNds to combine. // TODO(avarma): Revisit this later when we want to target more no. of cores. 
- if (l3ToL2DmaOps.size() != 4) { + if (l3ToL2DmaOps.size() % 2 == 0) { LLVM_DEBUG(llvm::dbgs() - << "currently only 4 L3->L2 ops are supported for combining\n"); + << "found uneven L3->L2 ops for combining\n"); return failure(); } + + auto createL2ToL1ForReuse = + [](IRRewriter &rewriter, DmaCpyNdOp &l2ToL1DmaOp, + LogicalObjectFifoFromMemrefOp &reuseL1Buffer, + LogicalObjectFifoFromMemrefOp &reuseL2Buffer, + SmallVector &newL2SourceOffsets) -> DmaCpyNdOp { + OpBuilder::InsertionGuard guard(rewriter); + rewriter.setInsertionPoint(l2ToL1DmaOp); + auto newL2ToL1DmaOp = rewriter.create( + l2ToL1DmaOp.getLoc(), reuseL1Buffer, + l2ToL1DmaOp.getTargetMixedOffsets(), l2ToL1DmaOp.getTargetMixedSizes(), + l2ToL1DmaOp.getTargetMixedStrides(), reuseL2Buffer, + llvm::ArrayRef(newL2SourceOffsets), l2ToL1DmaOp.getSourceMixedSizes(), + l2ToL1DmaOp.getSourceMixedStrides()); + rewriter.replaceOp(l2ToL1DmaOp, newL2ToL1DmaOp); + return newL2ToL1DmaOp; + }; for (unsigned i = 0, n = l3ToL2DmaOps.size(); i < n; i += 2) { - auto op = l3ToL2DmaOps[i]; - auto nextStridedOp = l3ToL2DmaOps[i + 1]; - //////////////////////////////////////////////// - /////// COMBINE the picked L3->L2 pair ///////// - //////////////////////////////////////////////// - { + // Step 1. Combine the picked L3->L2 DmaCpyNd pair. + FailureOr maybeNewL2ObjectFifo = + combineL3ToL2Pair(rewriter, l3ToL2DmaOps[i], l3ToL2DmaOps[i + 1], + splitDims, nonSplitDims); + if (failed(maybeNewL2ObjectFifo)) return failure(); + LogicalObjectFifoFromMemrefOp newL2ObjectFifo = + maybeNewL2ObjectFifo.value(); + + // Step 2. We now need to create two L2->L1 ops since the size has + // changed. But for this we first need to find the new offset for L2 as + // source. + // TODO: For now I'm hardcoding the offsets but later it'd just depend + // on split/non-split dimensions. 
+ // Offset = 0,0 + LogicalObjectFifoFromMemrefOp reuseL1LogicalObjectFifoOp = + l2ToL1DmaOps[i].getTargetObjectFifo(); + SmallVector newL2AsSourceOffsets = + l2ToL1DmaOps[i].getSourceMixedOffsets(); + DmaCpyNdOp newFirstL2ToL1DmaOp = createL2ToL1ForReuse( + rewriter, l2ToL1DmaOps[i], reuseL1LogicalObjectFifoOp, newL2ObjectFifo, + newL2AsSourceOffsets); + // Offset = 0, 1. NOTE here we'd use the same L1 logical objectFifo as + // the first L2->L1 Dma. + newL2AsSourceOffsets = l2ToL1DmaOps[i + 1].getSourceMixedOffsets(); + newL2AsSourceOffsets[1] = rewriter.getIndexAttr(1); + DmaCpyNdOp newSecondL2ToL1DmaOp = createL2ToL1ForReuse( + rewriter, l2ToL1DmaOps[i + 1], reuseL1LogicalObjectFifoOp, + newL2ObjectFifo, newL2AsSourceOffsets); + + // Step 3. PICK the CoreOps associated with the 1:1 L2->L1. + // For the first Core op we'll insert Read at the end. It doesn't matter + // for now so we're gonna insert it right before amdaie.end op. + CoreOp firstCoreOp = fetchUniqueCoreOp(newFirstL2ToL1DmaOp); + firstCoreOp.walk([&](AMDAIE::EndOp endOp) { OpBuilder::InsertionGuard guard(rewriter); - SmallVector sourceOffsetsA = op.getSourceMixedOffsets(); - SmallVector sourceSizesA = op.getSourceMixedSizes(); - SmallVector sourceStridesA = op.getSourceMixedStrides(); - SmallVector sourceOffsetsB = - nextStridedOp.getSourceMixedOffsets(); - SmallVector sourceSizesB = - nextStridedOp.getSourceMixedSizes(); - SmallVector sourceStridesB = - nextStridedOp.getSourceMixedStrides(); - bool areSourcesCombinable = true; - - SmallVector targetOffsetsA = op.getTargetMixedOffsets(); - SmallVector targetSizesA = op.getTargetMixedSizes(); - SmallVector targetStridesA = op.getTargetMixedStrides(); - SmallVector targetOffsetsB = - nextStridedOp.getTargetMixedOffsets(); - SmallVector targetSizesB = - nextStridedOp.getTargetMixedSizes(); - SmallVector targetStridesB = - nextStridedOp.getTargetMixedStrides(); - bool areTargetsCombinable = true; - - if (areSourcesCombinable && areTargetsCombinable) { - 
SmallVector newSourceOffsets = sourceOffsetsA; - SmallVector newSourceSizes = sourceSizesA; - SmallVector newSourceStrides = sourceStridesA; - if (failed(_TODOcombineAccessPatterns( - rewriter, sourceOffsetsA, sourceSizesA, sourceStridesA, - sourceOffsetsB, sourceSizesB, sourceStridesB, newSourceOffsets, - newSourceSizes, newSourceStrides))) { - return failure(); - } - llvm::outs() << "Combined sources\n"; - llvm::outs().flush(); - - SmallVector newTargetOffsets = targetOffsetsA; - SmallVector newTargetSizes = targetSizesA; - SmallVector newTargetStrides = targetStridesA; - if (failed(_TODOcombineAccessPatterns( - rewriter, targetOffsetsA, targetSizesA, targetStridesA, - targetOffsetsB, targetSizesB, targetStridesB, newTargetOffsets, - newTargetSizes, newTargetStrides))) { - return failure(); - } - llvm::outs() << "Combined target\n"; - llvm::outs().flush(); - // Now we need to create a new L2 buffer based on `newTargetSizes`. - LogicalObjectFifoFromMemrefOp oldL2ObjectFifo = - op.getTargetObjectFifo(); - AMDAIE::LogicalObjectFifoFromMemrefOp newL2ObjectFifo = - createNewLogicalObjectFifo(rewriter, oldL2ObjectFifo, - newTargetSizes); - - // Create combined L3->L2 Dma. - rewriter.setInsertionPoint(op); - auto combinedL3ToL2DmaOp = rewriter.create( - op.getLoc(), newL2ObjectFifo, llvm::ArrayRef(newTargetOffsets), - llvm::ArrayRef(newTargetSizes), llvm::ArrayRef(newTargetStrides), - op.getSource(), llvm::ArrayRef(newSourceOffsets), - llvm::ArrayRef(newSourceSizes), llvm::ArrayRef(newSourceStrides)); - // Replace the uses of 2nd L3->L2 Dma with the new combined L3->L2 Dma - // and erase the 1st L3->L2 Dma. - rewriter.replaceOp(nextStridedOp, combinedL3ToL2DmaOp); - rewriter.eraseOp(op); - - // We now have need to create two L2->L1 ops since the size has changed. - // But for this we first need to find the new offset for L2 as source. - // TODO: For now I'm hardcoding the offsets but later it'd just depend - // on - // split/non-split dimensions. 
- // Offset = 0,0 - auto firstL2ToL1DmaOp = l2ToL1DmaOps[i]; - rewriter.setInsertionPoint(firstL2ToL1DmaOp); - LogicalObjectFifoFromMemrefOp reuseL1LogicalObjectFifoOp = - firstL2ToL1DmaOp.getTargetObjectFifo(); - SmallVector newL2AsSourceOffsets = - firstL2ToL1DmaOp.getSourceMixedOffsets(); - auto newFirstL2ToL1DmaOp = rewriter.create( - firstL2ToL1DmaOp.getLoc(), reuseL1LogicalObjectFifoOp, - firstL2ToL1DmaOp.getTargetMixedOffsets(), - firstL2ToL1DmaOp.getTargetMixedSizes(), - firstL2ToL1DmaOp.getTargetMixedStrides(), newL2ObjectFifo, - llvm::ArrayRef(newL2AsSourceOffsets), - firstL2ToL1DmaOp.getSourceMixedSizes(), - firstL2ToL1DmaOp.getSourceMixedStrides()); - rewriter.replaceOp(firstL2ToL1DmaOp, newFirstL2ToL1DmaOp); - // Offset = 0, 1. NOTE here we'd use the same L1 logical objectFifo as - // the first L2->L1 Dma. - auto secondL2ToL1DmaOp = l2ToL1DmaOps[i + 1]; - rewriter.setInsertionPoint(secondL2ToL1DmaOp); - newL2AsSourceOffsets = secondL2ToL1DmaOp.getSourceMixedOffsets(); - newL2AsSourceOffsets[1] = rewriter.getIndexAttr(1); - auto newSecondL2ToL1DmaOp = rewriter.create( - secondL2ToL1DmaOp.getLoc(), reuseL1LogicalObjectFifoOp, - secondL2ToL1DmaOp.getTargetMixedOffsets(), - secondL2ToL1DmaOp.getTargetMixedSizes(), - secondL2ToL1DmaOp.getTargetMixedStrides(), newL2ObjectFifo, - llvm::ArrayRef(newL2AsSourceOffsets), - secondL2ToL1DmaOp.getSourceMixedSizes(), - secondL2ToL1DmaOp.getSourceMixedStrides()); - rewriter.replaceOp(secondL2ToL1DmaOp, newSecondL2ToL1DmaOp); - - ///////////////////////////////////////////////////////// - //// PICK the CoreOps associated with the 1:1 L2->L1 //// - ///////////////////////////////////////////////////////// - // For the first Core op we'll insert Read at the end. It doesn't matter - // for now so we're gonna insert it right before amdaie.end op. 
- CoreOp firstCoreOp = fetchUniqueCoreOp(newFirstL2ToL1DmaOp); - firstCoreOp.walk([&](AMDAIE::EndOp endOp) { - OpBuilder::InsertionGuard guard(rewriter); - // Hardcoding to `AMDAIE::MemoryAccess::Read`. - rewriter.setInsertionPoint(endOp); - rewriter.create( - rewriter.getUnknownLoc(), reuseL1LogicalObjectFifoOp.getOutput(), - AMDAIE::MemoryAccess::Read); - }); - // For the seconf Core op we'll insert Read right before the first read - // from the corresponding L1 logicalobjectFifo. - CoreOp secondCoreOp = fetchUniqueCoreOp(newSecondL2ToL1DmaOp); - secondCoreOp.walk([&](AMDAIE::LogicalObjectFifoAccessOp accessOp) { - if (accessOp.getInput() == - l2ToL1DmaOps[i + 1].getTargetObjectFifo()) { - OpBuilder::InsertionGuard guard(rewriter); - // Hardcoding to `AMDAIE::MemoryAccess::Read`. - rewriter.setInsertionPoint(accessOp); + // Hardcoding to `AMDAIE::MemoryAccess::Read`. + rewriter.setInsertionPoint(endOp); + rewriter.create( + rewriter.getUnknownLoc(), reuseL1LogicalObjectFifoOp.getOutput(), + AMDAIE::MemoryAccess::Read); + }); + // For the second Core op we'll insert `Read` right before the first read + // from the corresponding L1 logicalobjectFifo. + CoreOp secondCoreOp = fetchUniqueCoreOp(newSecondL2ToL1DmaOp); + secondCoreOp.walk([&](AMDAIE::LogicalObjectFifoAccessOp accessOp) { + if (accessOp.getInput() == l2ToL1DmaOps[i + 1].getTargetObjectFifo()) { + OpBuilder::InsertionGuard guard(rewriter); + // Hardcoding to `AMDAIE::MemoryAccess::Read`. + rewriter.setInsertionPoint(accessOp); + rewriter.create( + rewriter.getUnknownLoc(), reuseL1LogicalObjectFifoOp.getOutput(), + AMDAIE::MemoryAccess::Read); + // Need to insert the second one because THIS is what will actually + // be used. + auto secondAccessOp = rewriter.create( rewriter.getUnknownLoc(), reuseL1LogicalObjectFifoOp.getOutput(), AMDAIE::MemoryAccess::Read); - // Need to insert the second one because THIS is what will actually - // be used. 
- auto secondAccessOp = - rewriter.create( - rewriter.getUnknownLoc(), - reuseL1LogicalObjectFifoOp.getOutput(), - AMDAIE::MemoryAccess::Read); - rewriter.replaceOp(accessOp, secondAccessOp); - } - }); + rewriter.replaceOp(accessOp, secondAccessOp); } - } + }); } return success(); diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt index ba4380860..261a8068c 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt @@ -19,6 +19,7 @@ iree_lit_test_suite( "canonicalize_dma.mlir" "canonicalize_doubly_strided_op.mlir" "canonicalize_npu_dma_cpy_nd.mlir" + "combine_logicalobjfifos_for_connection_reuse.mlir" "combine_strided_ops.mlir" "controlcode_loop_unrolling.mlir" "convert_core_forall_to_for.mlir" diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/combine_logicalobjfifos_for_connection_reuse.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/combine_logicalobjfifos_for_connection_reuse.mlir new file mode 100644 index 000000000..aee2023e3 --- /dev/null +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/combine_logicalobjfifos_for_connection_reuse.mlir @@ -0,0 +1,219 @@ +// RUN: iree-opt --pass-pipeline="builtin.module(iree-amdaie-combine-logical-objectfifos-for-connection-reuse,cse)" --split-input-file --verify-diagnostics %s | FileCheck %s + +// CHECK-DAG: #map = affine_map<(d0) -> (d0 * 64)> +// CHECK-DAG: #map1 = affine_map<(d0) -> (d0 * 64 + 32)> +// CHECK: @combine_logical_objFifos +// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index +// CHECK-DAG: %[[C3:.*]] = arith.constant 3 : index +// CHECK: memref.alloc() : memref<1x2x32x32xi32, 1 : i32> +// CHECK: %[[L2_ALLOC_0:.*]] = 
memref.alloc() : memref<1x2x32x32xi32, 1 : i32> +// CHECK: %[[L2_ALLOC_1:.*]] = memref.alloc() : memref<1x2x32x32xi32, 1 : i32> +// CHECK: %[[L3_ALLOC:.*]] = memref.alloc() : memref<128x128xi32> +// CHECK-DAG: %[[L1_ALLOC:.*]] = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32> +// CHECK-DAG: %[[TILE_0:.*]] = amdaie.tile(%[[C1]], %[[C3]]) +// CHECK-DAG: %[[TILE_1:.*]] = amdaie.tile(%[[C0]], %[[C2]]) +// CHECK-DAG: %[[TILE_2:.*]] = amdaie.tile(%[[C1]], %[[C2]]) +// CHECK-DAG: %[[TILE_3:.*]] = amdaie.tile(%[[C0]], %[[C3]]) +// CHECK: %[[L2_OBJECTFIFO_0:.*]] = amdaie.logicalobjectfifo.from_memref %[[L2_ALLOC_0]], {%[[TILE_0]]} : +// CHECK-SAME: memref<1x2x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo> +// CHECK: %[[L2_OBJECTFIFO_1:.*]] = amdaie.logicalobjectfifo.from_memref %[[L2_ALLOC_1]], {%[[TILE_0]]} : +// CHECK-SAME: memref<1x2x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo> +// CHECK: %[[L3_OBJECTFIFO:.*]] = amdaie.logicalobjectfifo.from_memref %[[L3_ALLOC]], {%[[TILE_0]]} : +// CHECK-SAME: memref<128x128xi32> -> !amdaie.logicalobjectfifo> +// CHECK: scf.forall (%[[IV0:.*]], %[[IV1:.*]]) in (2, 2) +// CHECK-DAG: %[[IV1_0:.*]] = affine.apply #map(%[[IV1]]) +// CHECK-DAG: %[[IV0_0:.*]] = affine.apply #map(%[[IV0]]) +// CHECK-DAG: %[[IV0_32:.*]] = affine.apply #map1(%[[IV0]]) +// CHECK: %[[DMA_CPY_ND_L3_TO_L2_0:.*]] = amdaie.dma_cpy_nd( +// CHECK-SAME: %[[L2_OBJECTFIFO_0]][0, 0, 0, 0] [1, 2, 32, 32] [2048, 1024, 32, 1] +// CHECK-SAME: %[[L3_OBJECTFIFO]][0, 0, %[[IV0_0]], %[[IV1_0]]] [1, 2, 32, 32] [4096, 32, 128, 1] +// CHECK: %[[DMA_CPY_ND_L3_TO_L2_1:.*]] = amdaie.dma_cpy_nd( +// CHECK-SAME: %[[L2_OBJECTFIFO_1]][0, 0, 0, 0] [1, 2, 32, 32] [2048, 1024, 32, 1] +// CHECK-SAME: %[[L3_OBJECTFIFO]][0, 0, %[[IV0_32]], %[[IV1_0]]] [1, 2, 32, 32] [4096, 32, 128, 1] +// CHECK: %[[L1_OBJECTFIFO_0:.*]] = amdaie.logicalobjectfifo.from_memref %[[L1_ALLOC]], {%[[TILE_1]]} +// CHECK: %[[DMA_CPY_ND_L2_TO_L1_0:.*]] = amdaie.dma_cpy_nd( +// CHECK-SAME: %[[L1_OBJECTFIFO_0]][0, 
0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1] +// CHECK-SAME: %[[L2_OBJECTFIFO_0]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1] +// CHECK: amdaie.core(%[[TILE_1]], in : [%{{.*}}, %{{.*}}, %[[DMA_CPY_ND_L2_TO_L1_0]]], out : +// CHECK: linalg.generic +// CHECK: %[[FIRST_READ:.*]] = amdaie.logicalobjectfifo.access(%[[L1_OBJECTFIFO_0]], Read) +// CHECK: linalg.generic +// CHECK-SAME: %[[FIRST_READ]] +// CHECK: amdaie.logicalobjectfifo.access(%[[L1_OBJECTFIFO_0]], Read) +// CHECK: amdaie.end +// CHECK: } +// CHECK: %[[DMA_CPY_ND_L2_TO_L1_1:.*]] = amdaie.dma_cpy_nd( +// CHECK-SAME: %[[L1_OBJECTFIFO_0]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1] +// CHECK-SAME: %[[L2_OBJECTFIFO_0]][0, 1, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1] +// CHECK: amdaie.core(%[[TILE_0]], in : [%{{.*}}, %{{.*}}, %[[DMA_CPY_ND_L2_TO_L1_1]]], out : +// CHECK: linalg.generic +// CHECK: amdaie.logicalobjectfifo.access(%[[L1_OBJECTFIFO_0]], Read) +// CHECK: %[[SECOND_READ:.*]] = amdaie.logicalobjectfifo.access(%[[L1_OBJECTFIFO_0]], Read) +// CHECK: linalg.generic +// CHECK-SAME: %[[SECOND_READ]] +// CHECK: amdaie.end +// CHECK: } +// CHECK: %[[L1_OBJECTFIFO_1:.*]] = amdaie.logicalobjectfifo.from_memref %[[L1_ALLOC]], {%[[TILE_2]]} +// CHECK: %[[DMA_CPY_ND_L2_TO_L1_2:.*]] = amdaie.dma_cpy_nd( +// CHECK-SAME: %[[L1_OBJECTFIFO_1]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1] +// CHECK-SAME: %[[L2_OBJECTFIFO_1]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1] +// CHECK: amdaie.core(%[[TILE_2]], in : [%{{.*}}, %{{.*}}, %[[DMA_CPY_ND_L2_TO_L1_2]]], out : +// CHECK: linalg.generic +// CHECK: %[[FIRST_READ:.*]] = amdaie.logicalobjectfifo.access(%[[L1_OBJECTFIFO_1]], Read) +// CHECK: linalg.generic +// CHECK-SAME: %[[FIRST_READ]] +// CHECK: amdaie.logicalobjectfifo.access(%[[L1_OBJECTFIFO_1]], Read) +// CHECK: amdaie.end +// CHECK: } +// CHECK: %[[DMA_CPY_ND_L2_TO_L1_3:.*]] = amdaie.dma_cpy_nd( +// 
CHECK-SAME: %[[L1_OBJECTFIFO_1]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1] +// CHECK-SAME: %[[L2_OBJECTFIFO_1]][0, 1, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1] +// CHECK: amdaie.core(%[[TILE_3]], in : [%{{.*}}, %{{.*}}, %[[DMA_CPY_ND_L2_TO_L1_3]]], out : +// CHECK: linalg.generic +// CHECK: amdaie.logicalobjectfifo.access(%[[L1_OBJECTFIFO_1]], Read) +// CHECK: %[[SECOND_READ:.*]] = amdaie.logicalobjectfifo.access(%[[L1_OBJECTFIFO_1]], Read) +// CHECK: linalg.generic +// CHECK-SAME: %[[SECOND_READ]] +// CHECK: amdaie.end +// CHECK: } +#map = affine_map<(d0) -> (d0 * 64)> +#map1 = affine_map<(d0) -> (d0 * 64 + 32)> +#map2 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)> +#map3 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)> +#map4 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)> +#map5 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)> +module { + func.func @combine_logical_objFifos(%arg0: !amdaie.logicalobjectfifo>, %arg1: !amdaie.logicalobjectfifo>, %arg2: !amdaie.logicalobjectfifo>, %arg3: !amdaie.logicalobjectfifo>) { + %c3 = arith.constant 3 : index + %c2 = arith.constant 2 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index + %alloc = memref.alloc() : memref<2x1x32x32xi32, 1 : i32> + %alloc_0 = memref.alloc() : memref<1x2x32x32xi32, 1 : i32> + %alloc_1 = memref.alloc() : memref<1x1x32x32xi32, 1 : i32> + %alloc_2 = memref.alloc() : memref<1x1x32x32xi32, 1 : i32> + %alloc_3 = memref.alloc() : memref<1x1x32x32xi32, 1 : i32> + %alloc_4 = memref.alloc() : memref<1x1x32x32xi32, 1 : i32> + %alloc_5 = memref.alloc() : memref<128x128xi32> + %alloc_6 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32> + %tile = amdaie.tile(%c1, %c3) + %tile_7 = amdaie.tile(%c0, %c2) + %tile_8 = amdaie.tile(%c1, %c2) + %tile_9 = amdaie.tile(%c0, %c3) + %0 = amdaie.logicalobjectfifo.from_memref %alloc_1, {%tile} : 
memref<1x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo> + %1 = amdaie.logicalobjectfifo.from_memref %alloc_2, {%tile} : memref<1x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo> + %2 = amdaie.logicalobjectfifo.from_memref %alloc_3, {%tile} : memref<1x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo> + %3 = amdaie.logicalobjectfifo.from_memref %alloc_4, {%tile} : memref<1x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo> + %4 = amdaie.logicalobjectfifo.from_memref %alloc_5, {%tile} : memref<128x128xi32> -> !amdaie.logicalobjectfifo> + scf.forall (%arg4, %arg5) in (2, 2) { + %5 = affine.apply #map(%arg5) + %6 = affine.apply #map1(%arg5) + %7 = affine.apply #map(%arg4) + %8 = affine.apply #map1(%arg4) + %9 = amdaie.dma_cpy_nd(%0[0, 0, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1], %4[0, 0, %7, %5] [1, 1, 32, 32] [4096, 32, 128, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %10 = amdaie.dma_cpy_nd(%1[0, 0, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1], %4[0, 0, %7, %6] [1, 1, 32, 32] [4096, 32, 128, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %11 = amdaie.dma_cpy_nd(%2[0, 0, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1], %4[0, 0, %8, %5] [1, 1, 32, 32] [4096, 32, 128, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %12 = amdaie.dma_cpy_nd(%3[0, 0, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1], %4[0, 0, %8, %6] [1, 1, 32, 32] [4096, 32, 128, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %13 = amdaie.logicalobjectfifo.from_memref %alloc, {%tile} : memref<2x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo> + %14 = amdaie.logicalobjectfifo.from_memref %alloc_0, {%tile} : memref<1x2x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo> + %15 = amdaie.dma_cpy_nd(%arg0[0, 0, 0, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1024, 1024, 256, 32, 8, 1], %13[1, 0, 0, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1024, 1024, 8, 128, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %16 = amdaie.dma_cpy_nd(%arg1[0, 0, 0, 0, 0, 0] 
[1, 1, 8, 4, 8, 4] [1024, 1024, 128, 32, 4, 1], %14[0, 1, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [2048, 1024, 4, 256, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %17 = amdaie.dma_cpy_nd(%arg3[1, 1, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1], %arg2[0, 0, 0, 0] [8, 4, 8, 4] [16, 4, 128, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %18 = amdaie.logicalobjectfifo.from_memref %alloc_6, {%tile_7} : memref<1x1x8x8x4x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo> + %19 = amdaie.dma_cpy_nd(%18[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1], %0[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %20 = amdaie.core(%tile_7, in : [%15, %16, %19], out : [%17]) { + %30 = amdaie.logicalobjectfifo.access(%arg0, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x4x8x4x8xi32, 2 : i32> + %31 = amdaie.logicalobjectfifo.access(%arg1, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x8x4x8x4xi32, 2 : i32> + %32 = amdaie.logicalobjectfifo.access(%arg2, None) : !amdaie.logicalobjectfifo> -> memref<1x1x8x8x4x4xi32, 2 : i32> + linalg.generic {indexing_maps = [#map2, #map3, #map4], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%30, %31 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%32 : memref<1x1x8x8x4x4xi32, 2 : i32>) { + ^bb0(%in: i32, %in_10: i32, %out: i32): + %35 = arith.muli %in, %in_10 : i32 + %36 = arith.addi %out, %35 : i32 + linalg.yield %36 : i32 + } + %33 = amdaie.logicalobjectfifo.access(%18, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x8x8x4x4xi32, 2 : i32> + %34 = amdaie.logicalobjectfifo.access(%arg2, Write) : !amdaie.logicalobjectfifo> -> memref<1x1x8x8x4x4xi32, 2 : i32> + linalg.generic {indexing_maps = [#map5, #map5, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%32, %33 : 
memref<1x1x8x8x4x4xi32, 2 : i32>, memref<1x1x8x8x4x4xi32, 2 : i32>) outs(%34 : memref<1x1x8x8x4x4xi32, 2 : i32>) { + ^bb0(%in: i32, %in_10: i32, %out: i32): + %35 = arith.addi %in, %in_10 : i32 + linalg.yield %35 : i32 + } + amdaie.end + } + %21 = amdaie.logicalobjectfifo.from_memref %alloc_6, {%tile} : memref<1x1x8x8x4x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo> + %22 = amdaie.dma_cpy_nd(%21[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1], %1[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %23 = amdaie.core(%tile, in : [%15, %16, %22], out : [%17]) { + %30 = amdaie.logicalobjectfifo.access(%arg0, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x4x8x4x8xi32, 2 : i32> + %31 = amdaie.logicalobjectfifo.access(%arg1, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x8x4x8x4xi32, 2 : i32> + %32 = amdaie.logicalobjectfifo.access(%arg2, None) : !amdaie.logicalobjectfifo> -> memref<1x1x8x8x4x4xi32, 2 : i32> + linalg.generic {indexing_maps = [#map2, #map3, #map4], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%30, %31 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%32 : memref<1x1x8x8x4x4xi32, 2 : i32>) { + ^bb0(%in: i32, %in_10: i32, %out: i32): + %35 = arith.muli %in, %in_10 : i32 + %36 = arith.addi %out, %35 : i32 + linalg.yield %36 : i32 + } + %33 = amdaie.logicalobjectfifo.access(%21, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x8x8x4x4xi32, 2 : i32> + %34 = amdaie.logicalobjectfifo.access(%arg2, Write) : !amdaie.logicalobjectfifo> -> memref<1x1x8x8x4x4xi32, 2 : i32> + linalg.generic {indexing_maps = [#map5, #map5, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%32, %33 : memref<1x1x8x8x4x4xi32, 2 : i32>, memref<1x1x8x8x4x4xi32, 2 : i32>) outs(%34 : memref<1x1x8x8x4x4xi32, 2 : i32>) { + ^bb0(%in: i32, 
%in_10: i32, %out: i32): + %35 = arith.addi %in, %in_10 : i32 + linalg.yield %35 : i32 + } + amdaie.end + } + %24 = amdaie.logicalobjectfifo.from_memref %alloc_6, {%tile_8} : memref<1x1x8x8x4x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo> + %25 = amdaie.dma_cpy_nd(%24[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1], %2[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %26 = amdaie.core(%tile_8, in : [%15, %16, %25], out : [%17]) { + %30 = amdaie.logicalobjectfifo.access(%arg0, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x4x8x4x8xi32, 2 : i32> + %31 = amdaie.logicalobjectfifo.access(%arg1, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x8x4x8x4xi32, 2 : i32> + %32 = amdaie.logicalobjectfifo.access(%arg2, None) : !amdaie.logicalobjectfifo> -> memref<1x1x8x8x4x4xi32, 2 : i32> + linalg.generic {indexing_maps = [#map2, #map3, #map4], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%30, %31 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%32 : memref<1x1x8x8x4x4xi32, 2 : i32>) { + ^bb0(%in: i32, %in_10: i32, %out: i32): + %35 = arith.muli %in, %in_10 : i32 + %36 = arith.addi %out, %35 : i32 + linalg.yield %36 : i32 + } + %33 = amdaie.logicalobjectfifo.access(%24, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x8x8x4x4xi32, 2 : i32> + %34 = amdaie.logicalobjectfifo.access(%arg2, Write) : !amdaie.logicalobjectfifo> -> memref<1x1x8x8x4x4xi32, 2 : i32> + linalg.generic {indexing_maps = [#map5, #map5, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%32, %33 : memref<1x1x8x8x4x4xi32, 2 : i32>, memref<1x1x8x8x4x4xi32, 2 : i32>) outs(%34 : memref<1x1x8x8x4x4xi32, 2 : i32>) { + ^bb0(%in: i32, %in_10: i32, %out: i32): + %35 = arith.addi %in, %in_10 : i32 + linalg.yield %35 : i32 + } + amdaie.end + } + %27 = 
amdaie.logicalobjectfifo.from_memref %alloc_6, {%tile_9} : memref<1x1x8x8x4x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo> + %28 = amdaie.dma_cpy_nd(%27[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1], %3[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %29 = amdaie.core(%tile_9, in : [%15, %16, %28], out : [%17]) { + %30 = amdaie.logicalobjectfifo.access(%arg0, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x4x8x4x8xi32, 2 : i32> + %31 = amdaie.logicalobjectfifo.access(%arg1, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x8x4x8x4xi32, 2 : i32> + %32 = amdaie.logicalobjectfifo.access(%arg2, None) : !amdaie.logicalobjectfifo> -> memref<1x1x8x8x4x4xi32, 2 : i32> + linalg.generic {indexing_maps = [#map2, #map3, #map4], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%30, %31 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%32 : memref<1x1x8x8x4x4xi32, 2 : i32>) { + ^bb0(%in: i32, %in_10: i32, %out: i32): + %35 = arith.muli %in, %in_10 : i32 + %36 = arith.addi %out, %35 : i32 + linalg.yield %36 : i32 + } + %33 = amdaie.logicalobjectfifo.access(%27, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x8x8x4x4xi32, 2 : i32> + %34 = amdaie.logicalobjectfifo.access(%arg2, Write) : !amdaie.logicalobjectfifo> -> memref<1x1x8x8x4x4xi32, 2 : i32> + linalg.generic {indexing_maps = [#map5, #map5, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%32, %33 : memref<1x1x8x8x4x4xi32, 2 : i32>, memref<1x1x8x8x4x4xi32, 2 : i32>) outs(%34 : memref<1x1x8x8x4x4xi32, 2 : i32>) { + ^bb0(%in: i32, %in_10: i32, %out: i32): + %35 = arith.addi %in, %in_10 : i32 + linalg.yield %35 : i32 + } + amdaie.end + } + } {mapping = [#gpu.block, #gpu.block]} + memref.dealloc %alloc : memref<2x1x32x32xi32, 1 : i32> + memref.dealloc %alloc_6 : 
memref<1x1x8x8x4x4xi32, 2 : i32> + memref.dealloc %alloc_0 : memref<1x2x32x32xi32, 1 : i32> + memref.dealloc %alloc_5 : memref<128x128xi32> + memref.dealloc %alloc_1 : memref<1x1x32x32xi32, 1 : i32> + memref.dealloc %alloc_2 : memref<1x1x32x32xi32, 1 : i32> + memref.dealloc %alloc_3 : memref<1x1x32x32xi32, 1 : i32> + memref.dealloc %alloc_4 : memref<1x1x32x32xi32, 1 : i32> + return + } +}