diff --git a/build_tools/build_test_cpp.ps1 b/build_tools/build_test_cpp.ps1
index f190dc192..294fee6cf 100644
--- a/build_tools/build_test_cpp.ps1
+++ b/build_tools/build_test_cpp.ps1
@@ -58,7 +58,7 @@ echo "Building IREE"
 $CMAKE_ARGS = @(
   "-GNinja"
-  "-DCMAKE_BUILD_TYPE=Release"
+  "-DCMAKE_BUILD_TYPE=Debug"
   "-DCMAKE_INSTALL_PREFIX=$install_dir"
   "-DCMAKE_INSTALL_LIBDIR=lib"
   "-DCMAKE_EXE_LINKER_FLAGS_INIT=-fuse-ld=lld"
diff --git a/build_tools/build_test_cpp.sh b/build_tools/build_test_cpp.sh
index 1b0b1ac28..7bd9e96b4 100644
--- a/build_tools/build_test_cpp.sh
+++ b/build_tools/build_test_cpp.sh
@@ -63,7 +63,7 @@ echo '{
 cd $iree_dir
 CMAKE_ARGS="\
 -GNinja \
--DCMAKE_BUILD_TYPE=Release \
+-DCMAKE_BUILD_TYPE=Debug \
 -DCMAKE_INSTALL_PREFIX=$install_dir \
 -DCMAKE_INSTALL_LIBDIR=lib \
 -DIREE_ERROR_ON_MISSING_SUBMODULES=OFF \
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIECombineLogicalObjFifosForConnectionReuse.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIECombineLogicalObjFifosForConnectionReuse.cpp
new file mode 100644
index 000000000..d7c00b260
--- /dev/null
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIECombineLogicalObjFifosForConnectionReuse.cpp
@@ -0,0 +1,52 @@
+// Copyright 2024 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#include "iree-amd-aie/IR/AMDAIEOps.h"
+#include "iree-amd-aie/Transforms/AMDAIELogicalObjFifoSplittingUtils.h"
+#include "iree-amd-aie/Transforms/Passes.h"
+#include "mlir/IR/Iterators.h"
+#include "mlir/Pass/Pass.h"
+
+#define DEBUG_TYPE \
+  "iree-amdaie-combine-logical-objectfifos-for-connection-reuse"
+
+namespace mlir::iree_compiler::AMDAIE {
+
+namespace {
+
+class AMDAIECombineLogicalObjFifosForConnectionReusePass
+    : public impl::AMDAIECombineLogicalObjFifosForConnectionReuseBase<
+          AMDAIECombineLogicalObjFifosForConnectionReusePass> {
+ public:
+  using AMDAIECombineLogicalObjFifosForConnectionReuseBase::
+      AMDAIECombineLogicalObjFifosForConnectionReuseBase;
+
+  void getDependentDialects(DialectRegistry &registry) const override {
+    registry.insert<AMDAIEDialect>();
+  }
+  void runOnOperation() override;
+};
+
+void AMDAIECombineLogicalObjFifosForConnectionReusePass::runOnOperation() {
+  ModuleOp moduleOp = getOperation();
+  MLIRContext *context = &getContext();
+  IRRewriter rewriter(context);
+
+  SmallVector<AMDAIE::DmaCpyNdOp> l2ToL1DmaOps =
+      fetchDmaCpyNdOpsToSplitOrCombine(moduleOp);
+
+  if (failed(combineLogicalObjectFifos(rewriter, l2ToL1DmaOps, context))) {
+    return signalPassFailure();
+  }
+}
+
+}  // namespace
+
+std::unique_ptr<Pass>
+createAMDAIECombineLogicalObjFifosForConnectionReusePass() {
+  return std::make_unique<
+      AMDAIECombineLogicalObjFifosForConnectionReusePass>();
+}
+
+}  // namespace mlir::iree_compiler::AMDAIE
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDmaUtils.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDmaUtils.cpp
index dcba79e2b..2264cbba4 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDmaUtils.cpp
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDmaUtils.cpp
@@ -342,4 +342,17 @@ LogicalResult moveNpuDmaSyncUsersAfterAncestorInSameBlock(
   return success();
 }
 
+/// Utility to fetch the unique CoreOp associated with an L2->L1 Dma op.
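+/// Walks the users of the given DMA op and collects its CoreOp users; the
+/// current implementation asserts that exactly one such user exists, so the
+/// returned optional is always populated when the assertion holds.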
+std::optional<CoreOp> fetchUniqueCoreOp(DmaCpyNdOp &l2ToL1DmaOp) {
+  SmallVector<CoreOp> coreOps;
+  for (Operation *userOp : l2ToL1DmaOp->getUsers()) {
+    if (auto coreOp = dyn_cast<CoreOp>(userOp)) {
+      coreOps.push_back(coreOp);
+    }
+  }
+  assert(coreOps.size() == 1 &&
+         "L2->L1 Dma op expected to have a unique Core op");
+  return coreOps[0];
+}
+
 }  // namespace mlir::iree_compiler::AMDAIE
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDmaUtils.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDmaUtils.h
index f24ed3196..0ae49d249 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDmaUtils.h
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDmaUtils.h
@@ -370,6 +370,9 @@ struct DmaDimConfig {
 LogicalResult moveNpuDmaSyncUsersAfterAncestorInSameBlock(
     RewriterBase &rewriter, Operation *parentOp);
 
+/// Utility to fetch the unique CoreOp associated with an L2->L1 Dma op.
+std::optional<CoreOp> fetchUniqueCoreOp(DmaCpyNdOp &l2ToL1DmaOp);
+
 }  // namespace mlir::iree_compiler::AMDAIE
 
 #endif
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELogicalObjFifoSplittingUtils.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELogicalObjFifoSplittingUtils.cpp
index 420920a6e..7fb818f42 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELogicalObjFifoSplittingUtils.cpp
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELogicalObjFifoSplittingUtils.cpp
@@ -470,4 +470,491 @@ LogicalResult splitLogicalObjectFifos(
   return success();
 }
 
+/// Utility to fetch the "bias" of an offset: the offset itself if it is
+/// constant, else the constant summand of the defining affine.apply's map
+/// (0 if there is none).
+static int64_t fetchOffsetBias(OpFoldResult offsetOpFoldResult) {
+  std::optional<int64_t> offset = getConstantIntValue(offsetOpFoldResult);
+  if (offset) return offset.value();
+  auto offsetVal = cast<Value>(offsetOpFoldResult);
+  auto affineApplyOp =
+      dyn_cast_if_present<affine::AffineApplyOp>(offsetVal.getDefiningOp());
+  if (!affineApplyOp) return 0;
+  AffineMap affineMap = affineApplyOp.getAffineMap();
+  RetrieveScaleAndBias retriever;
+  assert(!failed(retriever.visit(affineMap.getResult(0))) &&
+         "failed to retrieve scale and bias");
+  int64_t bias = 0;
+  if (retriever.bias) {
+    bias = retriever.bias.value();
+  }
+  return bias;
+}
+
+static LogicalResult combineL3ToL2AccessPatterns(
+    RewriterBase &rewriter, const SmallVector<OpFoldResult> &offsetsA,
+    const SmallVector<OpFoldResult> &sizesA,
+    const SmallVector<OpFoldResult> &stridesA,
+    const SmallVector<OpFoldResult> &offsetsB,
+    const SmallVector<OpFoldResult> &sizesB,
+    const SmallVector<OpFoldResult> &stridesB,
+    SmallVector<OpFoldResult> &newOffsets, SmallVector<OpFoldResult> &newSizes,
+    SmallVector<OpFoldResult> &newStrides,
+    SmallVector<unsigned> &combiningDims,
+    SmallVector<unsigned> &nonCombiningDims) {
+  if (offsetsA.empty() && offsetsB.empty()) return success();
+
+  int64_t newSize = 1;
+  for (auto iter : llvm::enumerate(llvm::zip(offsetsA, offsetsB))) {
+    if (iter.index() < combiningDims.size()) continue;
+    const OpFoldResult &offsetA = std::get<0>(iter.value());
+    const OpFoldResult &offsetB = std::get<1>(iter.value());
+    if (offsetA != offsetB) {
+      // Need to check the difference in bias here.
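+      // For example, with offsets `d0 * 64` (bias 0) and `d0 * 64 + 32`
+      // (bias 32) and a constant size of 32, biasB - biasA == sizeA, i.e.
+      // access pattern B starts exactly where A ends, so the two patterns
+      // are contiguous and combinable along this dimension.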
+      int64_t biasA = fetchOffsetBias(offsetA);
+      int64_t biasB = fetchOffsetBias(offsetB);
+      std::optional<int64_t> sizeA = getConstantIntValue(sizesA[iter.index()]);
+      assert(sizeA && "expected a constant integer value for size");
+      if (sizeA != biasB - biasA) return failure();
+      newSize++;
+    }
+  }
+  newSizes[combiningDims.size() - 1] = rewriter.getI64IntegerAttr(newSize);
+  return success();
+}
+
+static FailureOr<LogicalObjectFifoFromMemrefOp> combineL3ToL2Pair(
+    IRRewriter &rewriter, DmaCpyNdOp dmaOpA, DmaCpyNdOp dmaOpB,
+    SmallVector<unsigned> &combiningDims,
+    SmallVector<unsigned> &nonCombiningDims) {
+  OpBuilder::InsertionGuard guard(rewriter);
+  SmallVector<OpFoldResult> sourceOffsetsA = dmaOpA.getSourceMixedOffsets();
+  SmallVector<OpFoldResult> sourceSizesA = dmaOpA.getSourceMixedSizes();
+  SmallVector<OpFoldResult> sourceStridesA = dmaOpA.getSourceMixedStrides();
+  SmallVector<OpFoldResult> sourceOffsetsB = dmaOpB.getSourceMixedOffsets();
+  SmallVector<OpFoldResult> sourceSizesB = dmaOpB.getSourceMixedSizes();
+  SmallVector<OpFoldResult> sourceStridesB = dmaOpB.getSourceMixedStrides();
+
+  SmallVector<OpFoldResult> targetOffsetsA = dmaOpA.getTargetMixedOffsets();
+  SmallVector<OpFoldResult> targetSizesA = dmaOpA.getTargetMixedSizes();
+  SmallVector<OpFoldResult> targetStridesA = dmaOpA.getTargetMixedStrides();
+  SmallVector<OpFoldResult> targetOffsetsB = dmaOpB.getTargetMixedOffsets();
+  SmallVector<OpFoldResult> targetSizesB = dmaOpB.getTargetMixedSizes();
+  SmallVector<OpFoldResult> targetStridesB = dmaOpB.getTargetMixedStrides();
+
+  SmallVector<OpFoldResult> newSourceOffsets = sourceOffsetsA;
+  SmallVector<OpFoldResult> newSourceSizes = sourceSizesA;
+  SmallVector<OpFoldResult> newSourceStrides = sourceStridesA;
+  if (failed(combineL3ToL2AccessPatterns(
+          rewriter, sourceOffsetsA, sourceSizesA, sourceStridesA,
+          sourceOffsetsB, sourceSizesB, sourceStridesB, newSourceOffsets,
+          newSourceSizes, newSourceStrides, combiningDims,
+          nonCombiningDims))) {
+    dmaOpA->emitOpError()
+        << "L3->L2 pair cannot be combined because offsets are not contiguous";
+    return failure();
+  }
+
+  SmallVector<OpFoldResult> newTargetOffsets = targetOffsetsA;
+  SmallVector<OpFoldResult> newTargetSizes = newSourceSizes;
+  SmallVector<OpFoldResult> newTargetStrides = targetStridesA;
+  // Now we need to create a new L2 buffer based on `newTargetSizes`.
+  LogicalObjectFifoFromMemrefOp oldL2ObjectFifo = dmaOpA.getTargetObjectFifo();
+  AMDAIE::LogicalObjectFifoFromMemrefOp newL2ObjectFifo =
+      createNewLogicalObjectFifo(rewriter, oldL2ObjectFifo, newTargetSizes);
+
+  // Create the combined L3->L2 Dma op.
+  rewriter.setInsertionPoint(dmaOpA);
+  auto combinedL3ToL2DmaOp = rewriter.create<AMDAIE::DmaCpyNdOp>(
+      dmaOpA.getLoc(), newL2ObjectFifo, llvm::ArrayRef(newTargetOffsets),
+      llvm::ArrayRef(newTargetSizes), llvm::ArrayRef(newTargetStrides),
+      dmaOpA.getSource(), llvm::ArrayRef(newSourceOffsets),
+      llvm::ArrayRef(newSourceSizes), llvm::ArrayRef(newSourceStrides));
+  // Replace the uses of the 2nd L3->L2 Dma op with the new combined L3->L2
+  // Dma op and erase the 1st one.
+  rewriter.replaceOp(dmaOpB, combinedL3ToL2DmaOp);
+  rewriter.eraseOp(dmaOpA);
+  return newL2ObjectFifo;
+}
+
+/// Utility comparator function that compares two DmaCpyNd ops `a` and `b`.
+/// Returns true if `a`'s source offset is "less" than `b`'s, in the
+/// lexicographic sense: for N-dimensional offsets A and B,
+///   A < B if there exists an index i in [0, N-1] such that
+///       A[i] < B[i] AND A[0..i-1] == B[0..i-1].
+/// For example, A = [0, 0, 32, 0] is "less" than B = [0, 0, 32, 64].
+static bool compareL3ToL2DmaPairOffsets(DmaCpyNdOp &a, DmaCpyNdOp &b) {
+  SmallVector<OpFoldResult> sourceOffsetsA = a.getSourceMixedOffsets();
+  SmallVector<OpFoldResult> sourceSizesA = a.getSourceMixedSizes();
+  SmallVector<OpFoldResult> sourceOffsetsB = b.getSourceMixedOffsets();
+  SmallVector<OpFoldResult> sourceSizesB = b.getSourceMixedSizes();
+  // Assertion checks on the sizes are performed before invoking this
+  // function.
+  for (int64_t i = 0, n = sourceOffsetsA.size(); i < n; i++) {
+    std::optional<int64_t> offsetA = getConstantIntValue(sourceOffsetsA[i]);
+    std::optional<int64_t> offsetB = getConstantIntValue(sourceOffsetsB[i]);
+    if (offsetA && offsetB) {
+      if (offsetA < offsetB) return true;
+      if (offsetA > offsetB) return false;
+      continue;
+    }
+    if (!offsetA && !offsetB) {
+      auto offsetValA = cast<Value>(sourceOffsetsA[i]);
+      auto offsetValB = cast<Value>(sourceOffsetsB[i]);
+      auto affineApplyOpA = dyn_cast_if_present<affine::AffineApplyOp>(
+          offsetValA.getDefiningOp());
+      auto affineApplyOpB = dyn_cast_if_present<affine::AffineApplyOp>(
+          offsetValB.getDefiningOp());
+      // TODO(avarma): This should be handled better. The overall possibility
+      // here already makes this complex enough.
+      assert(affineApplyOpA && "expected affine.apply op");
+      assert(affineApplyOpB && "expected affine.apply op");
+      for (auto &&[valA, valB] :
+           llvm::zip_equal(affineApplyOpA.getMapOperands(),
+                           affineApplyOpB.getMapOperands())) {
+        assert((valA == valB) &&
+               "different base values being operated on between the L3->L2 "
+               "Dma op pair");
+      }
+      AffineMap affineMapA = affineApplyOpA.getAffineMap();
+      AffineMap affineMapB = affineApplyOpB.getAffineMap();
+      RetrieveScaleAndBias retrieverA, retrieverB;
+      assert(!failed(retrieverA.visit(affineMapA.getResult(0))) &&
+             "failed to retrieve scale and bias");
+      assert(!failed(retrieverB.visit(affineMapB.getResult(0))) &&
+             "failed to retrieve scale and bias");
+      int64_t biasA = 0, biasB = 0;
+      if (retrieverA.bias) {
+        biasA = retrieverA.bias.value();
+      }
+      if (retrieverB.bias) {
+        biasB = retrieverB.bias.value();
+      }
+      // TODO(avarma): We should check the scale value as well.
+      if (biasA < biasB) return true;
+      if (biasA > biasB) return false;
+      continue;
+    }
+    assert(false &&
+           "unexpected combination of offset values amongst the L3->L2 Dma "
+           "pair");
+  }
+  return false;
+}
+
+static bool areAccessPatternsCompatibleForCombining(
+    AMDAIE::DmaCpyNdOp &l3ToL2DmaOpA, AMDAIE::DmaCpyNdOp &l3ToL2DmaOpB) {
+  // Sources' access pattern check.
+  SmallVector<OpFoldResult> sourceOffsetsA =
+      l3ToL2DmaOpA.getSourceMixedOffsets();
+  SmallVector<OpFoldResult> sourceSizesA = l3ToL2DmaOpA.getSourceMixedSizes();
+  SmallVector<OpFoldResult> sourceStridesA =
+      l3ToL2DmaOpA.getSourceMixedStrides();
+  SmallVector<OpFoldResult> sourceOffsetsB =
+      l3ToL2DmaOpB.getSourceMixedOffsets();
+  SmallVector<OpFoldResult> sourceSizesB = l3ToL2DmaOpB.getSourceMixedSizes();
+  SmallVector<OpFoldResult> sourceStridesB =
+      l3ToL2DmaOpB.getSourceMixedStrides();
+  if (sourceOffsetsA.size() != sourceOffsetsB.size() ||
+      sourceSizesA.size() != sourceSizesB.size() ||
+      sourceStridesA.size() != sourceStridesB.size() ||
+      sourceOffsetsA.size() != sourceSizesA.size() ||
+      sourceOffsetsA.size() != sourceStridesB.size()) {
+    return false;
+  }
+  // Targets' access pattern check.
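+  // The target sides (the L2 buffers) must match exactly: the pair is only
+  // combinable when both ops write the same window of identically-shaped L2
+  // buffers.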
+  SmallVector<OpFoldResult> targetOffsetsA =
+      l3ToL2DmaOpA.getTargetMixedOffsets();
+  SmallVector<OpFoldResult> targetSizesA = l3ToL2DmaOpA.getTargetMixedSizes();
+  SmallVector<OpFoldResult> targetStridesA =
+      l3ToL2DmaOpA.getTargetMixedStrides();
+  SmallVector<OpFoldResult> targetOffsetsB =
+      l3ToL2DmaOpB.getTargetMixedOffsets();
+  SmallVector<OpFoldResult> targetSizesB = l3ToL2DmaOpB.getTargetMixedSizes();
+  SmallVector<OpFoldResult> targetStridesB =
+      l3ToL2DmaOpB.getTargetMixedStrides();
+  if (targetOffsetsA.size() != targetOffsetsB.size() ||
+      targetSizesA.size() != targetSizesB.size() ||
+      targetStridesA.size() != targetStridesB.size() ||
+      targetOffsetsA.size() != targetSizesA.size() ||
+      targetOffsetsA.size() != targetStridesB.size()) {
+    return false;
+  }
+  // Check that the targets' access pattern values are the same.
+  auto isSameValue = [](SmallVector<OpFoldResult> &accessPatternA,
+                        SmallVector<OpFoldResult> &accessPatternB) -> bool {
+    for (auto [a, b] : llvm::zip_equal(accessPatternA, accessPatternB)) {
+      if (a != b) return false;
+    }
+    return true;
+  };
+  if (isSameValue(targetOffsetsA, targetOffsetsB) &&
+      isSameValue(targetSizesA, targetSizesB) &&
+      isSameValue(targetStridesA, targetStridesB)) {
+    return true;
+  }
+
+  return false;
+}
+
+static LogicalResult fetchCombiningDimensions(
+    SmallVector<AMDAIE::DmaCpyNdOp> &l3ToL2DmaOps,
+    SmallVector<unsigned> &combiningDims,
+    SmallVector<unsigned> &nonCombiningDims) {
+  // Fetch combining/non-combining dimensions. Currently we infer the
+  // combining dimensions as a contiguous leading sequence of dimensions with
+  // offset 0 and size 1.
+  int64_t maxCombiningDimIndex = 0;
+  for (unsigned i = 0, n = l3ToL2DmaOps.size(); i < n; i++) {
+    SmallVector<OpFoldResult> sourceOffsets =
+        l3ToL2DmaOps[i].getSourceMixedOffsets();
+    SmallVector<OpFoldResult> sourceSizes =
+        l3ToL2DmaOps[i].getSourceMixedSizes();
+    unsigned j = 0, m = sourceOffsets.size();
+    // Traverse the i-th L3->L2 Dma op's source offsets/sizes to find the
+    // leading sequence of dimensions with offset 0 and size 1.
+    while (j < m) {
+      std::optional<int64_t> constantOffset =
+          getConstantIntValue(sourceOffsets[j]);
+      if (!constantOffset || constantOffset.value() != 0) {
+        break;
+      }
+      std::optional<int64_t> constantSize =
+          getConstantIntValue(sourceSizes[j]);
+      if (!constantSize || constantSize.value() != 1) {
+        break;
+      }
+      j++;
+    }
+    if (i == 0) {
+      maxCombiningDimIndex = j;
+    } else if (maxCombiningDimIndex != j) {
+      LLVM_DEBUG(llvm::dbgs()
+                 << "incompatible combining dimensions across L3->L2\n");
+      return failure();
+    }
+  }
+  combiningDims.assign(maxCombiningDimIndex, 0);
+  std::iota(combiningDims.begin(), combiningDims.end(), 0);
+  nonCombiningDims.assign(maxCombiningDimIndex, 0);
+  std::iota(nonCombiningDims.begin(), nonCombiningDims.end(),
+            combiningDims.size());
+  return success();
+}
+
+/// Given a vector of L2->L1 Dma ops, combine the corresponding L3->L2 Dma ops
+/// and reuse the L2/L1 buffers.
+/// TODO(avarma): Assign combined tiles while forming the L2/L1 buffers which
+/// we'll reuse.
+LogicalResult combineLogicalObjectFifos(
+    IRRewriter &rewriter, SmallVector<AMDAIE::DmaCpyNdOp> &l2ToL1DmaOps,
+    MLIRContext *context) {
+  if (l2ToL1DmaOps.size() == 0) return success();
+
+  // Fetch the L3->L2 Dma op corresponding to the first L2 buffer as target.
+  SmallVector<AMDAIE::DmaCpyNdOp> l3ToL2DmaOps;
+  FailureOr<AMDAIE::DmaCpyNdOp> maybeL3ToL2DmaOp =
+      fetchL3ToL2DmaCpyNdOp(l2ToL1DmaOps[0]);
+  if (failed(maybeL3ToL2DmaOp)) return failure();
+  l3ToL2DmaOps.push_back(maybeL3ToL2DmaOp.value());
+
+  // Check that all L3 buffers associated with the different L3->L2 Dma ops
+  // are the same.
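+  // While doing so, also collect the L3->L2 Dma op for each remaining L2->L1
+  // Dma op and bail out early on any incompatibility.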
+  for (unsigned i = 1, n = l2ToL1DmaOps.size(); i < n; i++) {
+    maybeL3ToL2DmaOp = fetchL3ToL2DmaCpyNdOp(l2ToL1DmaOps[i]);
+    if (failed(maybeL3ToL2DmaOp)) return failure();
+    l3ToL2DmaOps.push_back(maybeL3ToL2DmaOp.value());
+    if (l3ToL2DmaOps[0].getSourceObjectFifo() !=
+        l3ToL2DmaOps[i].getSourceObjectFifo()) {
+      LLVM_DEBUG(llvm::dbgs()
+                 << "Found different L3 objectFifo for " << l3ToL2DmaOps[0]
+                 << " and " << l3ToL2DmaOps[i] << "\n");
+      return failure();
+    }
+    if (!areAccessPatternsCompatibleForCombining(l3ToL2DmaOps[0],
+                                                 l3ToL2DmaOps[i])) {
+      LLVM_DEBUG(
+          llvm::dbgs()
+          << "access patterns failed compatibility checks for combining\n");
+      return failure();
+    }
+  }
+
+  if (l2ToL1DmaOps.size() != l3ToL2DmaOps.size()) {
+    LLVM_DEBUG(
+        llvm::dbgs()
+        << "expected 1:1 correspondence between L3->L2 and L2->L1 Dma ops\n");
+    return failure();
+  }
+
+  SmallVector<unsigned> combiningDims, nonCombiningDims;
+  if (failed(fetchCombiningDimensions(l3ToL2DmaOps, combiningDims,
+                                      nonCombiningDims))) {
+    return failure();
+  }
+
+  // Sort the L3->L2 Dma ops based on their (overlapping) offsets, via an
+  // insertion sort, and reorder the corresponding L2->L1 Dma ops in lockstep.
+  for (int64_t i = 1, n = l3ToL2DmaOps.size(); i < n; i++) {
+    DmaCpyNdOp currL3ToL2DmaOp = l3ToL2DmaOps[i];
+    DmaCpyNdOp currL2ToL1DmaOp = l2ToL1DmaOps[i];
+    int64_t j = i - 1;
+    while (j >= 0 &&
+           compareL3ToL2DmaPairOffsets(currL3ToL2DmaOp, l3ToL2DmaOps[j])) {
+      l3ToL2DmaOps[j + 1] = l3ToL2DmaOps[j];
+      l2ToL1DmaOps[j + 1] = l2ToL1DmaOps[j];
+      j--;
+    }
+    l3ToL2DmaOps[j + 1] = currL3ToL2DmaOp;
+    l2ToL1DmaOps[j + 1] = currL2ToL1DmaOp;
+  }
+
+  // Currently we have 4 cores, so there are two pairs of DmaCpyNd ops to
+  // combine.
+  // TODO(avarma): Revisit this later when we want to target more cores.
+  if (l3ToL2DmaOps.size() % 2 != 0) {
+    LLVM_DEBUG(llvm::dbgs()
+               << "found an odd number of L3->L2 ops for combining\n");
+    return failure();
+  }
+
+  auto createL2ToL1ForReuse =
+      [](IRRewriter &rewriter, DmaCpyNdOp &l2ToL1DmaOp,
+         LogicalObjectFifoFromMemrefOp &reuseL1Buffer,
+         LogicalObjectFifoFromMemrefOp &reuseL2Buffer,
+         SmallVector<OpFoldResult> &newL2SourceOffsets) -> DmaCpyNdOp {
+    OpBuilder::InsertionGuard guard(rewriter);
+    rewriter.setInsertionPoint(l2ToL1DmaOp);
+    auto newL2ToL1DmaOp = rewriter.create<AMDAIE::DmaCpyNdOp>(
+        l2ToL1DmaOp.getLoc(), reuseL1Buffer,
+        l2ToL1DmaOp.getTargetMixedOffsets(), l2ToL1DmaOp.getTargetMixedSizes(),
+        l2ToL1DmaOp.getTargetMixedStrides(), reuseL2Buffer,
+        llvm::ArrayRef(newL2SourceOffsets), l2ToL1DmaOp.getSourceMixedSizes(),
+        l2ToL1DmaOp.getSourceMixedStrides());
+    rewriter.replaceOp(l2ToL1DmaOp, newL2ToL1DmaOp);
+    return newL2ToL1DmaOp;
+  };
+  // At this point we have an L3->L2 DmaCpyNd chain sorted by increasing
+  // offsets; refer to `compareL3ToL2DmaPairOffsets`'s doc comment. Now we
+  // pick up pairs of such DmaCpyNd ops from the chain, e.g. pair[0,1], then
+  // pair[2,3], etc. For each such pair[i, i+1] we attempt to combine the
+  // logical objectFifos as per the following algorithm :-
+  // a. Combine the i-th and i+1-th L3->L2 DmaCpyNd ops.
+  // b. Form a reusable L1 buffer by assigning the cumulative tiles of the
+  //    intended core ops.
+  // c. Since step a creates a new L2 buffer (with the combined shape), the
+  //    corresponding two L2->L1 Dma ops need to be recreated. NOTE: both new
+  //    L2->L1 Dma ops will reuse the same L1 buffer as well.
+  // d. Pick the unique core ops corresponding to the i-th and i+1-th L2->L1
+  //    Dma ops and do the following :-
+  //    1. For the i-th CoreOp, insert an AccessOp from the same L1 buffer
+  //       towards the end.
+  //    2. For the i+1-th CoreOp, insert an AccessOp from the same L1 buffer
+  //       right before the corresponding AccessOp within the same CoreOp.
+  for (unsigned i = 0, n = l3ToL2DmaOps.size(); i < n; i += 2) {
+    // Step 1. Combine the picked L3->L2 DmaCpyNd pair.
+    FailureOr<LogicalObjectFifoFromMemrefOp> maybeNewL2ObjectFifo =
+        combineL3ToL2Pair(rewriter, l3ToL2DmaOps[i], l3ToL2DmaOps[i + 1],
+                          combiningDims, nonCombiningDims);
+    if (failed(maybeNewL2ObjectFifo)) return failure();
+    LogicalObjectFifoFromMemrefOp newL2ObjectFifo =
+        maybeNewL2ObjectFifo.value();
+
+    LogicalObjectFifoFromMemrefOp oldFirstL1ObjFifoOp =
+        l2ToL1DmaOps[i].getTargetObjectFifo();
+    LogicalObjectFifoFromMemrefOp oldSecondL1ObjFifoOp =
+        l2ToL1DmaOps[i + 1].getTargetObjectFifo();
+    // Step 2. Form the reusable L1 buffer by assigning the cumulative tiles
+    // of the intended core ops.
+    LogicalObjectFifoFromMemrefOp reuseL1LogicalObjectFifoOp =
+        l2ToL1DmaOps[i].getTargetObjectFifo();
+    SmallVector<Value> tiles;
+    auto addNewTileFrom = [&](CoreOp coreOp) -> LogicalResult {
+      OpBuilder::InsertionGuard guard(rewriter);
+      TileOp tileOp = coreOp.getTileOp();
+      std::optional<int64_t> column = getConstantIntValue(tileOp.getCol());
+      std::optional<int64_t> row = getConstantIntValue(tileOp.getRow());
+      if (!column || !row) {
+        return coreOp.emitOpError() << "has non-constant tile location";
+      }
+      rewriter.setInsertionPoint(reuseL1LogicalObjectFifoOp);
+      auto colIndex = rewriter.create<arith::ConstantIndexOp>(
+          rewriter.getUnknownLoc(), *column);
+      auto rowIndex = rewriter.create<arith::ConstantIndexOp>(
+          rewriter.getUnknownLoc(), *row);
+      tileOp = rewriter.create<TileOp>(rewriter.getUnknownLoc(), colIndex,
+                                       rowIndex);
+      tiles.push_back(tileOp.getResult());
+      return success();
+    };
+    std::optional<CoreOp> maybeFirstCoreOp =
+        fetchUniqueCoreOp(l2ToL1DmaOps[i]);
+    if (!maybeFirstCoreOp) return failure();
+    CoreOp firstCoreOp = maybeFirstCoreOp.value();
+    std::optional<CoreOp> maybeSecondCoreOp =
+        fetchUniqueCoreOp(l2ToL1DmaOps[i + 1]);
+    if (!maybeSecondCoreOp) return failure();
+    CoreOp secondCoreOp = maybeSecondCoreOp.value();
+    if (failed(addNewTileFrom(firstCoreOp)) ||
+        failed(addNewTileFrom(secondCoreOp))) {
+      return failure();
+    }
+    llvm::sort(tiles.begin(), tiles.end(),
+               AMDAIE::TileOp::tileValueColumnAndRowComparator);
+    rewriter.setInsertionPoint(reuseL1LogicalObjectFifoOp);
+    reuseL1LogicalObjectFifoOp =
+        rewriter.create<AMDAIE::LogicalObjectFifoFromMemrefOp>(
+            reuseL1LogicalObjectFifoOp.getLoc(),
+            cast<LogicalObjectFifoType>(
+                reuseL1LogicalObjectFifoOp.getOutput().getType()),
+            reuseL1LogicalObjectFifoOp.getMemref(), tiles);
+
+    // Step 3. We now need to create two new L2->L1 Dma ops since the size
+    // has changed, and for that we first need to find the new offsets for L2
+    // as the source.
+    // TODO: For now the offsets are hardcoded; later they should be derived
+    // from the combining/non-combining dimensions.
+    // Offset = 0, 0.
+    SmallVector<OpFoldResult> newL2AsSourceOffsets =
+        l2ToL1DmaOps[i].getSourceMixedOffsets();
+    createL2ToL1ForReuse(rewriter, l2ToL1DmaOps[i], reuseL1LogicalObjectFifoOp,
+                         newL2ObjectFifo, newL2AsSourceOffsets);
+    // Offset = 0, 1. NOTE: here we use the same L1 logical objectFifo as the
+    // first L2->L1 Dma op.
+    newL2AsSourceOffsets = l2ToL1DmaOps[i + 1].getSourceMixedOffsets();
+    newL2AsSourceOffsets[1] = rewriter.getIndexAttr(1);
+    createL2ToL1ForReuse(rewriter, l2ToL1DmaOps[i + 1],
+                         reuseL1LogicalObjectFifoOp, newL2ObjectFifo,
+                         newL2AsSourceOffsets);
+
+    // Step 4.
+    // Pick the CoreOps associated 1:1 with the two L2->L1 Dma ops.
+    // For the first CoreOp we insert the Read access towards the end; the
+    // exact position doesn't matter for now, so we insert it right after the
+    // original access op (before amdaie.end).
+    firstCoreOp.walk([&](AMDAIE::LogicalObjectFifoAccessOp accessOp) {
+      if (accessOp.getInput() == oldFirstL1ObjFifoOp.getOutput()) {
+        OpBuilder::InsertionGuard guard(rewriter);
+        rewriter.setInsertionPointAfter(accessOp);
+        rewriter.create<AMDAIE::LogicalObjectFifoAccessOp>(
+            rewriter.getUnknownLoc(), reuseL1LogicalObjectFifoOp.getOutput(),
+            accessOp.getAccessType());
+      }
+    });
+    rewriter.replaceOp(oldFirstL1ObjFifoOp, reuseL1LogicalObjectFifoOp);
+    // For the second CoreOp we insert the `Read` access right before the
+    // first read from the corresponding L1 logical objectFifo.
+    secondCoreOp.walk([&](AMDAIE::LogicalObjectFifoAccessOp accessOp) {
+      if (accessOp.getInput() == oldSecondL1ObjFifoOp.getOutput()) {
+        OpBuilder::InsertionGuard guard(rewriter);
+        rewriter.setInsertionPoint(accessOp);
+        rewriter.create<AMDAIE::LogicalObjectFifoAccessOp>(
+            rewriter.getUnknownLoc(), reuseL1LogicalObjectFifoOp.getOutput(),
+            accessOp.getAccessType());
+        // Insert a second access op and replace the original one with it,
+        // since THIS is the access that will actually be used.
+        auto secondAccessOp =
+            rewriter.create<AMDAIE::LogicalObjectFifoAccessOp>(
+                rewriter.getUnknownLoc(),
+                reuseL1LogicalObjectFifoOp.getOutput(),
+                accessOp.getAccessType());
+        rewriter.replaceOp(accessOp, secondAccessOp);
+      }
+    });
+  }
+
+  return success();
+}
+
 }  // namespace mlir::iree_compiler::AMDAIE
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELogicalObjFifoSplittingUtils.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELogicalObjFifoSplittingUtils.h
index f9339b2ac..f0ed234f1 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELogicalObjFifoSplittingUtils.h
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELogicalObjFifoSplittingUtils.h
@@ -19,6 +19,11 @@ LogicalResult splitLogicalObjectFifos(
     IRRewriter &rewriter, SmallVector<AMDAIE::DmaCpyNdOp> &l2ToL1DmaOps,
     MLIRContext *context);
 
+/// Utility to combine logical objectFifos given a vector of L2->L1 Dma ops.
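+/// For each pair of compatible L3->L2 Dma ops feeding the given L2->L1 Dma
+/// ops, the two L2 buffers are merged into one (widened along the combining
+/// dimension) and the L2->L1 Dma ops and core access ops are rewritten to
+/// reuse the shared L2/L1 buffers.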
+LogicalResult combineLogicalObjectFifos(
+    IRRewriter &rewriter, SmallVector<AMDAIE::DmaCpyNdOp> &l2ToL1DmaOps,
+    MLIRContext *context);
+
 }  // namespace mlir::iree_compiler::AMDAIE
 
 #endif
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/CMakeLists.txt b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/CMakeLists.txt
index 2979c71ef..ca7fc9bd5 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/CMakeLists.txt
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/CMakeLists.txt
@@ -53,6 +53,7 @@ iree_cc_library(
     "AMDAIECanonicalizeDma.cpp"
     "AMDAIECanonicalizeNpuDmaCpyNd.cpp"
     "AMDAIECanonicalizeDoublyStridedOp.cpp"
+    "AMDAIECombineLogicalObjFifosForConnectionReuse.cpp"
    "AMDAIECombineStridedOps.cpp"
     "AMDAIEControlCodeLoopUnroll.cpp"
     "AMDAIEConvertCoreForallToFor.cpp"
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h
index 8912db52d..46793bc34 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h
@@ -31,6 +31,7 @@ namespace mlir::iree_compiler::AMDAIE {
 #define GEN_PASS_DEF_AMDAIECANONICALIZEDOUBLYSTRIDEDOP
 #define GEN_PASS_DEF_AMDAIECANONICALIZENPUDMACPYND
 #define GEN_PASS_DEF_AMDAIECLEANUP
+#define GEN_PASS_DEF_AMDAIECOMBINELOGICALOBJFIFOSFORCONNECTIONREUSE
 #define GEN_PASS_DEF_AMDAIECOMBINESTRIDEDOPS
 #define GEN_PASS_DEF_AMDAIECONTROLCODELOOPUNROLL
 #define GEN_PASS_DEF_AMDAIECONVERTCOREFORALLTOFOR
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h
index fe5670067..585af50b4 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h
@@ -87,6 +87,10 @@ std::unique_ptr<Pass> createAMDAIECanonicalizeNpuDmaCpyNdPass();
 std::unique_ptr<Pass> createAMDAIECanonicalizeDoublyStridedOpPass(
     AMDAIECanonicalizeDoublyStridedOpOptions options = {});
 
+/// Create a pass to combine logical objectFifos for connection reuse.
+std::unique_ptr<Pass>
+createAMDAIECombineLogicalObjFifosForConnectionReusePass();
+
 /// Pass to unroll the loops within the control code regions.
 std::unique_ptr<Pass> createAMDAIEControlCodeLoopUnrollPass();
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td
index 73ceee040..19f3a2608 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td
@@ -123,6 +123,12 @@ def AMDAIECleanup :
       "mlir::iree_compiler::AMDAIE::createAMDAIECleanupPass()";
 }
 
+def AMDAIECombineLogicalObjFifosForConnectionReuse :
+    Pass<"iree-amdaie-combine-logical-objectfifos-for-connection-reuse", "ModuleOp"> {
+  let summary = "Pass to combine L2 buffers to share inputs of Matmul and Elementwise operations.";
+  let constructor = "mlir::iree_compiler::AMDAIE::createAMDAIECombineLogicalObjFifosForConnectionReusePass()";
+}
+
 def AMDAIECombineStridedOps :
     Pass<"iree-amdaie-combine-strided-ops", ""> {
   let summary = "Combine strided ops in same block if access patterns are compatible.";
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt
index ba4380860..261a8068c 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt
@@ -19,6 +19,7 @@ iree_lit_test_suite(
     "canonicalize_dma.mlir"
     "canonicalize_doubly_strided_op.mlir"
     "canonicalize_npu_dma_cpy_nd.mlir"
+    "combine_logicalobjfifos_for_connection_reuse.mlir"
     "combine_strided_ops.mlir"
     "controlcode_loop_unrolling.mlir"
     "convert_core_forall_to_for.mlir"
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/combine_logicalobjfifos_for_connection_reuse.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/combine_logicalobjfifos_for_connection_reuse.mlir
new file mode 100644
index 000000000..a493efdce
--- /dev/null
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/combine_logicalobjfifos_for_connection_reuse.mlir
@@ -0,0 +1,219 @@
+// RUN: iree-opt --pass-pipeline="builtin.module(iree-amdaie-combine-logical-objectfifos-for-connection-reuse,cse)" --split-input-file --verify-diagnostics %s | FileCheck %s
+
+// CHECK-DAG: #map = affine_map<(d0) -> (d0 * 64)>
+// CHECK-DAG: #map1 = affine_map<(d0) -> (d0 * 64 + 32)>
+// CHECK:     @combine_logical_objFifos
+// CHECK-DAG:   %[[C0:.*]] = arith.constant 0 : index
+// CHECK-DAG:   %[[C1:.*]] = arith.constant 1 : index
+// CHECK-DAG:   %[[C2:.*]] = arith.constant 2 : index
+// CHECK-DAG:   %[[C3:.*]] = arith.constant 3 : index
+// CHECK:       memref.alloc() : memref<1x2x32x32xi32, 1 : i32>
+// CHECK:       %[[L2_ALLOC_0:.*]] = memref.alloc() : memref<1x2x32x32xi32, 1 : i32>
+// CHECK:       %[[L2_ALLOC_1:.*]] = memref.alloc() : memref<1x2x32x32xi32, 1 : i32>
+// CHECK:       %[[L3_ALLOC:.*]] = memref.alloc() : memref<128x128xi32>
+// CHECK-DAG:   %[[L1_ALLOC:.*]] = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
+// CHECK-DAG:   %[[TILE_0:.*]] = amdaie.tile(%[[C1]], %[[C3]])
+// CHECK-DAG:   %[[TILE_1:.*]] = amdaie.tile(%[[C0]], %[[C2]])
+// CHECK-DAG:   %[[TILE_2:.*]] = amdaie.tile(%[[C1]], %[[C2]])
+// CHECK-DAG:   %[[TILE_3:.*]] = amdaie.tile(%[[C0]], %[[C3]])
+// CHECK:       %[[L2_OBJECTFIFO_0:.*]] = amdaie.logicalobjectfifo.from_memref %[[L2_ALLOC_0]], {%[[TILE_0]]} :
+// CHECK-SAME:    memref<1x2x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x2x32x32xi32, 1 : i32>>
+// CHECK:       %[[L2_OBJECTFIFO_1:.*]] = amdaie.logicalobjectfifo.from_memref %[[L2_ALLOC_1]], {%[[TILE_0]]} :
+// CHECK-SAME:    memref<1x2x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x2x32x32xi32, 1 : i32>>
+// CHECK:       %[[L3_OBJECTFIFO:.*]] = amdaie.logicalobjectfifo.from_memref %[[L3_ALLOC]], {%[[TILE_0]]} :
+// CHECK-SAME:    memref<128x128xi32> -> !amdaie.logicalobjectfifo<memref<128x128xi32>>
+// CHECK:       scf.forall (%[[IV0:.*]], %[[IV1:.*]]) in (2, 2)
+// CHECK-DAG:     %[[IV1_0:.*]] = affine.apply #map(%[[IV1]])
+// CHECK-DAG:     %[[IV0_0:.*]] = affine.apply #map(%[[IV0]])
+// CHECK-DAG:     %[[IV0_32:.*]] = affine.apply #map1(%[[IV0]])
+// CHECK:         %[[DMA_CPY_ND_L3_TO_L2_0:.*]] = amdaie.dma_cpy_nd(
+// CHECK-SAME:      %[[L2_OBJECTFIFO_0]][0, 0, 0, 0] [1, 2, 32, 32] [2048, 1024, 32, 1]
+// CHECK-SAME:      %[[L3_OBJECTFIFO]][0, 0, %[[IV0_0]], %[[IV1_0]]] [1, 2, 32, 32] [4096, 32, 128, 1]
+// CHECK:         %[[DMA_CPY_ND_L3_TO_L2_1:.*]] = amdaie.dma_cpy_nd(
+// CHECK-SAME:      %[[L2_OBJECTFIFO_1]][0, 0, 0, 0] [1, 2, 32, 32] [2048, 1024, 32, 1]
+// CHECK-SAME:      %[[L3_OBJECTFIFO]][0, 0, %[[IV0_32]], %[[IV1_0]]] [1, 2, 32, 32] [4096, 32, 128, 1]
+// CHECK:         %[[L1_OBJECTFIFO_0:.*]] = amdaie.logicalobjectfifo.from_memref %[[L1_ALLOC]], {%[[TILE_1]], %[[TILE_0]]}
+// CHECK:         %[[DMA_CPY_ND_L2_TO_L1_0:.*]] = amdaie.dma_cpy_nd(
+// CHECK-SAME:      %[[L1_OBJECTFIFO_0]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1]
+// CHECK-SAME:      %[[L2_OBJECTFIFO_0]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1]
+// CHECK:         amdaie.core(%[[TILE_1]], in : [%{{.*}}, %{{.*}}, %[[DMA_CPY_ND_L2_TO_L1_0]]], out :
+// CHECK:           linalg.generic
+// CHECK:           %[[FIRST_READ:.*]] = amdaie.logicalobjectfifo.access(%[[L1_OBJECTFIFO_0]], Read)
+// CHECK:           amdaie.logicalobjectfifo.access(%[[L1_OBJECTFIFO_0]], Read)
+// CHECK:           linalg.generic
+// CHECK-SAME:        %[[FIRST_READ]]
+// CHECK:           amdaie.end
+// CHECK:         }
+// CHECK:         %[[DMA_CPY_ND_L2_TO_L1_1:.*]] = amdaie.dma_cpy_nd(
+// CHECK-SAME:      %[[L1_OBJECTFIFO_0]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1]
+// CHECK-SAME:      %[[L2_OBJECTFIFO_0]][0, 1, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1]
+// CHECK:         amdaie.core(%[[TILE_0]], in : [%{{.*}}, %{{.*}}, %[[DMA_CPY_ND_L2_TO_L1_1]]], out :
+// CHECK:           linalg.generic
+// CHECK:           amdaie.logicalobjectfifo.access(%[[L1_OBJECTFIFO_0]], Read)
+// CHECK:           %[[SECOND_READ:.*]] = amdaie.logicalobjectfifo.access(%[[L1_OBJECTFIFO_0]], Read)
+// CHECK:           linalg.generic
+// CHECK-SAME:        %[[SECOND_READ]]
+// CHECK:           amdaie.end
+// CHECK:         }
+// CHECK:         %[[L1_OBJECTFIFO_1:.*]] = amdaie.logicalobjectfifo.from_memref %[[L1_ALLOC]], {%[[TILE_3]], %[[TILE_2]]}
+// CHECK:         %[[DMA_CPY_ND_L2_TO_L1_2:.*]] = amdaie.dma_cpy_nd(
+// CHECK-SAME:      %[[L1_OBJECTFIFO_1]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1]
+// CHECK-SAME:      %[[L2_OBJECTFIFO_1]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1]
+// CHECK:         amdaie.core(%[[TILE_2]], in : [%{{.*}}, %{{.*}}, %[[DMA_CPY_ND_L2_TO_L1_2]]], out :
+// CHECK:           linalg.generic
+// CHECK:           %[[FIRST_READ:.*]] = amdaie.logicalobjectfifo.access(%[[L1_OBJECTFIFO_1]], Read)
+// CHECK:           amdaie.logicalobjectfifo.access(%[[L1_OBJECTFIFO_1]], Read)
+// CHECK:           linalg.generic
+// CHECK-SAME:        %[[FIRST_READ]]
+// CHECK:           amdaie.end
+// CHECK:         }
+// CHECK:         %[[DMA_CPY_ND_L2_TO_L1_3:.*]] = amdaie.dma_cpy_nd(
+// CHECK-SAME:      %[[L1_OBJECTFIFO_1]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1]
+// CHECK-SAME:      %[[L2_OBJECTFIFO_1]][0, 1, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1]
+// CHECK:         amdaie.core(%[[TILE_3]], in : [%{{.*}}, %{{.*}}, %[[DMA_CPY_ND_L2_TO_L1_3]]], out :
+// CHECK:           linalg.generic
+// CHECK:           amdaie.logicalobjectfifo.access(%[[L1_OBJECTFIFO_1]], Read)
+// CHECK:           %[[SECOND_READ:.*]] = amdaie.logicalobjectfifo.access(%[[L1_OBJECTFIFO_1]], Read)
+// CHECK:           linalg.generic
+// CHECK-SAME:        %[[SECOND_READ]]
+// CHECK:           amdaie.end
+// CHECK:         }
+#map = affine_map<(d0) -> (d0 * 64)>
+#map1 = affine_map<(d0) -> (d0 * 64 + 32)>
+#map2 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>
+#map3 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>
+#map4 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>
+#map5 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>
+module {
+  func.func @combine_logical_objFifos(%arg0: !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>, %arg1: !amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xi32, 2 : i32>>, %arg2: !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>, %arg3: !amdaie.logicalobjectfifo<memref<2x2x32x32xi32, 1 : i32>>) {
+    %c3 = arith.constant 3 : index
+    %c2 = arith.constant 2 : index
+    %c1 = arith.constant 1 : index
+    %c0 = arith.constant 0 : index
+    %alloc = memref.alloc() : memref<2x1x32x32xi32, 1 : i32>
+    %alloc_0 = memref.alloc() : memref<1x2x32x32xi32, 1 : i32>
+    %alloc_1 = memref.alloc() : memref<1x1x32x32xi32, 1 : i32>
+    %alloc_2 = memref.alloc() : memref<1x1x32x32xi32, 1 : i32>
+    %alloc_3 = memref.alloc() : memref<1x1x32x32xi32, 1 : i32>
+    %alloc_4 = memref.alloc() : memref<1x1x32x32xi32, 1 : i32>
+    %alloc_5 = memref.alloc() : memref<128x128xi32>
+    %alloc_6 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
+    %tile = amdaie.tile(%c1, %c3)
+    %tile_7 = amdaie.tile(%c0, %c2)
+    %tile_8 = amdaie.tile(%c1, %c2)
+    %tile_9 = amdaie.tile(%c0, %c3)
+    %0 = amdaie.logicalobjectfifo.from_memref %alloc_1, {%tile} : memref<1x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x32x32xi32, 1 : i32>>
+    %1 = amdaie.logicalobjectfifo.from_memref %alloc_2, {%tile} : memref<1x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x32x32xi32, 1 : i32>>
+    %2 = amdaie.logicalobjectfifo.from_memref %alloc_3, {%tile} : memref<1x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x32x32xi32, 1 : i32>>
+    %3 = amdaie.logicalobjectfifo.from_memref %alloc_4, {%tile} : memref<1x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x32x32xi32, 1 : i32>>
+    %4 = amdaie.logicalobjectfifo.from_memref %alloc_5, {%tile} : memref<128x128xi32> -> !amdaie.logicalobjectfifo<memref<128x128xi32>>
+    scf.forall (%arg4, %arg5) in (2, 2) {
+      %5 = affine.apply #map(%arg5)
+      %6 = affine.apply #map1(%arg5)
+      %7 = affine.apply #map(%arg4)
+      %8 = affine.apply #map1(%arg4)
+      %9 = amdaie.dma_cpy_nd(%0[0, 0, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1], %4[0, 0, %7, %5] [1, 1, 32, 32] [4096, 32, 128, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x32x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<128x128xi32>>)
+      %10 = amdaie.dma_cpy_nd(%1[0, 0, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1], %4[0, 0, %7, %6] [1, 1, 32, 32] [4096, 32, 128, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x32x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<128x128xi32>>)
+      %11 = amdaie.dma_cpy_nd(%2[0, 0, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1], %4[0, 0, %8, %5] [1, 1, 32, 32] [4096, 32, 128, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x32x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<128x128xi32>>)
+      %12 = amdaie.dma_cpy_nd(%3[0, 0, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1], %4[0, 0, %8, %6] [1, 1, 32, 32] [4096, 32, 128, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x32x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<128x128xi32>>)
+      %13 = amdaie.logicalobjectfifo.from_memref %alloc, {%tile} : memref<2x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x1x32x32xi32, 1 : i32>>
+      %14 = amdaie.logicalobjectfifo.from_memref %alloc_0, {%tile} : memref<1x2x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x2x32x32xi32, 1 : i32>>
+      %15 = amdaie.dma_cpy_nd(%arg0[0, 0, 0, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1024, 1024, 256, 32, 8, 1], %13[1, 0, 0, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1024, 1024, 8, 128, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<2x1x32x32xi32, 1 : i32>>)
+      %16 = amdaie.dma_cpy_nd(%arg1[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [1024, 1024, 128, 32, 4, 1], %14[0, 1, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [2048, 1024, 4, 256, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<1x2x32x32xi32, 1 : i32>>)
+      %17 = amdaie.dma_cpy_nd(%arg3[1, 1, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1], %arg2[0, 0, 0, 0] [8, 4, 8, 4] [16, 4, 128, 1]) : (!amdaie.logicalobjectfifo<memref<2x2x32x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>)
+      %18 = amdaie.logicalobjectfifo.from_memref %alloc_6, {%tile_7} : memref<1x1x8x8x4x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>
+      %19 = amdaie.dma_cpy_nd(%18[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1], %0[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<1x1x32x32xi32, 1 : i32>>)
+      %20 = amdaie.core(%tile_7, in : [%15, %16, %19], out : [%17]) {
+        %30 = amdaie.logicalobjectfifo.access(%arg0, Read) : !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>> -> memref<1x1x4x8x4x8xi32, 2 : i32>
+        %31 = amdaie.logicalobjectfifo.access(%arg1, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xi32, 2 : i32>> -> memref<1x1x8x4x8x4xi32, 2 : i32>
+        %32 = amdaie.logicalobjectfifo.access(%arg2, None) : !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>> -> memref<1x1x8x8x4x4xi32, 2 : i32>
+        linalg.generic {indexing_maps = [#map2, #map3, #map4], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%30, %31 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%32 : memref<1x1x8x8x4x4xi32, 2 : i32>) {
+        ^bb0(%in: i32, %in_10: i32, %out: i32):
+          %35 = arith.muli %in, %in_10 : i32
+          %36 = arith.addi %out, %35 : i32
+          linalg.yield %36 : i32
+        }
+        %33 = amdaie.logicalobjectfifo.access(%18, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>> -> memref<1x1x8x8x4x4xi32, 2 : i32>
+        %34 = amdaie.logicalobjectfifo.access(%arg2, Write) : !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>> -> memref<1x1x8x8x4x4xi32, 2 : i32>
+        linalg.generic {indexing_maps = [#map5, #map5, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%32, %33 : memref<1x1x8x8x4x4xi32, 2 : i32>, memref<1x1x8x8x4x4xi32, 2 : i32>) outs(%34 : memref<1x1x8x8x4x4xi32, 2 : i32>) {
+        ^bb0(%in: i32, %in_10: i32, %out: i32):
+          %35 = arith.addi %in, %in_10 : i32
+          linalg.yield %35 : i32
+        }
+        amdaie.end
+      }
+      %21 = amdaie.logicalobjectfifo.from_memref %alloc_6, {%tile} : memref<1x1x8x8x4x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>
+      %22 = amdaie.dma_cpy_nd(%21[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1], %1[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<1x1x32x32xi32, 1 : i32>>)
+      %23 = amdaie.core(%tile, in : [%15, %16, %22], out : [%17]) {
+        %30 = amdaie.logicalobjectfifo.access(%arg0, Read) : !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>> -> memref<1x1x4x8x4x8xi32, 2 : i32>
+        %31 = amdaie.logicalobjectfifo.access(%arg1, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xi32, 2 : i32>> -> memref<1x1x8x4x8x4xi32, 2 : i32>
+        %32 = amdaie.logicalobjectfifo.access(%arg2, None) : !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>> -> memref<1x1x8x8x4x4xi32, 2 : i32>
+        linalg.generic {indexing_maps = [#map2, #map3, #map4], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%30, %31 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%32 : memref<1x1x8x8x4x4xi32, 2 : i32>) {
+        ^bb0(%in: i32, %in_10: i32, %out: i32):
+          %35 = arith.muli %in, %in_10 : i32
+          %36 = arith.addi %out, %35 : i32
+          linalg.yield %36 : i32
+        }
+        %33 = amdaie.logicalobjectfifo.access(%21, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>> -> memref<1x1x8x8x4x4xi32, 2 : i32>
+        %34 = amdaie.logicalobjectfifo.access(%arg2, Write) : !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>> -> memref<1x1x8x8x4x4xi32, 2 : i32>
+        linalg.generic {indexing_maps = [#map5, #map5, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%32, %33 : memref<1x1x8x8x4x4xi32, 2 : i32>, memref<1x1x8x8x4x4xi32, 2 : i32>) outs(%34 : memref<1x1x8x8x4x4xi32, 2 : i32>) {
+        ^bb0(%in: i32, %in_10: i32, %out: i32):
+          %35 = arith.addi %in, %in_10 : i32
+          linalg.yield %35 : i32
+        }
+        amdaie.end
+      }
+      %24 = amdaie.logicalobjectfifo.from_memref %alloc_6, {%tile_8} : memref<1x1x8x8x4x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>
+      %25 = amdaie.dma_cpy_nd(%24[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1], %2[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<1x1x32x32xi32, 1 : i32>>)
+      %26 = amdaie.core(%tile_8, in : [%15, %16, %25], out : [%17]) {
+        %30 = amdaie.logicalobjectfifo.access(%arg0, Read) : !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>> -> memref<1x1x4x8x4x8xi32, 2 : i32>
+        %31 = amdaie.logicalobjectfifo.access(%arg1, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xi32, 2 : i32>> -> memref<1x1x8x4x8x4xi32, 2 : i32>
+        %32 = amdaie.logicalobjectfifo.access(%arg2, None) : !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>> -> memref<1x1x8x8x4x4xi32, 2 : i32>
+        linalg.generic {indexing_maps = [#map2, #map3, #map4], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%30, %31 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%32 : memref<1x1x8x8x4x4xi32, 2 : i32>) {
+        ^bb0(%in: i32, %in_10: i32, %out: i32):
+          %35 = arith.muli %in, %in_10 : i32
+          %36 = arith.addi %out, %35 : i32
+          linalg.yield %36 : i32
+        }
+        %33 = amdaie.logicalobjectfifo.access(%24, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>> -> memref<1x1x8x8x4x4xi32, 2 : i32>
+        %34 = amdaie.logicalobjectfifo.access(%arg2, Write) : !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>> -> memref<1x1x8x8x4x4xi32, 2 : i32>
+        linalg.generic {indexing_maps = [#map5, #map5, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%32, %33 : memref<1x1x8x8x4x4xi32, 2 : i32>, memref<1x1x8x8x4x4xi32, 2 : i32>) outs(%34 : memref<1x1x8x8x4x4xi32, 2 : i32>) {
+        ^bb0(%in: i32, %in_10: i32, %out: i32):
+          %35 = arith.addi %in, %in_10 : i32
+          linalg.yield %35 : i32
+        }
+        amdaie.end
+      }
+      %27 = amdaie.logicalobjectfifo.from_memref %alloc_6, {%tile_9} : memref<1x1x8x8x4x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>
+      %28 = amdaie.dma_cpy_nd(%27[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1], %3[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<1x1x32x32xi32, 1 : i32>>)
+      %29 = amdaie.core(%tile_9, in : [%15, %16, %28], out : [%17]) {
+        %30 = amdaie.logicalobjectfifo.access(%arg0, Read) : !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>> -> memref<1x1x4x8x4x8xi32, 2 : i32>
+        %31 = amdaie.logicalobjectfifo.access(%arg1, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xi32, 2 : i32>> -> memref<1x1x8x4x8x4xi32, 2 : i32>
+        %32 = amdaie.logicalobjectfifo.access(%arg2, None) : !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>> -> memref<1x1x8x8x4x4xi32, 2 : i32>
+        linalg.generic {indexing_maps = [#map2, #map3, #map4], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%30, %31 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%32 : memref<1x1x8x8x4x4xi32, 2 : i32>) {
+        ^bb0(%in: i32, %in_10: i32, %out: i32):
+          %35 = arith.muli %in, %in_10 : i32
+          %36 = arith.addi %out, %35 : i32
+          linalg.yield %36 : i32
+        }
+        %33 = amdaie.logicalobjectfifo.access(%27, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>> -> memref<1x1x8x8x4x4xi32, 2 : i32>
+        %34 = amdaie.logicalobjectfifo.access(%arg2, Write) : !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>> -> memref<1x1x8x8x4x4xi32, 2 : i32>
+        linalg.generic {indexing_maps = [#map5, #map5, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%32, %33 : memref<1x1x8x8x4x4xi32, 2 : i32>, memref<1x1x8x8x4x4xi32, 2 : i32>) outs(%34 : memref<1x1x8x8x4x4xi32, 2 : i32>) {
+        ^bb0(%in: i32, %in_10: i32, %out: i32):
+          %35 = arith.addi %in, %in_10 : i32
+          linalg.yield %35 : i32
+        }
+        amdaie.end
+      }
+    } {mapping = [#gpu.block<y>, #gpu.block<x>]}
+    memref.dealloc %alloc : memref<2x1x32x32xi32, 1 : i32>
+    memref.dealloc %alloc_6 : memref<1x1x8x8x4x4xi32, 2 : i32>
+    memref.dealloc %alloc_0 : memref<1x2x32x32xi32, 1 : i32>
+    memref.dealloc %alloc_5 : memref<128x128xi32>
+    memref.dealloc %alloc_1 : memref<1x1x32x32xi32, 1 : i32>
+    memref.dealloc %alloc_2 : memref<1x1x32x32xi32, 1 : i32>
+    memref.dealloc %alloc_3 : memref<1x1x32x32xi32, 1 : i32>
+    memref.dealloc %alloc_4 : memref<1x1x32x32xi32, 1 : i32>
+    return
+  }
+}