From 3a6f183c6213ca8b8b3655f3297c654306f62bdc Mon Sep 17 00:00:00 2001
From: Abhishek Varma <abhvarma@amd.com>
Date: Mon, 2 Sep 2024 19:21:01 +0530
Subject: [PATCH] [ObjectFifo] Create a new pass to split logical objectFifos 
 (#659)

-- This commit introduces a new pass
`--iree-amdaie-split-logical-objectfifos-for-connection-reuse` to
split logical objectFifos for dealing with Matmul+Elementwise.
-- Also contains a utility to check whether splitting can be performed.
-- It addresses sub-action 2 as well from
https://github.com/nod-ai/iree-amd-aie/issues/644

Signed-off-by: Abhishek Varma <abhvarma@amd.com>
---
 .../AMDAIELogicalObjFifoSplittingUtils.cpp    |  430 +++++
 .../AMDAIELogicalObjFifoSplittingUtils.h      |   23 +
 ...SplitLogicalObjFifosForConnectionReuse.cpp |   71 +
 .../iree-amd-aie/Transforms/CMakeLists.txt    |    2 +
 .../iree-amd-aie/Transforms/PassDetail.h      |    1 +
 .../iree-amd-aie/Transforms/Passes.cpp        |    1 +
 .../AMD-AIE/iree-amd-aie/Transforms/Passes.h  |    3 +
 .../AMD-AIE/iree-amd-aie/Transforms/Passes.td |    6 +
 .../Transforms/test/CMakeLists.txt            |    1 +
 ..._logicalobjfifos_for_connection_reuse.mlir | 1398 +++++++++++++++++
 10 files changed, 1936 insertions(+)
 create mode 100644 compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELogicalObjFifoSplittingUtils.cpp
 create mode 100644 compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELogicalObjFifoSplittingUtils.h
 create mode 100644 compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitLogicalObjFifosForConnectionReuse.cpp
 create mode 100644 compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/split_logicalobjfifos_for_connection_reuse.mlir
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELogicalObjFifoSplittingUtils.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELogicalObjFifoSplittingUtils.cpp
new file mode 100644
index 000000000..d09a9e746
--- /dev/null
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELogicalObjFifoSplittingUtils.cpp
@@ -0,0 +1,430 @@
+// Copyright 2024 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "AMDAIELogicalObjFifoSplittingUtils.h"
+
+#include <numeric>
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/Support/Debug.h"
+#include "mlir/Dialect/Affine/IR/AffineOps.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Dialect/MemRef/IR/MemRef.h"
+#include "mlir/IR/BuiltinTypes.h"
+#include "mlir/IR/Iterators.h"
+#include "mlir/IR/Operation.h"
+
+#define DEBUG_TYPE "iree-amdaie-logicalobjfifo-splitting-utils"
+
+namespace mlir::iree_compiler::AMDAIE {
+
+/// Succeeds iff `splitDimsSetForL2` is exactly the sequence 0, 1, ..., n-1.
+static LogicalResult checkIsRangeFromZero(
+    SmallVector<size_t> &splitDimsSetForL2) {
+  for (size_t idx = 0, e = splitDimsSetForL2.size(); idx < e; ++idx) {
+    if (splitDimsSetForL2[idx] != idx) return failure();
+  }
+  return success();
+}
+
+/// This utility helps to perform the computation of offsets for L3 source.
+///
+/// Example:
+/// For L3 -> L2 DmaCpyNd :-
+/// From offset (0,0) we are extracting one 4x4 memref.
+///                _______
+///               |. . . .|
+///               |. . . .|
+///               |. . . .|
+///               |. . . .|
+///               ---------
+/// After split we will extract four 2x2 memrefs.
+/// So, the corresponding offsets will be :-
+/// 1. Offset (0,0) - extract 2x2 memref
+///       ___
+///      |. .|. .
+///      |. .|. .
+///      -----
+///       . . . .
+///       . . . .
+/// 2. Offset (0,2) - extract 2x2 memref
+///           ___
+///       . .|. .|
+///       . .|. .|
+///          -----
+///       . . . .
+///       . . . .
+/// 3. Offset (2,0) - extract 2x2 memref
+///       . . . .
+///       . . . .
+///       ___
+///      |. .|. .
+///      |. .|. .
+///      -----
+/// 4. Offset (2,2) - extract 2x2 memref
+///       . . . .
+///       . . . .
+///           ___
+///       . .|. .|
+///       . .|. .|
+///          -----
+static FailureOr<OpFoldResult> updateL3SourceOffset(IRRewriter &rewriter,
+                                                    OpFoldResult oldL3Offset,
+                                                    int64_t offsetToAdd,
+                                                    MLIRContext *context) {
+  // Builds the single-result map `expr + offsetToAdd`. The dim/symbol counts
+  // must match the operands the new affine.apply is created with.
+  auto createAffineMap = [&](AffineExpr expr, unsigned numDims,
+                             unsigned numSymbols) -> AffineMap {
+    AffineExpr newAffineExpr = expr + offsetToAdd;
+    return AffineMap::get(numDims, numSymbols, {newAffineExpr}, context);
+  };
+  OpFoldResult newL3AsSourceOffset;
+  OpBuilder::InsertionGuard guard(rewriter);
+  if (auto l3SourceOffsetAttr = dyn_cast<Attribute>(oldL3Offset)) {
+    // Static offset: fold the addition into a new index attribute.
+    int64_t newOffset =
+        cast<IntegerAttr>(l3SourceOffsetAttr).getInt() + offsetToAdd;
+    newL3AsSourceOffset = rewriter.getIndexAttr(newOffset);
+  } else {
+    auto l3SourceOffsetVal = cast<Value>(oldL3Offset);
+    if (auto blockArg = dyn_cast<BlockArgument>(l3SourceOffsetVal)) {
+      // Block argument: emit `arg + offsetToAdd` at the start of its block.
+      Operation *ownerOfBlockArg = blockArg.getOwner()->getParentOp();
+      rewriter.setInsertionPointToStart(blockArg.getOwner());
+      AffineMap newAffineMap = createAffineMap(
+          rewriter.getAffineDimExpr(0), /*numDims=*/1, /*numSymbols=*/0);
+      auto newApplyOp = rewriter.create<affine::AffineApplyOp>(
+          ownerOfBlockArg->getLoc(), newAffineMap, l3SourceOffsetVal);
+      newL3AsSourceOffset = newApplyOp.getResult();
+    } else {
+      Operation *defOpOfL3SourceOffset = l3SourceOffsetVal.getDefiningOp();
+      Location loc = defOpOfL3SourceOffset->getLoc();
+      rewriter.setInsertionPoint(defOpOfL3SourceOffset);
+      if (auto applyOp =
+              dyn_cast<affine::AffineApplyOp>(defOpOfL3SourceOffset)) {
+        // Keep the old map's dim/symbol counts so its operands still match.
+        AffineMap oldMap = applyOp.getAffineMap();
+        AffineMap newAffineMap = createAffineMap(
+            oldMap.getResult(0), oldMap.getNumDims(), oldMap.getNumSymbols());
+        auto newApplyOp = rewriter.create<affine::AffineApplyOp>(
+            loc, newAffineMap, applyOp.getMapOperands());
+        newL3AsSourceOffset = newApplyOp.getResult();
+      } else if (auto constantOffset = getConstantIntValue(l3SourceOffsetVal)) {
+        int64_t newOffset = *constantOffset + offsetToAdd;
+        newL3AsSourceOffset = rewriter.getIndexAttr(newOffset);
+      } else {
+        return failure();
+      }
+    }
+  }
+  return newL3AsSourceOffset;
+}
+
+/// Input (`l2ToL1DmaOps`) and results (the remaining fields) of
+/// `checkWhetherSplitIsPossible`, consumed by `splitLogicalObjectFifos`.
+struct SplittingLogicalObjectFifoData {
+  SmallVector<AMDAIE::DmaCpyNdOp> l2ToL1DmaOps;
+  SmallVector<size_t> splitDimsForL2;
+  SmallVector<size_t> nonSplitDimsForL2;
+  AMDAIE::DmaCpyNdOp l3ToL2DmaOp;
+};
+
+/// Checks whether the logicalobjectfifo read by `l2ToL1DmaOps` can be split.
+/// On success, fills in the split/non-split dims and the L3->L2 DMA op of
+/// `splittingLogicalObjectFifoData`; otherwise returns failure.
+static LogicalResult checkWhetherSplitIsPossible(
+    SplittingLogicalObjectFifoData &splittingLogicalObjectFifoData) {
+  SmallVector<AMDAIE::DmaCpyNdOp> &l2ToL1DmaOps =
+      splittingLogicalObjectFifoData.l2ToL1DmaOps;
+
+  if (l2ToL1DmaOps.empty()) return failure();
+
+  SmallVector<OpFoldResult> baseSourceOffsets =
+      l2ToL1DmaOps[0].getSourceMixedOffsets();
+  LogicalObjectFifoFromMemrefOp sourceObjectFifo =
+      l2ToL1DmaOps[0].getSourceObjectFifo();
+  auto sourceAllocOp =
+      sourceObjectFifo.getMemref().getDefiningOp<memref::AllocOp>();
+  if (!sourceAllocOp) {
+    LLVM_DEBUG(llvm::dbgs() << "expected alloc op as the defining op of "
+                            << sourceObjectFifo << "\n");
+    return failure();
+  }
+
+  // We will now capture those dimensions where L2 memory was split. The way we
+  // do this is by checking all L2->L1 DmaOps' source offset and marking those
+  // dimensions which are not equal to at least one of the source offsets.
+  DenseSet<size_t> splitDimsSetForL2;
+  SmallVector<size_t> splitDimsForL2;
+  for (unsigned i = 1, n = l2ToL1DmaOps.size(); i < n; i++) {
+    if (l2ToL1DmaOps[i].getSourceObjectFifo() != sourceObjectFifo) {
+      LLVM_DEBUG(llvm::dbgs()
+                 << l2ToL1DmaOps[i] << " does not have " << sourceObjectFifo
+                 << " as the source objectfifo\n");
+      return failure();
+    }
+    SmallVector<OpFoldResult> sourceOffsets =
+        l2ToL1DmaOps[i].getSourceMixedOffsets();
+    for (unsigned j = 0, m = baseSourceOffsets.size(); j < m; j++) {
+      if (baseSourceOffsets[j] != sourceOffsets[j] &&
+          !splitDimsSetForL2.contains(j)) {
+        splitDimsForL2.push_back(j);
+        splitDimsSetForL2.insert(j);
+      }
+    }
+  }
+  std::sort(splitDimsForL2.begin(), splitDimsForL2.end());
+
+  if (failed(checkIsRangeFromZero(splitDimsForL2))) {
+    LLVM_DEBUG(llvm::dbgs() << "cannot split L2 logicalobjectfifo because of "
+                               "non-contiguous split dimensions inferred\n");
+    return failure();
+  }
+
+  // Fetch the L3 -> L2 Dma Op corresponding to the L2 buffer as target.
+  SmallVector<AMDAIE::DmaCpyNdOp> l3ToL2DmaOps;
+  for (Operation *objFifoUserOp : sourceObjectFifo->getUsers()) {
+    // Users that are not DmaCpyNd ops make `dyn_cast` return null; skip them.
+    if (auto dmaOp = dyn_cast<AMDAIE::DmaCpyNdOp>(objFifoUserOp);
+        dmaOp && dmaOp.getTargetObjectFifo() == sourceObjectFifo) {
+      l3ToL2DmaOps.push_back(dmaOp);
+    }
+  }
+  if (l3ToL2DmaOps.size() == 0) {
+    LLVM_DEBUG(llvm::dbgs() << "no corresponding L3->L2 dma op found for "
+                            << sourceObjectFifo << "\n");
+    return failure();
+  }
+  if (l3ToL2DmaOps.size() > 1) {
+    LLVM_DEBUG(llvm::dbgs() << "found more than one L3->L2 dma ops for "
+                            << sourceObjectFifo << "\n");
+    return failure();
+  }
+  AMDAIE::DmaCpyNdOp l3ToL2DmaOp = l3ToL2DmaOps[0];
+  if ((l3ToL2DmaOp.getTargetMixedOffsets().size() !=
+       l3ToL2DmaOp.getSourceMixedOffsets().size()) ||
+      (l3ToL2DmaOp.getTargetMixedSizes().size() !=
+       l3ToL2DmaOp.getSourceMixedSizes().size()) ||
+      (l3ToL2DmaOp.getTargetMixedStrides().size() !=
+       l3ToL2DmaOp.getSourceMixedStrides().size())) {
+    LLVM_DEBUG(llvm::dbgs() << "dimensionality of source and target's "
+                               "offset/size/stride found different for "
+                            << l3ToL2DmaOp << "\n");
+    return failure();
+  }
+
+  SmallVector<OpFoldResult, 4> staticL2AsTargetSizes =
+      l3ToL2DmaOp.getTargetMixedSizes();
+  SmallVector<size_t> nonSplitDimsForL2(staticL2AsTargetSizes.size() -
+                                        splitDimsForL2.size());
+  std::iota(nonSplitDimsForL2.begin(), nonSplitDimsForL2.end(),
+            splitDimsForL2.size());
+
+  for (AMDAIE::DmaCpyNdOp l2ToL1DmaOp : l2ToL1DmaOps) {
+    SmallVector<OpFoldResult, 6> staticL2AsSourceOffsets =
+        l2ToL1DmaOp.getSourceMixedOffsets();
+    for (auto &&[splitDim, nonSplitdim] :
+         llvm::zip_equal(splitDimsForL2, nonSplitDimsForL2)) {
+      std::optional<int64_t> constantVal =
+          getConstantIntValue(staticL2AsSourceOffsets[splitDim]);
+      if (!constantVal) {
+        LLVM_DEBUG(llvm::dbgs()
+                   << "found a non-constant value for source offset at dim "
+                   << splitDim << " for " << l2ToL1DmaOp << "\n");
+        return failure();
+      }
+      constantVal = getConstantIntValue(staticL2AsTargetSizes[nonSplitdim]);
+      if (!constantVal) {
+        LLVM_DEBUG(llvm::dbgs()
+                   << "found a non-constant value for target size at dim "
+                   << nonSplitdim << " for " << l3ToL2DmaOp << "\n");
+        return failure();
+      }
+    }
+  }
+  splittingLogicalObjectFifoData.splitDimsForL2 = splitDimsForL2;
+  splittingLogicalObjectFifoData.nonSplitDimsForL2 = nonSplitDimsForL2;
+  splittingLogicalObjectFifoData.l3ToL2DmaOp = l3ToL2DmaOp;
+  return success();
+}
+
+// Given a vector of L2->L1 Dma ops, perform the splitting :-
+// 1. Check if the splitting can be performed or not. If not possible, bail out.
+// 2. For the split dimension inferred set offset = 0 and size as 1 for L2 and
+//    L3.
+// 3. Now traverse each L2->L1 Dma op and perform the following :-
+//    a) Create a new L2 AllocOp based on the updated size (step 2 above) and
+//       create a logicalobjectfifo using the same.
+//    b) Split L3->L2 Dma op.
+//    c) Split L2->L1 Dma op.
+// 4. Delete old L2->L1, L3->L2 and corresponding AllocOps.
+LogicalResult splitLogicalObjectFifos(
+    IRRewriter &rewriter, SmallVector<AMDAIE::DmaCpyNdOp> &l2ToL1DmaOps,
+    MLIRContext *context) {
+  SplittingLogicalObjectFifoData splittingLogicalObjectFifoData;
+  splittingLogicalObjectFifoData.l2ToL1DmaOps = l2ToL1DmaOps;
+  if (failed(checkWhetherSplitIsPossible(splittingLogicalObjectFifoData))) {
+    LLVM_DEBUG(llvm::dbgs()
+               << "Cannot perform splitting of logicalobjectfifos");
+    return success();
+  }
+  OpBuilder::InsertionGuard guard(rewriter);
+  SmallVector<size_t> splitDimsForL2 =
+      splittingLogicalObjectFifoData.splitDimsForL2;
+  SmallVector<size_t> nonSplitDimsForL2 =
+      splittingLogicalObjectFifoData.nonSplitDimsForL2;
+  AMDAIE::DmaCpyNdOp l3ToL2DmaOp = splittingLogicalObjectFifoData.l3ToL2DmaOp;
+
+  LogicalObjectFifoFromMemrefOp sourceObjectFifo =
+      l2ToL1DmaOps[0].getSourceObjectFifo();
+  auto sourceAllocOp =
+      sourceObjectFifo.getMemref().getDefiningOp<memref::AllocOp>();
+
+  DenseSet<Operation *> toBeErased;
+  toBeErased.insert(l3ToL2DmaOp);
+  toBeErased.insert(sourceAllocOp);
+  toBeErased.insert(sourceObjectFifo);
+
+  SmallVector<OpFoldResult, 4> staticL2AsTargetOffsets =
+      l3ToL2DmaOp.getTargetMixedOffsets();
+  SmallVector<OpFoldResult, 4> staticL2AsTargetSizes =
+      l3ToL2DmaOp.getTargetMixedSizes();
+  SmallVector<int64_t, 4> l2ShapeAsTarget = llvm::to_vector(
+      cast<MemRefType>(l3ToL2DmaOp.getTargetObjectFifo().getMemref().getType())
+          .getShape());
+  SmallVector<OpFoldResult, 4> staticL3AsSourceOffsets =
+      l3ToL2DmaOp.getSourceMixedOffsets();
+  SmallVector<OpFoldResult, 4> staticL3AsSourceSizes =
+      l3ToL2DmaOp.getSourceMixedSizes();
+  OpFoldResult zeroVal = getAsIndexOpFoldResult(context, 0);
+  OpFoldResult oneVal = getAsIndexOpFoldResult(context, 1);
+  // Set offset = 0 / size = 1 on the split dims (L2 as target, L3 as source);
+  // identical for every L3->L2 split. NOTE(review): the loop below shadows
+  // `staticL3AsSourceOffsets`, so the L3 offset reset done here is never used.
+  for (size_t dim : splitDimsForL2) {
+    staticL2AsTargetOffsets[dim] = zeroVal;
+    staticL2AsTargetSizes[dim] = oneVal;
+    staticL3AsSourceOffsets[dim] = zeroVal;
+    staticL3AsSourceSizes[dim] = oneVal;
+    l2ShapeAsTarget[dim] = 1;
+  }
+
+  // Traverse each L2->L1 DmaCpyNd op and split them.
+  for (AMDAIE::DmaCpyNdOp l2ToL1DmaOp : l2ToL1DmaOps) {
+    SmallVector<OpFoldResult, 6> staticL2AsSourceOffsets =
+        l2ToL1DmaOp.getSourceMixedOffsets();
+    SmallVector<OpFoldResult, 6> staticL2AsSourceSizes =
+        l2ToL1DmaOp.getSourceMixedSizes();
+
+    // Now we'll create a new L2 buffer based on the new shape inferred earlier
+    // via `l2ShapeAsTarget`.
+    rewriter.setInsertionPoint(sourceAllocOp);
+    LogicalObjectFifoFromMemrefOp targetObjectFifo =
+        l2ToL1DmaOp.getTargetObjectFifo();
+    Value targetAllocOp = targetObjectFifo.getMemref();
+    auto oldSourceMemRefType = cast<MemRefType>(sourceAllocOp.getType());
+    auto targetMemRefType = cast<MemRefType>(targetAllocOp.getType());
+    MemRefType newAllocType = MemRefType::get(
+        l2ShapeAsTarget, targetMemRefType.getElementType(),
+        MemRefLayoutAttrInterface{}, oldSourceMemRefType.getMemorySpace());
+    auto newAllocOp = rewriter.create<memref::AllocOp>(rewriter.getUnknownLoc(),
+                                                       newAllocType);
+    auto newDeallocOp = rewriter.create<memref::DeallocOp>(
+        rewriter.getUnknownLoc(), newAllocOp);
+    newDeallocOp->moveBefore(&newAllocOp->getBlock()->back());
+    auto type = cast<MemRefType>(newAllocOp.getType());
+    // Create new logicalobjectfifo.from_memref for the newly created L2 buffer.
+    rewriter.setInsertionPoint(l2ToL1DmaOp.getSourceObjectFifo());
+    auto source = rewriter.create<AMDAIE::LogicalObjectFifoFromMemrefOp>(
+        rewriter.getUnknownLoc(), LogicalObjectFifoType::get(type),
+        newAllocOp.getResult(), sourceObjectFifo.getTiles());
+
+    // --------------------------------------------
+    // ---------- L3 -> L2 splitting --------------
+    // --------------------------------------------
+    // Update L3 source offsets for non-split dimensions. Refer doc comment of
+    // `updateL3SourceOffset` for the computation rationale involved.
+    SmallVector<OpFoldResult, 4> staticL3AsSourceOffsets =
+        l3ToL2DmaOp.getSourceMixedOffsets();
+    for (auto &&[splitDim, nonSplitdim] :
+         llvm::zip_equal(splitDimsForL2, nonSplitDimsForL2)) {
+      std::optional<int64_t> constantOffset =
+          getConstantIntValue(staticL2AsSourceOffsets[splitDim]);
+      if (!constantOffset) {
+        return l2ToL1DmaOp->emitOpError()
+               << "found a non-constant value for source offset at dim "
+               << splitDim;
+      }
+      std::optional<int64_t> constantSize =
+          getConstantIntValue(staticL2AsTargetSizes[nonSplitdim]);
+      if (!constantSize) {
+        return l3ToL2DmaOp->emitOpError()
+               << "found a non-constant value for target size at dim "
+               << nonSplitdim;
+      }
+      int64_t offsetToAdd = constantOffset.value() * constantSize.value();
+      FailureOr<OpFoldResult> newOffset = updateL3SourceOffset(
+          rewriter, staticL3AsSourceOffsets[nonSplitdim], offsetToAdd, context);
+      if (failed(newOffset)) {
+        // TODO: Ideally we should be able to handle even +, -, *, /, etc.
+        //       But handle this later (if at all!) as such cases might not
+        //       arise.
+        return l3ToL2DmaOp->emitOpError()
+               << "Unhandled expression for source offset at dim "
+               << nonSplitdim;
+      }
+      staticL3AsSourceOffsets[nonSplitdim] = *newOffset;
+    }
+    // Create new L3 -> L2 Dma Op.
+    rewriter.setInsertionPoint(l3ToL2DmaOp);
+    rewriter.create<AMDAIE::DmaCpyNdOp>(
+        l3ToL2DmaOp.getLoc(), source, llvm::ArrayRef(staticL2AsTargetOffsets),
+        llvm::ArrayRef(staticL2AsTargetSizes),
+        l3ToL2DmaOp.getTargetMixedStrides(), l3ToL2DmaOp.getSource(),
+        llvm::ArrayRef(staticL3AsSourceOffsets),
+        llvm::ArrayRef(staticL3AsSourceSizes),
+        l3ToL2DmaOp.getSourceMixedStrides());
+
+    // --------------------------------------------
+    // ---------- L2 -> L1 splitting --------------
+    // --------------------------------------------
+    // Update split dimensions' offset/size for L2 as source. Here we are
+    // setting offset = 0 and size = 1.
+    for (unsigned dim : splitDimsForL2) {
+      staticL2AsSourceOffsets[dim] = zeroVal;
+      staticL2AsSourceSizes[dim] = oneVal;
+    }
+
+    // Create new L2 -> L1 Input DmaOp.
+    rewriter.setInsertionPoint(l2ToL1DmaOp);
+    auto newL2ToL1DmaOp = rewriter.create<AMDAIE::DmaCpyNdOp>(
+        l2ToL1DmaOp.getLoc(), l2ToL1DmaOp.getTarget(),
+        l2ToL1DmaOp.getTargetMixedOffsets(), l2ToL1DmaOp.getTargetMixedSizes(),
+        l2ToL1DmaOp.getTargetMixedStrides(), source,
+        llvm::ArrayRef(staticL2AsSourceOffsets),
+        llvm::ArrayRef(staticL2AsSourceSizes),
+        l2ToL1DmaOp.getSourceMixedStrides());
+    rewriter.replaceOp(l2ToL1DmaOp, newL2ToL1DmaOp);
+
+    // Mark the dealloc of the old L2 buffer (if any) for erasure.
+    memref::DeallocOp oldDeallocOp;
+    for (Operation *userOp : sourceAllocOp->getUsers()) {
+      if (auto deallocUser = dyn_cast<memref::DeallocOp>(userOp))
+        oldDeallocOp = deallocUser;
+    }
+    if (oldDeallocOp) toBeErased.insert(oldDeallocOp);
+  }
+
+  for (Operation *op : toBeErased) {
+    op->dropAllUses();
+    rewriter.eraseOp(op);
+  }
+  
+  return success();
+}
+
+}  // namespace mlir::iree_compiler::AMDAIE
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELogicalObjFifoSplittingUtils.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELogicalObjFifoSplittingUtils.h
new file mode 100644
index 000000000..919004949
--- /dev/null
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELogicalObjFifoSplittingUtils.h
@@ -0,0 +1,23 @@
+// Copyright 2024 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_AMD_AIE_TRANSFORMS_AMDAIELOGICALOBJFIFOSPLITTINGUTILS_H_
+#define IREE_AMD_AIE_TRANSFORMS_AMDAIELOGICALOBJFIFOSPLITTINGUTILS_H_
+
+#include "iree-amd-aie/IR/AMDAIEOps.h"
+
+namespace mlir::iree_compiler::AMDAIE {
+
+/// Splits the L2 logicalobjectfifo read by `l2ToL1DmaOps`, together with its
+/// L3->L2 DMA op, into one new L2 buffer per L2->L1 DMA op. Returns success
+/// without modifying the IR when the split preconditions are not met.
+LogicalResult splitLogicalObjectFifos(
+    IRRewriter &rewriter, SmallVector<AMDAIE::DmaCpyNdOp> &l2ToL1DmaOps,
+    MLIRContext *context);
+
+}  // namespace mlir::iree_compiler::AMDAIE
+
+#endif
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitLogicalObjFifosForConnectionReuse.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitLogicalObjFifosForConnectionReuse.cpp
new file mode 100644
index 000000000..e6736a7c9
--- /dev/null
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitLogicalObjFifosForConnectionReuse.cpp
@@ -0,0 +1,71 @@
+// Copyright 2024 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#include "iree-amd-aie/IR/AMDAIEOps.h"
+#include "iree-amd-aie/Transforms/AMDAIELogicalObjFifoSplittingUtils.h"
+#include "iree-amd-aie/Transforms/Passes.h"
+#include "llvm/Support/Debug.h"
+#include "mlir/IR/Iterators.h"
+#include "mlir/Pass/Pass.h"
+
+#define DEBUG_TYPE "iree-amdaie-split-logical-objectfifos-for-connection-reuse"
+
+namespace mlir::iree_compiler::AMDAIE {
+
+namespace {
+
+/// Collects the input DmaCpyNd ops that are candidates for splitting.
+static SmallVector<AMDAIE::DmaCpyNdOp> fetchDmaCpyNdOpsToSplit(
+    ModuleOp moduleOp) {
+  // Only cores with exactly three input DMAs are considered for now, and only
+  // their third input DMA is gathered.
+  // TODO(avarma): We will generalize this later.
+  SmallVector<AMDAIE::DmaCpyNdOp> candidateDmaOps;
+  moduleOp.walk([&candidateDmaOps](AMDAIE::CoreOp coreOp) -> WalkResult {
+    SmallVector<Value> coreInputDmas = coreOp.getInputDmas();
+    if (coreInputDmas.size() != 3) return WalkResult::skip();
+    auto thirdInputDma = coreInputDmas[2].getDefiningOp<AMDAIE::DmaCpyNdOp>();
+    assert(thirdInputDma && "expected an amdaie.dma_cpy_nd op");
+    candidateDmaOps.push_back(thirdInputDma);
+    return WalkResult::advance();
+  });
+  return candidateDmaOps;
+}
+
+/// Module pass that runs `splitLogicalObjectFifos`; see `runOnOperation`.
+class AMDAIESplitLogicalObjFifosForConnectionReusePass
+    : public impl::AMDAIESplitLogicalObjFifosForConnectionReuseBase<
+          AMDAIESplitLogicalObjFifosForConnectionReusePass> {
+ public:
+  using AMDAIESplitLogicalObjFifosForConnectionReuseBase::
+      AMDAIESplitLogicalObjFifosForConnectionReuseBase;
+  void getDependentDialects(DialectRegistry &registry) const override {
+    registry.insert<AMDAIEDialect>();
+  }
+  void runOnOperation() override;
+};
+
+void AMDAIESplitLogicalObjFifosForConnectionReusePass::runOnOperation() {
+  MLIRContext *ctx = &getContext();
+  IRRewriter rewriter(ctx);
+
+  // Collect the candidate L2->L1 DMA ops of the module and split the
+  // logicalobjectfifos they read from; any failure fails the pass.
+  SmallVector<AMDAIE::DmaCpyNdOp> dmaOps =
+      fetchDmaCpyNdOpsToSplit(getOperation());
+  if (succeeded(splitLogicalObjectFifos(rewriter, dmaOps, ctx))) return;
+
+  LLVM_DEBUG(llvm::dbgs()
+             << "Failed to perform splitting of logicalobjectfifos");
+  signalPassFailure();
+}
+
+}  // namespace
+/// Creates the pass that splits logicalobjectfifos for connection reuse.
+std::unique_ptr<Pass> createAMDAIESplitLogicalObjFifosForConnectionReusePass() {
+  return std::make_unique<AMDAIESplitLogicalObjFifosForConnectionReusePass>();
+}
+
+}  // namespace mlir::iree_compiler::AMDAIE
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/CMakeLists.txt b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/CMakeLists.txt
index d2f21f7d5..673763df4 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/CMakeLists.txt
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/CMakeLists.txt
@@ -74,6 +74,7 @@ iree_cc_library(
     "AMDAIEInsertLoopsForVectorization.cpp"
     "AMDAIELinkExecutables.cpp"
     "AMDAIELocalizeLogicalObjectFifo.cpp"
+    "AMDAIELogicalObjFifoSplittingUtils.cpp"
     "AMDAIELowerExecutableTarget.cpp"
     "AMDAIELowerFuncArgs.cpp"
     "AMDAIELowerToAIE.cpp"
@@ -88,6 +89,7 @@ iree_cc_library(
     "AMDAIEPeelForLoop.cpp"
     "AMDAIEPropagateDataLayout.cpp"
     "AMDAIESinkIntoCore.cpp"
+    "AMDAIESplitLogicalObjFifosForConnectionReuse.cpp"
     "AMDAIETile.cpp"
     "AMDAIETileAndFuse.cpp"
     "AMDAIEUtils.cpp"
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h
index 9ef92c268..02918677e 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h
@@ -68,6 +68,7 @@ namespace mlir::iree_compiler::AMDAIE {
 #define GEN_PASS_DEF_AMDAIEPEELFORLOOP
 #define GEN_PASS_DEF_AMDAIEPROPAGATEDATALAYOUT
 #define GEN_PASS_DEF_AMDAIESINKINTOCORE
+#define GEN_PASS_DEF_AMDAIESPLITLOGICALOBJFIFOSFORCONNECTIONREUSE
 #define GEN_PASS_DEF_AMDAIETILE
 #define GEN_PASS_DEF_AMDAIETILEANDFUSE
 #define GEN_PASS_DEF_AMDAIEVECTORIZATION
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp
index 90b1bcfd8..729da7457 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp
@@ -594,6 +594,7 @@ void addAMDAIEObjectFifoLoweringPasses(OpPassManager &passManager) {
   passManager.addPass(createAMDAIEDistributeCoresAndObjectFifosPass());
   passManager.addPass(createCSEPass());
   passManager.addPass(createCanonicalizerPass());
+  passManager.addPass(createAMDAIESplitLogicalObjFifosForConnectionReusePass());
 
   passManager.addPass(createAMDAIEDmaToCircularDmaPass());
   passManager.addNestedPass<func::FuncOp>(createAMDAIECreateAIEWorkgroupPass());
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h
index 0a901bd5b..2632ee8fb 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h
@@ -224,6 +224,9 @@ std::unique_ptr<Pass> createAMDAIEPeelForLoopPass(
 /// Create a pass to sink all dependencies into `amdaie.core` operations.
 std::unique_ptr<Pass> createAMDAIESinkIntoCorePass();
 
+/// Create a pass to split logicalobjectfifos for connection reuse.
+std::unique_ptr<Pass> createAMDAIESplitLogicalObjFifosForConnectionReusePass();
+
 /// Create pass to tile TilingInterface operations.
 std::unique_ptr<Pass> createAMDAIETilePass(AMDAIETileOptions options = {});
 
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td
index 3da414445..fdc38eb12 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td
@@ -470,6 +470,12 @@ def AMDAIESinkIntoCore :
   let constructor = "mlir::iree_compiler::AMDAIE::createAMDAIESinkIntoCorePass()";
 }
 
+def AMDAIESplitLogicalObjFifosForConnectionReuse :
+  Pass<"iree-amdaie-split-logical-objectfifos-for-connection-reuse", "ModuleOp"> {
+  let summary = "Pass to split L2 buffers to share inputs of Matmul and Elementwise operations.";
+  let constructor = "mlir::iree_compiler::AMDAIE::createAMDAIESplitLogicalObjFifosForConnectionReusePass()";
+}
+
 def AMDAIETile :
     InterfacePass<"iree-amdaie-tile", "mlir::FunctionOpInterface"> {
   let summary = "Pass to tile TilingInterface operations.";
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt
index 319166c69..b09d08f37 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt
@@ -60,6 +60,7 @@ iree_lit_test_suite(
     "peel_for_loop.mlir"
     "propagate_data_layout.mlir"
     "sink_into_core.mlir"
+    "split_logicalobjfifos_for_connection_reuse.mlir"
     "tile_and_fuse_using_scf_for.mlir"
     "tile_and_fuse_using_scf_forall.mlir"
     "tile_copy_using_scf_for.mlir"
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/split_logicalobjfifos_for_connection_reuse.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/split_logicalobjfifos_for_connection_reuse.mlir
new file mode 100644
index 000000000..2e716dddd
--- /dev/null
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/split_logicalobjfifos_for_connection_reuse.mlir
@@ -0,0 +1,1398 @@
+// RUN: iree-opt --pass-pipeline="builtin.module(iree-amdaie-split-logical-objectfifos-for-connection-reuse,cse)" --split-input-file --verify-diagnostics %s | FileCheck %s
+
+// Glossary:
+// candidate core op : an amdaie.core op that has at least three input dma ops.
+// non-candidate core op : an amdaie.core op that has fewer than three input dma ops.
+
+// Test non-candidate core op: the only amdaie.core op has a single input dma op, so no L2 buffer is expected to be split.
+
+// CHECK-LABEL: @split_l2_buffer_no_candidate_core_op
+//   CHECK-NOT:     memref<1x1x32x32xi32, 1 : i32>
+#map = affine_map<(d0) -> (d0 * 64)>
+#map1 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>
+#map2 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>
+#map3 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>
+#map4 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>
+module {
+  func.func @split_l2_buffer_no_candidate_core_op(%arg0: !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>, %arg1: !amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xi32, 2 : i32>>, %arg2: !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>, %arg3: !amdaie.logicalobjectfifo<memref<2x2x32x32xi32, 1 : i32>>) {
+    %c3 = arith.constant 3 : index
+    %c2 = arith.constant 2 : index
+    %c1 = arith.constant 1 : index
+    %c0 = arith.constant 0 : index
+    %c0_i32 = arith.constant 0 : i32
+    %alloc = memref.alloc() : memref<2x1x32x32xi32, 1 : i32>
+    %alloc_1 = memref.alloc() : memref<2x2x32x32xi32, 1 : i32>
+    %alloc_2 = memref.alloc() : memref<128x128xi32>
+    %tile = amdaie.tile(%c1, %c3)
+    %0 = amdaie.logicalobjectfifo.from_memref %alloc_1, {%tile} : memref<2x2x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x2x32x32xi32, 1 : i32>>
+    %1 = amdaie.logicalobjectfifo.from_memref %alloc_2, {%tile} : memref<128x128xi32> -> !amdaie.logicalobjectfifo<memref<128x128xi32>>
+    scf.forall (%arg4, %arg5) in (2, 2) {
+      %2 = affine.apply #map(%arg5)
+      %3 = affine.apply #map(%arg4)
+      %4 = amdaie.dma_cpy_nd(%0[0, 0, 0, 0] [2, 2, 32, 32] [2048, 1024, 32, 1], %1[0, 0, %3, %2] [2, 2, 32, 32] [4096, 32, 128, 1]) : (!amdaie.logicalobjectfifo<memref<2x2x32x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<128x128xi32>>)
+      %tile_4 = amdaie.tile(%c1, %c3)
+      %5 = amdaie.logicalobjectfifo.from_memref %alloc, {%tile} : memref<2x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x1x32x32xi32, 1 : i32>>
+      %7 = amdaie.dma_cpy_nd(%arg0[0, 0, 0, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1024, 1024, 256, 32, 8, 1], %5[1, 0, 0, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1024, 1024, 8, 128, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<2x1x32x32xi32, 1 : i32>>)
+      %12 = amdaie.core(%tile_4, in : [%7], out : []) {
+        %13 = amdaie.logicalobjectfifo.access(%arg0, Read) : !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>> -> memref<1x1x4x8x4x8xi32, 2 : i32>
+        linalg.fill ins(%c0_i32 : i32) outs(%13 : memref<1x1x4x8x4x8xi32, 2 : i32>)
+        amdaie.end
+      }
+    } {mapping = [#gpu.block<y>, #gpu.block<x>]}
+    memref.dealloc %alloc : memref<2x1x32x32xi32, 1 : i32>
+    memref.dealloc %alloc_1 : memref<2x2x32x32xi32, 1 : i32>
+    memref.dealloc %alloc_2 : memref<128x128xi32>
+    return
+  }
+}
+
+// -----
+
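+// Test two candidate core ops: each has three input dma ops, and the L2 buffer they both read from is expected to be split into two buffers.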
+
+//   CHECK-DAG: #map = affine_map<(d0) -> (d0 * 64)>
+//   CHECK-DAG: #map1 = affine_map<(d0) -> (d0 * 64 + 32)>
+//       CHECK: @split_l2_buffer_two_core_ops
+//   CHECK-DAG:   %[[C0:.*]] = arith.constant 0 : index
+//   CHECK-DAG:   %[[C1:.*]] = arith.constant 1 : index
+//   CHECK-DAG:   %[[C2:.*]] = arith.constant 2 : index
+//   CHECK-DAG:   %[[C3:.*]] = arith.constant 3 : index
+//   CHECK-DAG:   %[[L3_ALLOC:.*]] = memref.alloc() : memref<128x128xi32>
+//   CHECK-DAG:   %[[L2_ALLOC_0:.*]] = memref.alloc() : memref<1x1x32x32xi32, 1 : i32>
+//   CHECK-DAG:   %[[L2_ALLOC_1:.*]] = memref.alloc() : memref<1x1x32x32xi32, 1 : i32>
+//   CHECK-DAG:   %[[L1_ALLOC:.*]] = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
+//   CHECK-DAG:   %[[TILE_0:.*]] = amdaie.tile(%[[C1]], %[[C3]])
+//   CHECK-DAG:   %[[TILE_1:.*]] = amdaie.tile(%[[C0]], %[[C2]])
+//       CHECK:   %[[L2_OBJECTFIFO_0:.*]] = amdaie.logicalobjectfifo.from_memref %[[L2_ALLOC_0]], {%[[TILE_0]]} :
+//  CHECK-SAME:         memref<1x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x32x32xi32, 1 : i32>>
+//       CHECK:   %[[L2_OBJECTFIFO_1:.*]] = amdaie.logicalobjectfifo.from_memref %[[L2_ALLOC_1]], {%[[TILE_0]]} :
+//  CHECK-SAME:         memref<1x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x32x32xi32, 1 : i32>>
+//       CHECK:   %[[L3_OBJECTFIFO:.*]] = amdaie.logicalobjectfifo.from_memref %[[L3_ALLOC]], {%[[TILE_0]]} :
+//  CHECK-SAME:         memref<128x128xi32> -> !amdaie.logicalobjectfifo<memref<128x128xi32>>
+//       CHECK:   scf.forall (%[[IV0:.*]], %[[IV1:.*]]) in (2, 2)
+//   CHECK-DAG:       %[[IV1_0:.*]] = affine.apply #map(%[[IV1]])
+//   CHECK-DAG:       %[[IV1_32:.*]] = affine.apply #map1(%[[IV1]])
+//   CHECK-DAG:       %[[IV0_0:.*]] = affine.apply #map(%[[IV0]])
+//   CHECK-DAG:       %[[IV0_32:.*]] = affine.apply #map1(%[[IV0]])
+//       CHECK:       %[[DMA_CPY_ND_L3_TO_L2_0:.*]] = amdaie.dma_cpy_nd(
+//  CHECK-SAME:                                         %[[L2_OBJECTFIFO_0]][0, 0, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1]
+//  CHECK-SAME:                                         %[[L3_OBJECTFIFO]][0, 0, %[[IV0_0]], %[[IV1_0]]] [1, 1, 32, 32] [4096, 32, 128, 1]
+//       CHECK:       %[[DMA_CPY_ND_L3_TO_L2_1:.*]] = amdaie.dma_cpy_nd(
+//  CHECK-SAME:                                         %[[L2_OBJECTFIFO_1]][0, 0, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1]
+//  CHECK-SAME:                                         %[[L3_OBJECTFIFO]][0, 0, %[[IV0_32]], %[[IV1_32]]] [1, 1, 32, 32] [4096, 32, 128, 1]
+//       CHECK:       amdaie.logicalobjectfifo.from_memref
+//       CHECK:       amdaie.logicalobjectfifo.from_memref
+//       CHECK:       amdaie.dma_cpy_nd
+//       CHECK:       amdaie.dma_cpy_nd
+//       CHECK:       %[[L1_OBJECTFIFO_0:.*]] = amdaie.logicalobjectfifo.from_memref %[[L1_ALLOC]], {%[[TILE_0]]}
+//       CHECK:       %[[DMA_CPY_ND_L2_TO_L1_0:.*]] = amdaie.dma_cpy_nd(
+//  CHECK-SAME:                                         %[[L1_OBJECTFIFO_0]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1] 
+//  CHECK-SAME:                                         %[[L2_OBJECTFIFO_0]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1]
+//       CHECK:       amdaie.core(%[[TILE_0]], in : [%{{.*}}, %{{.*}}, %[[DMA_CPY_ND_L2_TO_L1_0]]], out :
+//       CHECK:         linalg.generic
+//       CHECK:       }
+//       CHECK:       %[[L1_OBJECTFIFO_1:.*]] = amdaie.logicalobjectfifo.from_memref %[[L1_ALLOC]], {%[[TILE_1]]}
+//       CHECK:       %[[DMA_CPY_ND_L2_TO_L1_1:.*]] = amdaie.dma_cpy_nd(
+//  CHECK-SAME:                                         %[[L1_OBJECTFIFO_1]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1]
+//  CHECK-SAME:                                         %[[L2_OBJECTFIFO_1]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1]
+//       CHECK:       amdaie.core(%[[TILE_1]], in : [%{{.*}}, %{{.*}}, %[[DMA_CPY_ND_L2_TO_L1_1]]], out :
+//       CHECK:         linalg.generic
+//       CHECK:       }
+//       CHECK:   memref.dealloc %[[L2_ALLOC_0]] : memref<1x1x32x32xi32, 1 : i32>
+//       CHECK:   memref.dealloc %[[L2_ALLOC_1]] : memref<1x1x32x32xi32, 1 : i32>
+#map = affine_map<(d0) -> (d0 * 64)>
+#map1 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>
+#map2 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>
+#map3 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>
+#map4 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>
+module {
+  func.func @split_l2_buffer_two_core_ops(%arg0: !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>, %arg1: !amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xi32, 2 : i32>>, %arg2: !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>, %arg3: !amdaie.logicalobjectfifo<memref<2x2x32x32xi32, 1 : i32>>) {
+    %c3 = arith.constant 3 : index
+    %c2 = arith.constant 2 : index
+    %c1 = arith.constant 1 : index
+    %c0 = arith.constant 0 : index
+    %alloc = memref.alloc() : memref<2x1x32x32xi32, 1 : i32>
+    %alloc_0 = memref.alloc() : memref<1x2x32x32xi32, 1 : i32>
+    %alloc_1 = memref.alloc() : memref<2x2x32x32xi32, 1 : i32>
+    %alloc_2 = memref.alloc() : memref<128x128xi32>
+    %alloc_3 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
+    %tile = amdaie.tile(%c1, %c3)
+    %tile_4 = amdaie.tile(%c0, %c2)
+    %0 = amdaie.logicalobjectfifo.from_memref %alloc_1, {%tile} : memref<2x2x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x2x32x32xi32, 1 : i32>>
+    %1 = amdaie.logicalobjectfifo.from_memref %alloc_2, {%tile} : memref<128x128xi32> -> !amdaie.logicalobjectfifo<memref<128x128xi32>>
+    scf.forall (%arg4, %arg5) in (2, 2) {
+      %2 = affine.apply #map(%arg5)
+      %3 = affine.apply #map(%arg4)
+      %4 = amdaie.dma_cpy_nd(%0[0, 0, 0, 0] [2, 2, 32, 32] [2048, 1024, 32, 1], %1[0, 0, %3, %2] [2, 2, 32, 32] [4096, 32, 128, 1]) : (!amdaie.logicalobjectfifo<memref<2x2x32x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<128x128xi32>>)
+      %tile_5 = amdaie.tile(%c1, %c3)
+      %5 = amdaie.logicalobjectfifo.from_memref %alloc, {%tile} : memref<2x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x1x32x32xi32, 1 : i32>>
+      %6 = amdaie.logicalobjectfifo.from_memref %alloc_0, {%tile} : memref<1x2x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x2x32x32xi32, 1 : i32>>
+      %7 = amdaie.dma_cpy_nd(%arg0[0, 0, 0, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1024, 1024, 256, 32, 8, 1], %5[1, 0, 0, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1024, 1024, 8, 128, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<2x1x32x32xi32, 1 : i32>>)
+      %8 = amdaie.dma_cpy_nd(%arg1[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [1024, 1024, 128, 32, 4, 1], %6[0, 1, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [2048, 1024, 4, 256, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<1x2x32x32xi32, 1 : i32>>)
+      %9 = amdaie.logicalobjectfifo.from_memref %alloc_3, {%tile} : memref<1x1x8x8x4x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>
+      %10 = amdaie.dma_cpy_nd(%9[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1], %0[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<2x2x32x32xi32, 1 : i32>>)
+      %11 = amdaie.dma_cpy_nd(%arg3[1, 1, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1], %arg2[0, 0, 0, 0] [8, 4, 8, 4] [16, 4, 128, 1]) : (!amdaie.logicalobjectfifo<memref<2x2x32x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>)
+      %12 = amdaie.core(%tile_5, in : [%7, %8, %10], out : [%11]) {
+        %16 = amdaie.logicalobjectfifo.access(%arg0, Read) : !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>> -> memref<1x1x4x8x4x8xi32, 2 : i32>
+        %17 = amdaie.logicalobjectfifo.access(%arg1, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xi32, 2 : i32>> -> memref<1x1x8x4x8x4xi32, 2 : i32>
+        %18 = amdaie.logicalobjectfifo.access(%arg2, None) : !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>> -> memref<1x1x8x8x4x4xi32, 2 : i32>
+        linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%16, %17 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%18 : memref<1x1x8x8x4x4xi32, 2 : i32>) {
+        ^bb0(%in: i32, %in_6: i32, %out: i32):
+          %21 = arith.muli %in, %in_6 : i32
+          %22 = arith.addi %out, %21 : i32
+          linalg.yield %22 : i32
+        }
+        %19 = amdaie.logicalobjectfifo.access(%arg2, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>> -> memref<1x1x8x8x4x4xi32, 2 : i32>
+        %20 = amdaie.logicalobjectfifo.access(%arg2, Write) : !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>> -> memref<1x1x8x8x4x4xi32, 2 : i32>
+        linalg.generic {indexing_maps = [#map4, #map4, #map4], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%18, %19 : memref<1x1x8x8x4x4xi32, 2 : i32>, memref<1x1x8x8x4x4xi32, 2 : i32>) outs(%20 : memref<1x1x8x8x4x4xi32, 2 : i32>) {
+        ^bb0(%in: i32, %in_6: i32, %out: i32):
+          %21 = arith.addi %in, %in_6 : i32
+          linalg.yield %21 : i32
+        }
+        amdaie.end
+      }
+      %13 = amdaie.logicalobjectfifo.from_memref %alloc_3, {%tile_4} : memref<1x1x8x8x4x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>
+      %14 = amdaie.dma_cpy_nd(%13[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1], %0[1, 1, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<2x2x32x32xi32, 1 : i32>>)
+      %15 = amdaie.core(%tile_4, in : [%7, %8, %14], out : [%11]) {
+        %16 = amdaie.logicalobjectfifo.access(%arg0, Read) : !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>> -> memref<1x1x4x8x4x8xi32, 2 : i32>
+        %17 = amdaie.logicalobjectfifo.access(%arg1, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xi32, 2 : i32>> -> memref<1x1x8x4x8x4xi32, 2 : i32>
+        %18 = amdaie.logicalobjectfifo.access(%arg2, None) : !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>> -> memref<1x1x8x8x4x4xi32, 2 : i32>
+        linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%16, %17 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%18 : memref<1x1x8x8x4x4xi32, 2 : i32>) {
+        ^bb0(%in: i32, %in_6: i32, %out: i32):
+          %21 = arith.muli %in, %in_6 : i32
+          %22 = arith.addi %out, %21 : i32
+          linalg.yield %22 : i32
+        }
+        %19 = amdaie.logicalobjectfifo.access(%arg2, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>> -> memref<1x1x8x8x4x4xi32, 2 : i32>
+        %20 = amdaie.logicalobjectfifo.access(%arg2, Write) : !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>> -> memref<1x1x8x8x4x4xi32, 2 : i32>
+        linalg.generic {indexing_maps = [#map4, #map4, #map4], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%18, %19 : memref<1x1x8x8x4x4xi32, 2 : i32>, memref<1x1x8x8x4x4xi32, 2 : i32>) outs(%20 : memref<1x1x8x8x4x4xi32, 2 : i32>) {
+        ^bb0(%in: i32, %in_6: i32, %out: i32):
+          %21 = arith.addi %in, %in_6 : i32
+          linalg.yield %21 : i32
+        }
+        amdaie.end
+      }
+    } {mapping = [#gpu.block<y>, #gpu.block<x>]}
+    memref.dealloc %alloc : memref<2x1x32x32xi32, 1 : i32>
+    memref.dealloc %alloc_3 : memref<1x1x8x8x4x4xi32, 2 : i32>
+    memref.dealloc %alloc_0 : memref<1x2x32x32xi32, 1 : i32>
+    memref.dealloc %alloc_1 : memref<2x2x32x32xi32, 1 : i32>
+    memref.dealloc %alloc_2 : memref<128x128xi32>
+    return
+  }
+}
+
+// -----
+
+// Test mix of candidate and non-candidate core ops.
+
+//   CHECK-DAG: #map = affine_map<(d0) -> (d0 * 64)>
+//   CHECK-DAG: #map1 = affine_map<(d0) -> (d0 * 64 + 32)>
+//       CHECK: @split_l2_buffer_mixed_core_ops
+//   CHECK-DAG:   %[[C0:.*]] = arith.constant 0 : index
+//   CHECK-DAG:   %[[C1:.*]] = arith.constant 1 : index
+//   CHECK-DAG:   %[[C2:.*]] = arith.constant 2 : index
+//   CHECK-DAG:   %[[C3:.*]] = arith.constant 3 : index
+//   CHECK-DAG:   %[[L3_ALLOC:.*]] = memref.alloc() : memref<128x128xi32>
+//   CHECK-DAG:   %[[L2_ALLOC_0:.*]] = memref.alloc() : memref<1x1x32x32xi32, 1 : i32>
+//   CHECK-DAG:   %[[L2_ALLOC_1:.*]] = memref.alloc() : memref<1x1x32x32xi32, 1 : i32>
+//   CHECK-DAG:   %[[L2_ALLOC_2:.*]] = memref.alloc() : memref<1x1x32x32xi32, 1 : i32>
+//   CHECK-DAG:   %[[L2_ALLOC_3:.*]] = memref.alloc() : memref<1x1x32x32xi32, 1 : i32>
+//   CHECK-NOT:   memref.alloc() : memref<1x1x32x32xi32, 1 : i32>
+//   CHECK-NOT:   memref.alloc() : memref<1x1x32x32xi32, 1 : i32>
+//   CHECK-DAG:   %[[L1_ALLOC:.*]] = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
+//   CHECK-DAG:   %[[TILE_0:.*]] = amdaie.tile(%[[C1]], %[[C3]])
+//   CHECK-DAG:   %[[TILE_1:.*]] = amdaie.tile(%[[C0]], %[[C2]])
+//   CHECK-DAG:   %[[TILE_2:.*]] = amdaie.tile(%[[C1]], %[[C2]])
+//   CHECK-DAG:   %[[TILE_3:.*]] = amdaie.tile(%[[C0]], %[[C3]])
+//       CHECK:   %[[L2_OBJECTFIFO_0:.*]] = amdaie.logicalobjectfifo.from_memref %[[L2_ALLOC_0]], {%[[TILE_0]]} :
+//  CHECK-SAME:         memref<1x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x32x32xi32, 1 : i32>>
+//       CHECK:   %[[L2_OBJECTFIFO_1:.*]] = amdaie.logicalobjectfifo.from_memref %[[L2_ALLOC_1]], {%[[TILE_0]]} :
+//  CHECK-SAME:         memref<1x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x32x32xi32, 1 : i32>>
+//       CHECK:   %[[L2_OBJECTFIFO_2:.*]] = amdaie.logicalobjectfifo.from_memref %[[L2_ALLOC_2]], {%[[TILE_0]]} :
+//  CHECK-SAME:         memref<1x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x32x32xi32, 1 : i32>>
+//       CHECK:   %[[L2_OBJECTFIFO_3:.*]] = amdaie.logicalobjectfifo.from_memref %[[L2_ALLOC_3]], {%[[TILE_0]]} :
+//  CHECK-SAME:         memref<1x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x32x32xi32, 1 : i32>>
+//       CHECK:   %[[L3_OBJECTFIFO:.*]] = amdaie.logicalobjectfifo.from_memref %[[L3_ALLOC]], {%[[TILE_0]]} :
+//  CHECK-SAME:         memref<128x128xi32> -> !amdaie.logicalobjectfifo<memref<128x128xi32>>
+//       CHECK:   scf.forall (%[[IV0:.*]], %[[IV1:.*]]) in (2, 2)
+//   CHECK-DAG:       %[[IV1_0:.*]] = affine.apply #map(%[[IV1]])
+//   CHECK-DAG:       %[[IV1_32:.*]] = affine.apply #map1(%[[IV1]])
+//   CHECK-DAG:       %[[IV0_0:.*]] = affine.apply #map(%[[IV0]])
+//   CHECK-DAG:       %[[IV0_32:.*]] = affine.apply #map1(%[[IV0]])
+//       CHECK:       %[[DMA_CPY_ND_L3_TO_L2_0:.*]] = amdaie.dma_cpy_nd(
+//  CHECK-SAME:                                         %[[L2_OBJECTFIFO_0]][0, 0, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1]
+//  CHECK-SAME:                                         %[[L3_OBJECTFIFO]][0, 0, %[[IV0_0]], %[[IV1_0]]] [1, 1, 32, 32] [4096, 32, 128, 1]
+//       CHECK:       %[[DMA_CPY_ND_L3_TO_L2_1:.*]] = amdaie.dma_cpy_nd(
+//  CHECK-SAME:                                         %[[L2_OBJECTFIFO_1]][0, 0, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1]
+//  CHECK-SAME:                                         %[[L3_OBJECTFIFO]][0, 0, %[[IV0_0]], %[[IV1_32]]] [1, 1, 32, 32] [4096, 32, 128, 1]
+//       CHECK:       %[[DMA_CPY_ND_L3_TO_L2_2:.*]] = amdaie.dma_cpy_nd(
+//  CHECK-SAME:                                         %[[L2_OBJECTFIFO_2]][0, 0, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1]
+//  CHECK-SAME:                                         %[[L3_OBJECTFIFO]][0, 0, %[[IV0_32]], %[[IV1_0]]] [1, 1, 32, 32] [4096, 32, 128, 1]
+//       CHECK:       %[[DMA_CPY_ND_L3_TO_L2_3:.*]] = amdaie.dma_cpy_nd(
+//  CHECK-SAME:                                         %[[L2_OBJECTFIFO_3]][0, 0, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1]
+//  CHECK-SAME:                                         %[[L3_OBJECTFIFO]][0, 0, %[[IV0_32]], %[[IV1_32]]] [1, 1, 32, 32] [4096, 32, 128, 1]
+//       CHECK:       amdaie.logicalobjectfifo.from_memref
+//       CHECK:       amdaie.logicalobjectfifo.from_memref
+//       CHECK:       amdaie.dma_cpy_nd
+//       CHECK:       amdaie.dma_cpy_nd
+//       CHECK:       amdaie.core(%[[TILE_0]]
+//       CHECK:         linalg.fill
+//       CHECK:         amdaie.end
+//       CHECK:       }
+//       CHECK:       %[[L1_OBJECTFIFO_0:.*]] = amdaie.logicalobjectfifo.from_memref %[[L1_ALLOC]], {%[[TILE_1]]}
+//       CHECK:       %[[DMA_CPY_ND_L2_TO_L1_0:.*]] = amdaie.dma_cpy_nd(
+//  CHECK-SAME:                                          %[[L1_OBJECTFIFO_0]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1] 
+//  CHECK-SAME:                                          %[[L2_OBJECTFIFO_0]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1]
+//       CHECK:       amdaie.core(%[[TILE_1]], in : [%{{.*}}, %{{.*}}, %[[DMA_CPY_ND_L2_TO_L1_0]]], out :
+//       CHECK:         linalg.generic
+//       CHECK:         amdaie.end
+//       CHECK:       }
+//       CHECK:       amdaie.core(%[[TILE_2]]
+//       CHECK:         linalg.fill
+//       CHECK:         amdaie.end
+//       CHECK:       }
+//       CHECK:       %[[L1_OBJECTFIFO_1:.*]] = amdaie.logicalobjectfifo.from_memref %[[L1_ALLOC]], {%[[TILE_0]]}
+//       CHECK:       %[[DMA_CPY_ND_L2_TO_L1_1:.*]] = amdaie.dma_cpy_nd(
+//  CHECK-SAME:                                          %[[L1_OBJECTFIFO_1]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1] 
+//  CHECK-SAME:                                          %[[L2_OBJECTFIFO_1]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1]
+//       CHECK:       amdaie.core(%[[TILE_0]], in : [%{{.*}}, %{{.*}}, %[[DMA_CPY_ND_L2_TO_L1_1]]], out :
+//       CHECK:         linalg.generic
+//       CHECK:         amdaie.end
+//       CHECK:       }
+//       CHECK:       %[[L1_OBJECTFIFO_2:.*]] = amdaie.logicalobjectfifo.from_memref %[[L1_ALLOC]], {%[[TILE_2]]}
+//       CHECK:       %[[DMA_CPY_ND_L2_TO_L1_2:.*]] = amdaie.dma_cpy_nd(
+//  CHECK-SAME:                                          %[[L1_OBJECTFIFO_2]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1] 
+//  CHECK-SAME:                                          %[[L2_OBJECTFIFO_2]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1]
+//       CHECK:       amdaie.core(%[[TILE_2]], in : [%{{.*}}, %{{.*}}, %[[DMA_CPY_ND_L2_TO_L1_2]]], out :
+//       CHECK:         linalg.generic
+//       CHECK:         amdaie.end
+//       CHECK:       }
+//       CHECK:       %[[L1_OBJECTFIFO_3:.*]] = amdaie.logicalobjectfifo.from_memref %[[L1_ALLOC]], {%[[TILE_3]]}
+//       CHECK:       %[[DMA_CPY_ND_L2_TO_L1_3:.*]] = amdaie.dma_cpy_nd(
+//  CHECK-SAME:                                          %[[L1_OBJECTFIFO_3]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1] 
+//  CHECK-SAME:                                          %[[L2_OBJECTFIFO_3]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1]
+//       CHECK:       amdaie.core(%[[TILE_3]], in : [%{{.*}}, %{{.*}}, %[[DMA_CPY_ND_L2_TO_L1_3]]], out :
+//       CHECK:         linalg.generic
+//       CHECK:         amdaie.end
+//       CHECK:       }
+//   CHECK-DAG:   memref.dealloc %[[L2_ALLOC_0]] : memref<1x1x32x32xi32, 1 : i32>
+//   CHECK-DAG:   memref.dealloc %[[L2_ALLOC_1]] : memref<1x1x32x32xi32, 1 : i32>
+//   CHECK-DAG:   memref.dealloc %[[L2_ALLOC_2]] : memref<1x1x32x32xi32, 1 : i32>
+//   CHECK-DAG:   memref.dealloc %[[L2_ALLOC_3]] : memref<1x1x32x32xi32, 1 : i32>
+//   CHECK-NOT:   memref.dealloc %{{.*}} : memref<1x1x32x32xi32, 1 : i32>
+//   CHECK-NOT:   memref.dealloc %{{.*}} : memref<1x1x32x32xi32, 1 : i32>
+#map = affine_map<(d0) -> (d0 * 64)>
+#map1 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>
+#map2 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>
+#map3 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>
+#map4 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>
+module {
+  func.func @split_l2_buffer_mixed_core_ops(%arg0: !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>, %arg1: !amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xi32, 2 : i32>>, %arg2: !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>, %arg3: !amdaie.logicalobjectfifo<memref<2x2x32x32xi32, 1 : i32>>) {
+    %c3 = arith.constant 3 : index
+    %c2 = arith.constant 2 : index
+    %c1 = arith.constant 1 : index
+    %c0 = arith.constant 0 : index
+    %c0_i32 = arith.constant 0 : i32
+    %alloc = memref.alloc() : memref<2x1x32x32xi32, 1 : i32>
+    %alloc_0 = memref.alloc() : memref<1x2x32x32xi32, 1 : i32>
+    %alloc_1 = memref.alloc() : memref<2x2x32x32xi32, 1 : i32>
+    %alloc_2 = memref.alloc() : memref<128x128xi32>
+    %alloc_3 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
+    %tile = amdaie.tile(%c1, %c3)
+    %tile_4 = amdaie.tile(%c0, %c2)
+    %tile_5 = amdaie.tile(%c1, %c2)
+    %tile_6 = amdaie.tile(%c0, %c3)
+    %0 = amdaie.logicalobjectfifo.from_memref %alloc_1, {%tile} : memref<2x2x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x2x32x32xi32, 1 : i32>>
+    %1 = amdaie.logicalobjectfifo.from_memref %alloc_2, {%tile} : memref<128x128xi32> -> !amdaie.logicalobjectfifo<memref<128x128xi32>>
+    scf.forall (%arg4, %arg5) in (2, 2) {
+      %2 = affine.apply #map(%arg5)
+      %3 = affine.apply #map(%arg4)
+      %4 = amdaie.dma_cpy_nd(%0[0, 0, 0, 0] [2, 2, 32, 32] [2048, 1024, 32, 1], %1[0, 0, %3, %2] [2, 2, 32, 32] [4096, 32, 128, 1]) : (!amdaie.logicalobjectfifo<memref<2x2x32x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<128x128xi32>>)
+      %tile_7 = amdaie.tile(%c1, %c3)
+      %5 = amdaie.logicalobjectfifo.from_memref %alloc, {%tile} : memref<2x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x1x32x32xi32, 1 : i32>>
+      %6 = amdaie.logicalobjectfifo.from_memref %alloc_0, {%tile} : memref<1x2x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x2x32x32xi32, 1 : i32>>
+      %7 = amdaie.dma_cpy_nd(%arg0[0, 0, 0, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1024, 1024, 256, 32, 8, 1], %5[1, 0, 0, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1024, 1024, 8, 128, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<2x1x32x32xi32, 1 : i32>>)
+      %8 = amdaie.dma_cpy_nd(%arg1[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [1024, 1024, 128, 32, 4, 1], %6[0, 1, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [2048, 1024, 4, 256, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<1x2x32x32xi32, 1 : i32>>)
+      %9 = amdaie.dma_cpy_nd(%arg3[1, 1, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1], %arg2[0, 0, 0, 0] [8, 4, 8, 4] [16, 4, 128, 1]) : (!amdaie.logicalobjectfifo<memref<2x2x32x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>)
+      %10 = amdaie.core(%tile_7, in : [%7], out : []) {
+        %24 = amdaie.logicalobjectfifo.access(%arg0, Read) : !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>> -> memref<1x1x4x8x4x8xi32, 2 : i32>
+        linalg.fill ins(%c0_i32 : i32) outs(%24 : memref<1x1x4x8x4x8xi32, 2 : i32>)
+        amdaie.end
+      }
+      %11 = amdaie.logicalobjectfifo.from_memref %alloc_3, {%tile_4} : memref<1x1x8x8x4x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>
+      %12 = amdaie.dma_cpy_nd(%11[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1], %0[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<2x2x32x32xi32, 1 : i32>>)
+      %13 = amdaie.core(%tile_4, in : [%7, %8, %12], out : [%9]) {
+        %24 = amdaie.logicalobjectfifo.access(%arg0, Read) : !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>> -> memref<1x1x4x8x4x8xi32, 2 : i32>
+        %25 = amdaie.logicalobjectfifo.access(%arg1, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xi32, 2 : i32>> -> memref<1x1x8x4x8x4xi32, 2 : i32>
+        %26 = amdaie.logicalobjectfifo.access(%arg2, None) : !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>> -> memref<1x1x8x8x4x4xi32, 2 : i32>
+        linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%24, %25 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%26 : memref<1x1x8x8x4x4xi32, 2 : i32>) {
+        ^bb0(%in: i32, %in_8: i32, %out: i32):
+          %29 = arith.muli %in, %in_8 : i32
+          %30 = arith.addi %out, %29 : i32
+          linalg.yield %30 : i32
+        }
+        %27 = amdaie.logicalobjectfifo.access(%arg2, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>> -> memref<1x1x8x8x4x4xi32, 2 : i32>
+        %28 = amdaie.logicalobjectfifo.access(%arg2, Write) : !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>> -> memref<1x1x8x8x4x4xi32, 2 : i32>
+        linalg.generic {indexing_maps = [#map4, #map4, #map4], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%26, %27 : memref<1x1x8x8x4x4xi32, 2 : i32>, memref<1x1x8x8x4x4xi32, 2 : i32>) outs(%28 : memref<1x1x8x8x4x4xi32, 2 : i32>) {
+        ^bb0(%in: i32, %in_8: i32, %out: i32):
+          %29 = arith.addi %in, %in_8 : i32
+          linalg.yield %29 : i32
+        }
+        amdaie.end
+      }
+      %14 = amdaie.core(%tile_5, in : [%7], out : []) {
+        %24 = amdaie.logicalobjectfifo.access(%arg0, Read) : !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>> -> memref<1x1x4x8x4x8xi32, 2 : i32>
+        linalg.fill ins(%c0_i32 : i32) outs(%24 : memref<1x1x4x8x4x8xi32, 2 : i32>)
+        amdaie.end
+      }
+      %15 = amdaie.logicalobjectfifo.from_memref %alloc_3, {%tile} : memref<1x1x8x8x4x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>
+      %16 = amdaie.dma_cpy_nd(%15[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1], %0[0, 1, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<2x2x32x32xi32, 1 : i32>>)
+      %17 = amdaie.core(%tile, in : [%7, %8, %16], out : [%9]) {
+        %24 = amdaie.logicalobjectfifo.access(%arg0, Read) : !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>> -> memref<1x1x4x8x4x8xi32, 2 : i32>
+        %25 = amdaie.logicalobjectfifo.access(%arg1, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xi32, 2 : i32>> -> memref<1x1x8x4x8x4xi32, 2 : i32>
+        %26 = amdaie.logicalobjectfifo.access(%arg2, None) : !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>> -> memref<1x1x8x8x4x4xi32, 2 : i32>
+        linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%24, %25 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%26 : memref<1x1x8x8x4x4xi32, 2 : i32>) {
+        ^bb0(%in: i32, %in_8: i32, %out: i32):
+          %29 = arith.muli %in, %in_8 : i32
+          %30 = arith.addi %out, %29 : i32
+          linalg.yield %30 : i32
+        }
+        %27 = amdaie.logicalobjectfifo.access(%arg2, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>> -> memref<1x1x8x8x4x4xi32, 2 : i32>
+        %28 = amdaie.logicalobjectfifo.access(%arg2, Write) : !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>> -> memref<1x1x8x8x4x4xi32, 2 : i32>
+        linalg.generic {indexing_maps = [#map4, #map4, #map4], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%26, %27 : memref<1x1x8x8x4x4xi32, 2 : i32>, memref<1x1x8x8x4x4xi32, 2 : i32>) outs(%28 : memref<1x1x8x8x4x4xi32, 2 : i32>) {
+        ^bb0(%in: i32, %in_8: i32, %out: i32):
+          %29 = arith.addi %in, %in_8 : i32
+          linalg.yield %29 : i32
+        }
+        amdaie.end
+      }
+      %18 = amdaie.logicalobjectfifo.from_memref %alloc_3, {%tile_5} : memref<1x1x8x8x4x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>
+      %19 = amdaie.dma_cpy_nd(%18[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1], %0[1, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<2x2x32x32xi32, 1 : i32>>)
+      %20 = amdaie.core(%tile_5, in : [%7, %8, %19], out : [%9]) {
+        %24 = amdaie.logicalobjectfifo.access(%arg0, Read) : !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>> -> memref<1x1x4x8x4x8xi32, 2 : i32>
+        %25 = amdaie.logicalobjectfifo.access(%arg1, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xi32, 2 : i32>> -> memref<1x1x8x4x8x4xi32, 2 : i32>
+        %26 = amdaie.logicalobjectfifo.access(%arg2, None) : !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>> -> memref<1x1x8x8x4x4xi32, 2 : i32>
+        linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%24, %25 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%26 : memref<1x1x8x8x4x4xi32, 2 : i32>) {
+        ^bb0(%in: i32, %in_8: i32, %out: i32):
+          %29 = arith.muli %in, %in_8 : i32
+          %30 = arith.addi %out, %29 : i32
+          linalg.yield %30 : i32
+        }
+        %27 = amdaie.logicalobjectfifo.access(%arg2, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>> -> memref<1x1x8x8x4x4xi32, 2 : i32>
+        %28 = amdaie.logicalobjectfifo.access(%arg2, Write) : !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>> -> memref<1x1x8x8x4x4xi32, 2 : i32>
+        linalg.generic {indexing_maps = [#map4, #map4, #map4], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%26, %27 : memref<1x1x8x8x4x4xi32, 2 : i32>, memref<1x1x8x8x4x4xi32, 2 : i32>) outs(%28 : memref<1x1x8x8x4x4xi32, 2 : i32>) {
+        ^bb0(%in: i32, %in_8: i32, %out: i32):
+          %29 = arith.addi %in, %in_8 : i32
+          linalg.yield %29 : i32
+        }
+        amdaie.end
+      }
+      %21 = amdaie.logicalobjectfifo.from_memref %alloc_3, {%tile_6} : memref<1x1x8x8x4x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>
+      %22 = amdaie.dma_cpy_nd(%21[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1], %0[1, 1, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<2x2x32x32xi32, 1 : i32>>)
+      %23 = amdaie.core(%tile_6, in : [%7, %8, %22], out : [%9]) {
+        %24 = amdaie.logicalobjectfifo.access(%arg0, Read) : !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>> -> memref<1x1x4x8x4x8xi32, 2 : i32>
+        %25 = amdaie.logicalobjectfifo.access(%arg1, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xi32, 2 : i32>> -> memref<1x1x8x4x8x4xi32, 2 : i32>
+        %26 = amdaie.logicalobjectfifo.access(%arg2, None) : !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>> -> memref<1x1x8x8x4x4xi32, 2 : i32>
+        linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%24, %25 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%26 : memref<1x1x8x8x4x4xi32, 2 : i32>) {
+        ^bb0(%in: i32, %in_8: i32, %out: i32):
+          %29 = arith.muli %in, %in_8 : i32
+          %30 = arith.addi %out, %29 : i32
+          linalg.yield %30 : i32
+        }
+        %27 = amdaie.logicalobjectfifo.access(%arg2, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>> -> memref<1x1x8x8x4x4xi32, 2 : i32>
+        %28 = amdaie.logicalobjectfifo.access(%arg2, Write) : !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>> -> memref<1x1x8x8x4x4xi32, 2 : i32>
+        linalg.generic {indexing_maps = [#map4, #map4, #map4], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%26, %27 : memref<1x1x8x8x4x4xi32, 2 : i32>, memref<1x1x8x8x4x4xi32, 2 : i32>) outs(%28 : memref<1x1x8x8x4x4xi32, 2 : i32>) {
+        ^bb0(%in: i32, %in_8: i32, %out: i32):
+          %29 = arith.addi %in, %in_8 : i32
+          linalg.yield %29 : i32
+        }
+        amdaie.end
+      }
+    } {mapping = [#gpu.block<y>, #gpu.block<x>]}
+    memref.dealloc %alloc : memref<2x1x32x32xi32, 1 : i32>
+    memref.dealloc %alloc_3 : memref<1x1x8x8x4x4xi32, 2 : i32>
+    memref.dealloc %alloc_0 : memref<1x2x32x32xi32, 1 : i32>
+    memref.dealloc %alloc_1 : memref<2x2x32x32xi32, 1 : i32>
+    memref.dealloc %alloc_2 : memref<128x128xi32>
+    return
+  }
+}
+
+// -----
+
+// Test that the L2 buffer is still split when the L3 source's offsets are plain
+// constants (arith.constant results) rather than results of affine.apply ops.
+
+//       CHECK: @split_l2_buffer_mixed_core_ops_l3_source_as_constant
+//   CHECK-DAG:   %[[C0:.*]] = arith.constant 0 : index
+//   CHECK-DAG:   %[[C1:.*]] = arith.constant 1 : index
+//   CHECK-DAG:   %[[C2:.*]] = arith.constant 2 : index
+//   CHECK-DAG:   %[[C3:.*]] = arith.constant 3 : index
+//   CHECK-DAG:   %[[L3_ALLOC:.*]] = memref.alloc() : memref<128x128xi32>
+//   CHECK-DAG:   %[[L2_ALLOC_0:.*]] = memref.alloc() : memref<1x1x32x32xi32, 1 : i32>
+//   CHECK-DAG:   %[[L2_ALLOC_1:.*]] = memref.alloc() : memref<1x1x32x32xi32, 1 : i32>
+//   CHECK-DAG:   %[[L2_ALLOC_2:.*]] = memref.alloc() : memref<1x1x32x32xi32, 1 : i32>
+//   CHECK-DAG:   %[[L2_ALLOC_3:.*]] = memref.alloc() : memref<1x1x32x32xi32, 1 : i32>
+//   CHECK-NOT:   memref.alloc() : memref<1x1x32x32xi32, 1 : i32>
+//   CHECK-NOT:   memref.alloc() : memref<1x1x32x32xi32, 1 : i32>
+//   CHECK-DAG:   %[[L1_ALLOC:.*]] = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
+//   CHECK-DAG:   %[[TILE_0:.*]] = amdaie.tile(%[[C1]], %[[C3]])
+//   CHECK-DAG:   %[[TILE_1:.*]] = amdaie.tile(%[[C0]], %[[C2]])
+//   CHECK-DAG:   %[[TILE_2:.*]] = amdaie.tile(%[[C1]], %[[C2]])
+//   CHECK-DAG:   %[[TILE_3:.*]] = amdaie.tile(%[[C0]], %[[C3]])
+//       CHECK:   %[[L2_OBJECTFIFO_0:.*]] = amdaie.logicalobjectfifo.from_memref %[[L2_ALLOC_0]], {%[[TILE_0]]} :
+//  CHECK-SAME:         memref<1x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x32x32xi32, 1 : i32>>
+//       CHECK:   %[[L2_OBJECTFIFO_1:.*]] = amdaie.logicalobjectfifo.from_memref %[[L2_ALLOC_1]], {%[[TILE_0]]} :
+//  CHECK-SAME:         memref<1x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x32x32xi32, 1 : i32>>
+//       CHECK:   %[[L2_OBJECTFIFO_2:.*]] = amdaie.logicalobjectfifo.from_memref %[[L2_ALLOC_2]], {%[[TILE_0]]} :
+//  CHECK-SAME:         memref<1x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x32x32xi32, 1 : i32>>
+//       CHECK:   %[[L2_OBJECTFIFO_3:.*]] = amdaie.logicalobjectfifo.from_memref %[[L2_ALLOC_3]], {%[[TILE_0]]} :
+//  CHECK-SAME:         memref<1x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x32x32xi32, 1 : i32>>
+//       CHECK:   %[[L3_OBJECTFIFO:.*]] = amdaie.logicalobjectfifo.from_memref %[[L3_ALLOC]], {%[[TILE_0]]} :
+//  CHECK-SAME:         memref<128x128xi32> -> !amdaie.logicalobjectfifo<memref<128x128xi32>>
+//       CHECK:   scf.forall (%[[IV0:.*]], %[[IV1:.*]]) in (2, 2)
+//       CHECK:       %[[DMA_CPY_ND_L3_TO_L2_0:.*]] = amdaie.dma_cpy_nd(
+//  CHECK-SAME:                                         %[[L2_OBJECTFIFO_0]][0, 0, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1]
+//  CHECK-SAME:                                         %[[L3_OBJECTFIFO]][0, 0, 3, 2] [1, 1, 32, 32] [4096, 32, 128, 1]
+//       CHECK:       %[[DMA_CPY_ND_L3_TO_L2_1:.*]] = amdaie.dma_cpy_nd(
+//  CHECK-SAME:                                         %[[L2_OBJECTFIFO_1]][0, 0, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1]
+//  CHECK-SAME:                                         %[[L3_OBJECTFIFO]][0, 0, 3, 34] [1, 1, 32, 32] [4096, 32, 128, 1]
+//       CHECK:       %[[DMA_CPY_ND_L3_TO_L2_2:.*]] = amdaie.dma_cpy_nd(
+//  CHECK-SAME:                                         %[[L2_OBJECTFIFO_2]][0, 0, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1]
+//  CHECK-SAME:                                         %[[L3_OBJECTFIFO]][0, 0, 35, 2] [1, 1, 32, 32] [4096, 32, 128, 1]
+//       CHECK:       %[[DMA_CPY_ND_L3_TO_L2_3:.*]] = amdaie.dma_cpy_nd(
+//  CHECK-SAME:                                         %[[L2_OBJECTFIFO_3]][0, 0, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1]
+//  CHECK-SAME:                                         %[[L3_OBJECTFIFO]][0, 0, 35, 34] [1, 1, 32, 32] [4096, 32, 128, 1]
+//       CHECK:       amdaie.logicalobjectfifo.from_memref
+//       CHECK:       amdaie.logicalobjectfifo.from_memref
+//       CHECK:       amdaie.dma_cpy_nd
+//       CHECK:       amdaie.dma_cpy_nd
+//       CHECK:       amdaie.core(%[[TILE_0]]
+//       CHECK:         linalg.fill
+//       CHECK:         amdaie.end
+//       CHECK:       }
+//       CHECK:       %[[L1_OBJECTFIFO_0:.*]] = amdaie.logicalobjectfifo.from_memref %[[L1_ALLOC]], {%[[TILE_1]]}
+//       CHECK:       %[[DMA_CPY_ND_L2_TO_L1_0:.*]] = amdaie.dma_cpy_nd(
+//  CHECK-SAME:                                          %[[L1_OBJECTFIFO_0]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1] 
+//  CHECK-SAME:                                          %[[L2_OBJECTFIFO_0]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1]
+//       CHECK:       amdaie.core(%[[TILE_1]], in : [%{{.*}}, %{{.*}}, %[[DMA_CPY_ND_L2_TO_L1_0]]], out :
+//       CHECK:         linalg.generic
+//       CHECK:         amdaie.end
+//       CHECK:       }
+//       CHECK:       amdaie.core(%[[TILE_2]]
+//       CHECK:         linalg.fill
+//       CHECK:         amdaie.end
+//       CHECK:       }
+//       CHECK:       %[[L1_OBJECTFIFO_1:.*]] = amdaie.logicalobjectfifo.from_memref %[[L1_ALLOC]], {%[[TILE_0]]}
+//       CHECK:       %[[DMA_CPY_ND_L2_TO_L1_1:.*]] = amdaie.dma_cpy_nd(
+//  CHECK-SAME:                                          %[[L1_OBJECTFIFO_1]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1] 
+//  CHECK-SAME:                                          %[[L2_OBJECTFIFO_1]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1]
+//       CHECK:       amdaie.core(%[[TILE_0]], in : [%{{.*}}, %{{.*}}, %[[DMA_CPY_ND_L2_TO_L1_1]]], out :
+//       CHECK:         linalg.generic
+//       CHECK:         amdaie.end
+//       CHECK:       }
+//       CHECK:       %[[L1_OBJECTFIFO_2:.*]] = amdaie.logicalobjectfifo.from_memref %[[L1_ALLOC]], {%[[TILE_2]]}
+//       CHECK:       %[[DMA_CPY_ND_L2_TO_L1_2:.*]] = amdaie.dma_cpy_nd(
+//  CHECK-SAME:                                          %[[L1_OBJECTFIFO_2]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1] 
+//  CHECK-SAME:                                          %[[L2_OBJECTFIFO_2]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1]
+//       CHECK:       amdaie.core(%[[TILE_2]], in : [%{{.*}}, %{{.*}}, %[[DMA_CPY_ND_L2_TO_L1_2]]], out :
+//       CHECK:         linalg.generic
+//       CHECK:         amdaie.end
+//       CHECK:       }
+//       CHECK:       %[[L1_OBJECTFIFO_3:.*]] = amdaie.logicalobjectfifo.from_memref %[[L1_ALLOC]], {%[[TILE_3]]}
+//       CHECK:       %[[DMA_CPY_ND_L2_TO_L1_3:.*]] = amdaie.dma_cpy_nd(
+//  CHECK-SAME:                                          %[[L1_OBJECTFIFO_3]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1] 
+//  CHECK-SAME:                                          %[[L2_OBJECTFIFO_3]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1]
+//       CHECK:       amdaie.core(%[[TILE_3]], in : [%{{.*}}, %{{.*}}, %[[DMA_CPY_ND_L2_TO_L1_3]]], out :
+//       CHECK:         linalg.generic
+//       CHECK:         amdaie.end
+//       CHECK:       }
+//   CHECK-DAG:   memref.dealloc %[[L2_ALLOC_0]] : memref<1x1x32x32xi32, 1 : i32>
+//   CHECK-DAG:   memref.dealloc %[[L2_ALLOC_1]] : memref<1x1x32x32xi32, 1 : i32>
+//   CHECK-DAG:   memref.dealloc %[[L2_ALLOC_2]] : memref<1x1x32x32xi32, 1 : i32>
+//   CHECK-DAG:   memref.dealloc %[[L2_ALLOC_3]] : memref<1x1x32x32xi32, 1 : i32>
+//   CHECK-NOT:   memref.dealloc %{{.*}} : memref<1x1x32x32xi32, 1 : i32>
+//   CHECK-NOT:   memref.dealloc %{{.*}} : memref<1x1x32x32xi32, 1 : i32>
+#map = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>
+#map1 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>
+#map2 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>
+#map3 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>
+module { // Input IR: L2 buffer split where the L3 source offsets are plain SSA constants.
+  func.func @split_l2_buffer_mixed_core_ops_l3_source_as_constant(%arg0: !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>, %arg1: !amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xi32, 2 : i32>>, %arg2: !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>, %arg3: !amdaie.logicalobjectfifo<memref<2x2x32x32xi32, 1 : i32>>) {
+    %c3 = arith.constant 3 : index
+    %c2 = arith.constant 2 : index
+    %c1 = arith.constant 1 : index
+    %c0 = arith.constant 0 : index
+    %c0_i32 = arith.constant 0 : i32
+    %alloc = memref.alloc() : memref<2x1x32x32xi32, 1 : i32>
+    %alloc_0 = memref.alloc() : memref<1x2x32x32xi32, 1 : i32>
+    %alloc_1 = memref.alloc() : memref<2x2x32x32xi32, 1 : i32>
+    %alloc_2 = memref.alloc() : memref<128x128xi32>
+    %alloc_3 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
+    %tile = amdaie.tile(%c1, %c3)
+    %tile_4 = amdaie.tile(%c0, %c2)
+    %tile_5 = amdaie.tile(%c1, %c2)
+    %tile_6 = amdaie.tile(%c0, %c3)
+    %0 = amdaie.logicalobjectfifo.from_memref %alloc_1, {%tile} : memref<2x2x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x2x32x32xi32, 1 : i32>> // 2x2x32x32 buffer in memory space 1; the FileCheck expectations look for four 1x1x32x32 logicalobjectfifos.
+    %1 = amdaie.logicalobjectfifo.from_memref %alloc_2, {%tile} : memref<128x128xi32> -> !amdaie.logicalobjectfifo<memref<128x128xi32>> // 128x128 source without a memory space (named L3 in the expectations).
+    scf.forall (%arg4, %arg5) in (2, 2) {
+      %2 = amdaie.dma_cpy_nd(%0[0, 0, 0, 0] [2, 2, 32, 32] [2048, 1024, 32, 1], %1[0, 0, %c3, %c2] [2, 2, 32, 32] [4096, 32, 128, 1]) : (!amdaie.logicalobjectfifo<memref<2x2x32x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<128x128xi32>>) // L3 -> L2 copy: source offsets %c3 and %c2 are arith.constant results, not affine.apply results.
+      %tile_7 = amdaie.tile(%c1, %c3) // Same coordinates as %tile.
+      %3 = amdaie.logicalobjectfifo.from_memref %alloc, {%tile} : memref<2x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x1x32x32xi32, 1 : i32>>
+      %4 = amdaie.logicalobjectfifo.from_memref %alloc_0, {%tile} : memref<1x2x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x2x32x32xi32, 1 : i32>>
+      %5 = amdaie.dma_cpy_nd(%arg0[0, 0, 0, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1024, 1024, 256, 32, 8, 1], %3[1, 0, 0, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1024, 1024, 8, 128, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<2x1x32x32xi32, 1 : i32>>)
+      %6 = amdaie.dma_cpy_nd(%arg1[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [1024, 1024, 128, 32, 4, 1], %4[0, 1, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [2048, 1024, 4, 256, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<1x2x32x32xi32, 1 : i32>>)
+      %7 = amdaie.dma_cpy_nd(%arg3[1, 1, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1], %arg2[0, 0, 0, 0] [8, 4, 8, 4] [16, 4, 128, 1]) : (!amdaie.logicalobjectfifo<memref<2x2x32x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>) // Copy from %arg2 (memory space 2) into offset [1, 1, 0, 0] of %arg3 (memory space 1).
+      %8 = amdaie.core(%tile_7, in : [%5], out : []) { // Core whose body is only a linalg.fill.
+        %22 = amdaie.logicalobjectfifo.access(%arg0, Read) : !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>> -> memref<1x1x4x8x4x8xi32, 2 : i32>
+        linalg.fill ins(%c0_i32 : i32) outs(%22 : memref<1x1x4x8x4x8xi32, 2 : i32>)
+        amdaie.end
+      }
+      %9 = amdaie.logicalobjectfifo.from_memref %alloc_3, {%tile_4} : memref<1x1x8x8x4x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>
+      %10 = amdaie.dma_cpy_nd(%9[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1], %0[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<2x2x32x32xi32, 1 : i32>>) // Reads %0 at leading offsets [0, 0].
+      %11 = amdaie.core(%tile_4, in : [%5, %6, %10], out : [%7]) { // Core with a multiply-accumulate generic followed by an elementwise add generic.
+        %22 = amdaie.logicalobjectfifo.access(%arg0, Read) : !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>> -> memref<1x1x4x8x4x8xi32, 2 : i32>
+        %23 = amdaie.logicalobjectfifo.access(%arg1, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xi32, 2 : i32>> -> memref<1x1x8x4x8x4xi32, 2 : i32>
+        %24 = amdaie.logicalobjectfifo.access(%arg2, None) : !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>> -> memref<1x1x8x8x4x4xi32, 2 : i32>
+        linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%22, %23 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%24 : memref<1x1x8x8x4x4xi32, 2 : i32>) {
+        ^bb0(%in: i32, %in_8: i32, %out: i32):
+          %27 = arith.muli %in, %in_8 : i32
+          %28 = arith.addi %out, %27 : i32
+          linalg.yield %28 : i32
+        }
+        %25 = amdaie.logicalobjectfifo.access(%arg2, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>> -> memref<1x1x8x8x4x4xi32, 2 : i32>
+        %26 = amdaie.logicalobjectfifo.access(%arg2, Write) : !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>> -> memref<1x1x8x8x4x4xi32, 2 : i32>
+        linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%24, %25 : memref<1x1x8x8x4x4xi32, 2 : i32>, memref<1x1x8x8x4x4xi32, 2 : i32>) outs(%26 : memref<1x1x8x8x4x4xi32, 2 : i32>) {
+        ^bb0(%in: i32, %in_8: i32, %out: i32):
+          %27 = arith.addi %in, %in_8 : i32
+          linalg.yield %27 : i32
+        }
+        amdaie.end
+      }
+      %12 = amdaie.core(%tile_5, in : [%5], out : []) { // Second fill-only core.
+        %22 = amdaie.logicalobjectfifo.access(%arg0, Read) : !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>> -> memref<1x1x4x8x4x8xi32, 2 : i32>
+        linalg.fill ins(%c0_i32 : i32) outs(%22 : memref<1x1x4x8x4x8xi32, 2 : i32>)
+        amdaie.end
+      }
+      %13 = amdaie.logicalobjectfifo.from_memref %alloc_3, {%tile} : memref<1x1x8x8x4x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>
+      %14 = amdaie.dma_cpy_nd(%13[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1], %0[0, 1, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<2x2x32x32xi32, 1 : i32>>) // Reads %0 at leading offsets [0, 1].
+      %15 = amdaie.core(%tile, in : [%5, %6, %14], out : [%7]) {
+        %22 = amdaie.logicalobjectfifo.access(%arg0, Read) : !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>> -> memref<1x1x4x8x4x8xi32, 2 : i32>
+        %23 = amdaie.logicalobjectfifo.access(%arg1, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xi32, 2 : i32>> -> memref<1x1x8x4x8x4xi32, 2 : i32>
+        %24 = amdaie.logicalobjectfifo.access(%arg2, None) : !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>> -> memref<1x1x8x8x4x4xi32, 2 : i32>
+        linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%22, %23 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%24 : memref<1x1x8x8x4x4xi32, 2 : i32>) {
+        ^bb0(%in: i32, %in_8: i32, %out: i32):
+          %27 = arith.muli %in, %in_8 : i32
+          %28 = arith.addi %out, %27 : i32
+          linalg.yield %28 : i32
+        }
+        %25 = amdaie.logicalobjectfifo.access(%arg2, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>> -> memref<1x1x8x8x4x4xi32, 2 : i32>
+        %26 = amdaie.logicalobjectfifo.access(%arg2, Write) : !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>> -> memref<1x1x8x8x4x4xi32, 2 : i32>
+        linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%24, %25 : memref<1x1x8x8x4x4xi32, 2 : i32>, memref<1x1x8x8x4x4xi32, 2 : i32>) outs(%26 : memref<1x1x8x8x4x4xi32, 2 : i32>) {
+        ^bb0(%in: i32, %in_8: i32, %out: i32):
+          %27 = arith.addi %in, %in_8 : i32
+          linalg.yield %27 : i32
+        }
+        amdaie.end
+      }
+      %16 = amdaie.logicalobjectfifo.from_memref %alloc_3, {%tile_5} : memref<1x1x8x8x4x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>
+      %17 = amdaie.dma_cpy_nd(%16[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1], %0[1, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<2x2x32x32xi32, 1 : i32>>) // Reads %0 at leading offsets [1, 0].
+      %18 = amdaie.core(%tile_5, in : [%5, %6, %17], out : [%7]) {
+        %22 = amdaie.logicalobjectfifo.access(%arg0, Read) : !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>> -> memref<1x1x4x8x4x8xi32, 2 : i32>
+        %23 = amdaie.logicalobjectfifo.access(%arg1, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xi32, 2 : i32>> -> memref<1x1x8x4x8x4xi32, 2 : i32>
+        %24 = amdaie.logicalobjectfifo.access(%arg2, None) : !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>> -> memref<1x1x8x8x4x4xi32, 2 : i32>
+        linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%22, %23 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%24 : memref<1x1x8x8x4x4xi32, 2 : i32>) {
+        ^bb0(%in: i32, %in_8: i32, %out: i32):
+          %27 = arith.muli %in, %in_8 : i32
+          %28 = arith.addi %out, %27 : i32
+          linalg.yield %28 : i32
+        }
+        %25 = amdaie.logicalobjectfifo.access(%arg2, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>> -> memref<1x1x8x8x4x4xi32, 2 : i32>
+        %26 = amdaie.logicalobjectfifo.access(%arg2, Write) : !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>> -> memref<1x1x8x8x4x4xi32, 2 : i32>
+        linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%24, %25 : memref<1x1x8x8x4x4xi32, 2 : i32>, memref<1x1x8x8x4x4xi32, 2 : i32>) outs(%26 : memref<1x1x8x8x4x4xi32, 2 : i32>) {
+        ^bb0(%in: i32, %in_8: i32, %out: i32):
+          %27 = arith.addi %in, %in_8 : i32
+          linalg.yield %27 : i32
+        }
+        amdaie.end
+      }
+      %19 = amdaie.logicalobjectfifo.from_memref %alloc_3, {%tile_6} : memref<1x1x8x8x4x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>
+      %20 = amdaie.dma_cpy_nd(%19[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1], %0[1, 1, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<2x2x32x32xi32, 1 : i32>>) // Reads %0 at leading offsets [1, 1].
+      %21 = amdaie.core(%tile_6, in : [%5, %6, %20], out : [%7]) {
+        %22 = amdaie.logicalobjectfifo.access(%arg0, Read) : !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>> -> memref<1x1x4x8x4x8xi32, 2 : i32>
+        %23 = amdaie.logicalobjectfifo.access(%arg1, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xi32, 2 : i32>> -> memref<1x1x8x4x8x4xi32, 2 : i32>
+        %24 = amdaie.logicalobjectfifo.access(%arg2, None) : !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>> -> memref<1x1x8x8x4x4xi32, 2 : i32>
+        linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%22, %23 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%24 : memref<1x1x8x8x4x4xi32, 2 : i32>) {
+        ^bb0(%in: i32, %in_8: i32, %out: i32):
+          %27 = arith.muli %in, %in_8 : i32
+          %28 = arith.addi %out, %27 : i32
+          linalg.yield %28 : i32
+        }
+        %25 = amdaie.logicalobjectfifo.access(%arg2, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>> -> memref<1x1x8x8x4x4xi32, 2 : i32>
+        %26 = amdaie.logicalobjectfifo.access(%arg2, Write) : !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>> -> memref<1x1x8x8x4x4xi32, 2 : i32>
+        linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%24, %25 : memref<1x1x8x8x4x4xi32, 2 : i32>, memref<1x1x8x8x4x4xi32, 2 : i32>) outs(%26 : memref<1x1x8x8x4x4xi32, 2 : i32>) {
+        ^bb0(%in: i32, %in_8: i32, %out: i32):
+          %27 = arith.addi %in, %in_8 : i32
+          linalg.yield %27 : i32
+        }
+        amdaie.end
+      }
+    } {mapping = [#gpu.block<y>, #gpu.block<x>]}
+    memref.dealloc %alloc : memref<2x1x32x32xi32, 1 : i32>
+    memref.dealloc %alloc_3 : memref<1x1x8x8x4x4xi32, 2 : i32>
+    memref.dealloc %alloc_0 : memref<1x2x32x32xi32, 1 : i32>
+    memref.dealloc %alloc_1 : memref<2x2x32x32xi32, 1 : i32>
+    memref.dealloc %alloc_2 : memref<128x128xi32>
+    return
+  }
+}
+
+// -----
+
+// Test splitting of the logicalobjectfifo when the offsets of the L3 source are static attributes (integer literals).
+
+//       CHECK: @attribute_offset_for_l3_source
+//   CHECK-DAG:   %[[C0:.*]] = arith.constant 0 : index
+//   CHECK-DAG:   %[[C1:.*]] = arith.constant 1 : index
+//   CHECK-DAG:   %[[C2:.*]] = arith.constant 2 : index
+//   CHECK-DAG:   %[[C3:.*]] = arith.constant 3 : index
+//   CHECK-DAG:   %[[L3_ALLOC:.*]] = memref.alloc() : memref<128x128xi32>
+//   CHECK-DAG:   %[[L2_ALLOC_0:.*]] = memref.alloc() : memref<1x1x32x32xi32, 1 : i32>
+//   CHECK-DAG:   %[[L2_ALLOC_1:.*]] = memref.alloc() : memref<1x1x32x32xi32, 1 : i32>
+//   CHECK-DAG:   %[[L1_ALLOC:.*]] = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
+//   CHECK-DAG:   %[[TILE_0:.*]] = amdaie.tile(%[[C1]], %[[C3]])
+//   CHECK-DAG:   %[[TILE_1:.*]] = amdaie.tile(%[[C0]], %[[C2]])
+//       CHECK:   %[[L2_OBJECTFIFO_0:.*]] = amdaie.logicalobjectfifo.from_memref %[[L2_ALLOC_0]], {%[[TILE_0]]} :
+//  CHECK-SAME:         memref<1x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x32x32xi32, 1 : i32>>
+//       CHECK:   %[[L2_OBJECTFIFO_1:.*]] = amdaie.logicalobjectfifo.from_memref %[[L2_ALLOC_1]], {%[[TILE_0]]} :
+//  CHECK-SAME:         memref<1x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x32x32xi32, 1 : i32>>
+//       CHECK:   %[[L3_OBJECTFIFO:.*]] = amdaie.logicalobjectfifo.from_memref %[[L3_ALLOC]], {%[[TILE_0]]} :
+//  CHECK-SAME:         memref<128x128xi32> -> !amdaie.logicalobjectfifo<memref<128x128xi32>>
+//       CHECK:   scf.forall
+//       CHECK:       %[[DMA_CPY_ND_L3_TO_L2_0:.*]] = amdaie.dma_cpy_nd(
+//  CHECK-SAME:                                         %[[L2_OBJECTFIFO_0]][0, 0, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1]
+//  CHECK-SAME:                                         %[[L3_OBJECTFIFO]][0, 0, 3, 2] [1, 1, 32, 32] [4096, 32, 128, 1]
+//       CHECK:       %[[DMA_CPY_ND_L3_TO_L2_1:.*]] = amdaie.dma_cpy_nd(
+//  CHECK-SAME:                                         %[[L2_OBJECTFIFO_1]][0, 0, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1]
+//  CHECK-SAME:                                         %[[L3_OBJECTFIFO]][0, 0, 35, 34] [1, 1, 32, 32] [4096, 32, 128, 1]
+//       CHECK:       amdaie.logicalobjectfifo.from_memref
+//       CHECK:       amdaie.logicalobjectfifo.from_memref
+//       CHECK:       amdaie.dma_cpy_nd
+//       CHECK:       amdaie.dma_cpy_nd
+//       CHECK:       %[[L1_OBJECTFIFO_0:.*]] = amdaie.logicalobjectfifo.from_memref %[[L1_ALLOC]], {%[[TILE_0]]}
+//       CHECK:       %[[DMA_CPY_ND_L2_TO_L1_0:.*]] = amdaie.dma_cpy_nd(
+//  CHECK-SAME:                                         %[[L1_OBJECTFIFO_0]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1] 
+//  CHECK-SAME:                                         %[[L2_OBJECTFIFO_0]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1]
+//       CHECK:       amdaie.core(%[[TILE_0]], in : [%{{.*}}, %{{.*}}, %[[DMA_CPY_ND_L2_TO_L1_0]]], out :
+//       CHECK:         linalg.generic
+//       CHECK:       }
+//       CHECK:       %[[L1_OBJECTFIFO_1:.*]] = amdaie.logicalobjectfifo.from_memref %[[L1_ALLOC]], {%[[TILE_1]]}
+//       CHECK:       %[[DMA_CPY_ND_L2_TO_L1_1:.*]] = amdaie.dma_cpy_nd(
+//  CHECK-SAME:                                         %[[L1_OBJECTFIFO_1]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1]
+//  CHECK-SAME:                                         %[[L2_OBJECTFIFO_1]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1]
+//       CHECK:       amdaie.core(%[[TILE_1]], in : [%{{.*}}, %{{.*}}, %[[DMA_CPY_ND_L2_TO_L1_1]]], out :
+//       CHECK:         linalg.generic
+//       CHECK:       }
+//       CHECK:   memref.dealloc %[[L2_ALLOC_0]] : memref<1x1x32x32xi32, 1 : i32>
+//       CHECK:   memref.dealloc %[[L2_ALLOC_1]] : memref<1x1x32x32xi32, 1 : i32>
+#map = affine_map<(d0) -> (d0 * 64)>
+#map1 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>
+#map2 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>
+#map3 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>
+#map4 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>
+module {
+  func.func @attribute_offset_for_l3_source(%arg0: !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>, %arg1: !amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xi32, 2 : i32>>, %arg2: !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>, %arg3: !amdaie.logicalobjectfifo<memref<2x2x32x32xi32, 1 : i32>>) {
+    %c3 = arith.constant 3 : index
+    %c2 = arith.constant 2 : index
+    %c1 = arith.constant 1 : index
+    %c0 = arith.constant 0 : index
+    %alloc = memref.alloc() : memref<2x1x32x32xi32, 1 : i32>
+    %alloc_0 = memref.alloc() : memref<1x2x32x32xi32, 1 : i32>
+    %alloc_1 = memref.alloc() : memref<2x2x32x32xi32, 1 : i32>
+    %alloc_2 = memref.alloc() : memref<128x128xi32>
+    %alloc_3 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
+    %tile = amdaie.tile(%c1, %c3)
+    %tile_4 = amdaie.tile(%c0, %c2)
+    %0 = amdaie.logicalobjectfifo.from_memref %alloc_1, {%tile} : memref<2x2x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x2x32x32xi32, 1 : i32>>
+    %1 = amdaie.logicalobjectfifo.from_memref %alloc_2, {%tile} : memref<128x128xi32> -> !amdaie.logicalobjectfifo<memref<128x128xi32>>
+    scf.forall (%arg4, %arg5) in (2, 2) {
+      %4 = amdaie.dma_cpy_nd(%0[0, 0, 0, 0] [2, 2, 32, 32] [2048, 1024, 32, 1], %1[0, 0, 3, 2] [2, 2, 32, 32] [4096, 32, 128, 1]) : (!amdaie.logicalobjectfifo<memref<2x2x32x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<128x128xi32>>)
+      %tile_5 = amdaie.tile(%c1, %c3)
+      %5 = amdaie.logicalobjectfifo.from_memref %alloc, {%tile} : memref<2x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x1x32x32xi32, 1 : i32>>
+      %6 = amdaie.logicalobjectfifo.from_memref %alloc_0, {%tile} : memref<1x2x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x2x32x32xi32, 1 : i32>>
+      %7 = amdaie.dma_cpy_nd(%arg0[0, 0, 0, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1024, 1024, 256, 32, 8, 1], %5[1, 0, 0, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1024, 1024, 8, 128, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<2x1x32x32xi32, 1 : i32>>)
+      %8 = amdaie.dma_cpy_nd(%arg1[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [1024, 1024, 128, 32, 4, 1], %6[0, 1, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [2048, 1024, 4, 256, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<1x2x32x32xi32, 1 : i32>>)
+      %9 = amdaie.logicalobjectfifo.from_memref %alloc_3, {%tile} : memref<1x1x8x8x4x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>
+      %10 = amdaie.dma_cpy_nd(%9[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1], %0[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<2x2x32x32xi32, 1 : i32>>)
+      %11 = amdaie.dma_cpy_nd(%arg3[1, 1, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1], %arg2[0, 0, 0, 0] [8, 4, 8, 4] [16, 4, 128, 1]) : (!amdaie.logicalobjectfifo<memref<2x2x32x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>)
+      %12 = amdaie.core(%tile_5, in : [%7, %8, %10], out : [%11]) {
+        %16 = amdaie.logicalobjectfifo.access(%arg0, Read) : !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>> -> memref<1x1x4x8x4x8xi32, 2 : i32>
+        %17 = amdaie.logicalobjectfifo.access(%arg1, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xi32, 2 : i32>> -> memref<1x1x8x4x8x4xi32, 2 : i32>
+        %18 = amdaie.logicalobjectfifo.access(%arg2, None) : !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>> -> memref<1x1x8x8x4x4xi32, 2 : i32>
+        linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%16, %17 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%18 : memref<1x1x8x8x4x4xi32, 2 : i32>) {
+        ^bb0(%in: i32, %in_6: i32, %out: i32):
+          %21 = arith.muli %in, %in_6 : i32
+          %22 = arith.addi %out, %21 : i32
+          linalg.yield %22 : i32
+        }
+        %19 = amdaie.logicalobjectfifo.access(%arg2, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>> -> memref<1x1x8x8x4x4xi32, 2 : i32>
+        %20 = amdaie.logicalobjectfifo.access(%arg2, Write) : !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>> -> memref<1x1x8x8x4x4xi32, 2 : i32>
+        linalg.generic {indexing_maps = [#map4, #map4, #map4], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%18, %19 : memref<1x1x8x8x4x4xi32, 2 : i32>, memref<1x1x8x8x4x4xi32, 2 : i32>) outs(%20 : memref<1x1x8x8x4x4xi32, 2 : i32>) {
+        ^bb0(%in: i32, %in_6: i32, %out: i32):
+          %21 = arith.addi %in, %in_6 : i32
+          linalg.yield %21 : i32
+        }
+        amdaie.end
+      }
+      %13 = amdaie.logicalobjectfifo.from_memref %alloc_3, {%tile_4} : memref<1x1x8x8x4x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>
+      %14 = amdaie.dma_cpy_nd(%13[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1], %0[1, 1, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<2x2x32x32xi32, 1 : i32>>)
+      %15 = amdaie.core(%tile_4, in : [%7, %8, %14], out : [%11]) {
+        %16 = amdaie.logicalobjectfifo.access(%arg0, Read) : !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>> -> memref<1x1x4x8x4x8xi32, 2 : i32>
+        %17 = amdaie.logicalobjectfifo.access(%arg1, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xi32, 2 : i32>> -> memref<1x1x8x4x8x4xi32, 2 : i32>
+        %18 = amdaie.logicalobjectfifo.access(%arg2, None) : !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>> -> memref<1x1x8x8x4x4xi32, 2 : i32>
+        linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%16, %17 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%18 : memref<1x1x8x8x4x4xi32, 2 : i32>) {
+        ^bb0(%in: i32, %in_6: i32, %out: i32):
+          %21 = arith.muli %in, %in_6 : i32
+          %22 = arith.addi %out, %21 : i32
+          linalg.yield %22 : i32
+        }
+        %19 = amdaie.logicalobjectfifo.access(%arg2, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>> -> memref<1x1x8x8x4x4xi32, 2 : i32>
+        %20 = amdaie.logicalobjectfifo.access(%arg2, Write) : !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>> -> memref<1x1x8x8x4x4xi32, 2 : i32>
+        linalg.generic {indexing_maps = [#map4, #map4, #map4], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%18, %19 : memref<1x1x8x8x4x4xi32, 2 : i32>, memref<1x1x8x8x4x4xi32, 2 : i32>) outs(%20 : memref<1x1x8x8x4x4xi32, 2 : i32>) {
+        ^bb0(%in: i32, %in_6: i32, %out: i32):
+          %21 = arith.addi %in, %in_6 : i32
+          linalg.yield %21 : i32
+        }
+        amdaie.end
+      }
+    } {mapping = [#gpu.block<y>, #gpu.block<x>]}
+    memref.dealloc %alloc : memref<2x1x32x32xi32, 1 : i32>
+    memref.dealloc %alloc_3 : memref<1x1x8x8x4x4xi32, 2 : i32>
+    memref.dealloc %alloc_0 : memref<1x2x32x32xi32, 1 : i32>
+    memref.dealloc %alloc_1 : memref<2x2x32x32xi32, 1 : i32>
+    memref.dealloc %alloc_2 : memref<128x128xi32>
+    return
+  }
+}
+
+// -----
+
+// Test splitting of logicalobjectfifo when offset of L3 source is a function argument.
+
+//   CHECK-DAG: #map = affine_map<(d0) -> (d0 + 32)>
+//   CHECK-DAG: #map1 = affine_map<(d0) -> (d0)>
+//       CHECK: @block_argument_of_funcOp_offset
+//  CHECK-SAME:   %[[ARG0:.*]]: index, %[[ARG1:.*]]: index,
+//   CHECK-DAG:   %[[IV1_32:.*]] = affine.apply #map(%[[ARG1]])
+//   CHECK-DAG:   %[[IV0_32:.*]] = affine.apply #map(%[[ARG0]])
+//   CHECK-DAG:   %[[IV1_0:.*]] = affine.apply #map1(%[[ARG1]])
+//   CHECK-DAG:   %[[IV0_0:.*]] = affine.apply #map1(%[[ARG0]])
+//   CHECK-DAG:   %[[C0:.*]] = arith.constant 0 : index
+//   CHECK-DAG:   %[[C1:.*]] = arith.constant 1 : index
+//   CHECK-DAG:   %[[C2:.*]] = arith.constant 2 : index
+//   CHECK-DAG:   %[[C3:.*]] = arith.constant 3 : index
+//   CHECK-DAG:   %[[L3_ALLOC:.*]] = memref.alloc() : memref<128x128xi32>
+//   CHECK-DAG:   %[[L2_ALLOC_0:.*]] = memref.alloc() : memref<1x1x32x32xi32, 1 : i32>
+//   CHECK-DAG:   %[[L2_ALLOC_1:.*]] = memref.alloc() : memref<1x1x32x32xi32, 1 : i32>
+//   CHECK-DAG:   %[[L1_ALLOC:.*]] = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
+//   CHECK-DAG:   %[[TILE_0:.*]] = amdaie.tile(%[[C1]], %[[C3]])
+//   CHECK-DAG:   %[[TILE_1:.*]] = amdaie.tile(%[[C0]], %[[C2]])
+//       CHECK:   %[[L2_OBJECTFIFO_0:.*]] = amdaie.logicalobjectfifo.from_memref %[[L2_ALLOC_0]], {%[[TILE_0]]} :
+//  CHECK-SAME:         memref<1x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x32x32xi32, 1 : i32>>
+//       CHECK:   %[[L2_OBJECTFIFO_1:.*]] = amdaie.logicalobjectfifo.from_memref %[[L2_ALLOC_1]], {%[[TILE_0]]} :
+//  CHECK-SAME:         memref<1x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x32x32xi32, 1 : i32>>
+//       CHECK:   %[[L3_OBJECTFIFO:.*]] = amdaie.logicalobjectfifo.from_memref %[[L3_ALLOC]], {%[[TILE_0]]} :
+//  CHECK-SAME:         memref<128x128xi32> -> !amdaie.logicalobjectfifo<memref<128x128xi32>>
+//       CHECK:   scf.forall (%[[IV0:.*]], %[[IV1:.*]]) in (2, 2)
+//       CHECK:       %[[DMA_CPY_ND_L3_TO_L2_0:.*]] = amdaie.dma_cpy_nd(
+//  CHECK-SAME:                                         %[[L2_OBJECTFIFO_0]][0, 0, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1]
+//  CHECK-SAME:                                         %[[L3_OBJECTFIFO]][0, 0, %[[IV0_0]], %[[IV1_0]]] [1, 1, 32, 32] [4096, 32, 128, 1]
+//       CHECK:       %[[DMA_CPY_ND_L3_TO_L2_1:.*]] = amdaie.dma_cpy_nd(
+//  CHECK-SAME:                                         %[[L2_OBJECTFIFO_1]][0, 0, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1]
+//  CHECK-SAME:                                         %[[L3_OBJECTFIFO]][0, 0, %[[IV0_32]], %[[IV1_32]]] [1, 1, 32, 32] [4096, 32, 128, 1]
+//       CHECK:       amdaie.logicalobjectfifo.from_memref
+//       CHECK:       amdaie.logicalobjectfifo.from_memref
+//       CHECK:       amdaie.dma_cpy_nd
+//       CHECK:       amdaie.dma_cpy_nd
+//       CHECK:       %[[L1_OBJECTFIFO_0:.*]] = amdaie.logicalobjectfifo.from_memref %[[L1_ALLOC]], {%[[TILE_0]]}
+//       CHECK:       %[[DMA_CPY_ND_L2_TO_L1_0:.*]] = amdaie.dma_cpy_nd(
+//  CHECK-SAME:                                         %[[L1_OBJECTFIFO_0]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1]
+//  CHECK-SAME:                                         %[[L2_OBJECTFIFO_0]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1]
+//       CHECK:       amdaie.core(%[[TILE_0]], in : [%{{.*}}, %{{.*}}, %[[DMA_CPY_ND_L2_TO_L1_0]]], out :
+//       CHECK:         linalg.generic
+//       CHECK:       }
+//       CHECK:       %[[L1_OBJECTFIFO_1:.*]] = amdaie.logicalobjectfifo.from_memref %[[L1_ALLOC]], {%[[TILE_1]]}
+//       CHECK:       %[[DMA_CPY_ND_L2_TO_L1_1:.*]] = amdaie.dma_cpy_nd(
+//  CHECK-SAME:                                         %[[L1_OBJECTFIFO_1]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1]
+//  CHECK-SAME:                                         %[[L2_OBJECTFIFO_1]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1]
+//       CHECK:       amdaie.core(%[[TILE_1]], in : [%{{.*}}, %{{.*}}, %[[DMA_CPY_ND_L2_TO_L1_1]]], out :
+//       CHECK:         linalg.generic
+//       CHECK:       }
+//       CHECK:   memref.dealloc %[[L2_ALLOC_0]] : memref<1x1x32x32xi32, 1 : i32>
+//       CHECK:   memref.dealloc %[[L2_ALLOC_1]] : memref<1x1x32x32xi32, 1 : i32>
+#map = affine_map<(d0) -> (d0 * 64)>  // NOTE: this alias is not referenced anywhere in this test case's input IR.
+#map1 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>
+#map2 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>
+#map3 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>
+#map4 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>
+module {  // Input IR: the L3->L2 DMA (%4) takes its two non-constant source offsets from the func.func arguments %3 and %2.
+  func.func @block_argument_of_funcOp_offset(%3: index, %2: index, %arg0: !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>, %arg1: !amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xi32, 2 : i32>>, %arg2: !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>, %arg3: !amdaie.logicalobjectfifo<memref<2x2x32x32xi32, 1 : i32>>) {
+    %c3 = arith.constant 3 : index
+    %c2 = arith.constant 2 : index
+    %c1 = arith.constant 1 : index
+    %c0 = arith.constant 0 : index
+    %alloc = memref.alloc() : memref<2x1x32x32xi32, 1 : i32>
+    %alloc_0 = memref.alloc() : memref<1x2x32x32xi32, 1 : i32>
+    %alloc_1 = memref.alloc() : memref<2x2x32x32xi32, 1 : i32>  // Backs %0, the source of both L2->L1 DMAs (%10, %14); the expected output above has two 1x1x32x32 buffers in memory space 1.
+    %alloc_2 = memref.alloc() : memref<128x128xi32>  // Backs %1, the source of the L3->L2 DMA %4.
+    %alloc_3 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>  // Backs both per-core logicalobjectfifos %9 and %13.
+    %tile = amdaie.tile(%c1, %c3)
+    %tile_4 = amdaie.tile(%c0, %c2)
+    %0 = amdaie.logicalobjectfifo.from_memref %alloc_1, {%tile} : memref<2x2x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x2x32x32xi32, 1 : i32>>
+    %1 = amdaie.logicalobjectfifo.from_memref %alloc_2, {%tile} : memref<128x128xi32> -> !amdaie.logicalobjectfifo<memref<128x128xi32>>
+    scf.forall (%arg4, %arg5) in (2, 2) {  // The induction variables are unused here; the DMA offsets come from %3 and %2.
+      %4 = amdaie.dma_cpy_nd(%0[0, 0, 0, 0] [2, 2, 32, 32] [2048, 1024, 32, 1], %1[0, 0, %3, %2] [2, 2, 32, 32] [4096, 32, 128, 1]) : (!amdaie.logicalobjectfifo<memref<2x2x32x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<128x128xi32>>)
+      %tile_5 = amdaie.tile(%c1, %c3)
+      %5 = amdaie.logicalobjectfifo.from_memref %alloc, {%tile} : memref<2x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x1x32x32xi32, 1 : i32>>
+      %6 = amdaie.logicalobjectfifo.from_memref %alloc_0, {%tile} : memref<1x2x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x2x32x32xi32, 1 : i32>>
+      %7 = amdaie.dma_cpy_nd(%arg0[0, 0, 0, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1024, 1024, 256, 32, 8, 1], %5[1, 0, 0, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1024, 1024, 8, 128, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<2x1x32x32xi32, 1 : i32>>)
+      %8 = amdaie.dma_cpy_nd(%arg1[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [1024, 1024, 128, 32, 4, 1], %6[0, 1, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [2048, 1024, 4, 256, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<1x2x32x32xi32, 1 : i32>>)
+      %9 = amdaie.logicalobjectfifo.from_memref %alloc_3, {%tile} : memref<1x1x8x8x4x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>
+      %10 = amdaie.dma_cpy_nd(%9[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1], %0[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<2x2x32x32xi32, 1 : i32>>)
+      %11 = amdaie.dma_cpy_nd(%arg3[1, 1, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1], %arg2[0, 0, 0, 0] [8, 4, 8, 4] [16, 4, 128, 1]) : (!amdaie.logicalobjectfifo<memref<2x2x32x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>)
+      %12 = amdaie.core(%tile_5, in : [%7, %8, %10], out : [%11]) {  // First core: its third input %10 reads %0 at offsets [0, 0, ...].
+        %16 = amdaie.logicalobjectfifo.access(%arg0, Read) : !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>> -> memref<1x1x4x8x4x8xi32, 2 : i32>
+        %17 = amdaie.logicalobjectfifo.access(%arg1, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xi32, 2 : i32>> -> memref<1x1x8x4x8x4xi32, 2 : i32>
+        %18 = amdaie.logicalobjectfifo.access(%arg2, None) : !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>> -> memref<1x1x8x8x4x4xi32, 2 : i32>
+        linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%16, %17 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%18 : memref<1x1x8x8x4x4xi32, 2 : i32>) {
+        ^bb0(%in: i32, %in_6: i32, %out: i32):
+          %21 = arith.muli %in, %in_6 : i32
+          %22 = arith.addi %out, %21 : i32
+          linalg.yield %22 : i32
+        }
+        %19 = amdaie.logicalobjectfifo.access(%arg2, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>> -> memref<1x1x8x8x4x4xi32, 2 : i32>
+        %20 = amdaie.logicalobjectfifo.access(%arg2, Write) : !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>> -> memref<1x1x8x8x4x4xi32, 2 : i32>
+        linalg.generic {indexing_maps = [#map4, #map4, #map4], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%18, %19 : memref<1x1x8x8x4x4xi32, 2 : i32>, memref<1x1x8x8x4x4xi32, 2 : i32>) outs(%20 : memref<1x1x8x8x4x4xi32, 2 : i32>) {
+        ^bb0(%in: i32, %in_6: i32, %out: i32):
+          %21 = arith.addi %in, %in_6 : i32
+          linalg.yield %21 : i32
+        }
+        amdaie.end
+      }
+      %13 = amdaie.logicalobjectfifo.from_memref %alloc_3, {%tile_4} : memref<1x1x8x8x4x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>
+      %14 = amdaie.dma_cpy_nd(%13[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1], %0[1, 1, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<2x2x32x32xi32, 1 : i32>>)
+      %15 = amdaie.core(%tile_4, in : [%7, %8, %14], out : [%11]) {  // Second core: its third input %14 reads %0 at offsets [1, 1, ...].
+        %16 = amdaie.logicalobjectfifo.access(%arg0, Read) : !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>> -> memref<1x1x4x8x4x8xi32, 2 : i32>
+        %17 = amdaie.logicalobjectfifo.access(%arg1, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xi32, 2 : i32>> -> memref<1x1x8x4x8x4xi32, 2 : i32>
+        %18 = amdaie.logicalobjectfifo.access(%arg2, None) : !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>> -> memref<1x1x8x8x4x4xi32, 2 : i32>
+        linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%16, %17 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%18 : memref<1x1x8x8x4x4xi32, 2 : i32>) {
+        ^bb0(%in: i32, %in_6: i32, %out: i32):
+          %21 = arith.muli %in, %in_6 : i32
+          %22 = arith.addi %out, %21 : i32
+          linalg.yield %22 : i32
+        }
+        %19 = amdaie.logicalobjectfifo.access(%arg2, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>> -> memref<1x1x8x8x4x4xi32, 2 : i32>
+        %20 = amdaie.logicalobjectfifo.access(%arg2, Write) : !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>> -> memref<1x1x8x8x4x4xi32, 2 : i32>
+        linalg.generic {indexing_maps = [#map4, #map4, #map4], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%18, %19 : memref<1x1x8x8x4x4xi32, 2 : i32>, memref<1x1x8x8x4x4xi32, 2 : i32>) outs(%20 : memref<1x1x8x8x4x4xi32, 2 : i32>) {
+        ^bb0(%in: i32, %in_6: i32, %out: i32):
+          %21 = arith.addi %in, %in_6 : i32
+          linalg.yield %21 : i32
+        }
+        amdaie.end
+      }
+    } {mapping = [#gpu.block<y>, #gpu.block<x>]}
+    memref.dealloc %alloc : memref<2x1x32x32xi32, 1 : i32>
+    memref.dealloc %alloc_3 : memref<1x1x8x8x4x4xi32, 2 : i32>
+    memref.dealloc %alloc_0 : memref<1x2x32x32xi32, 1 : i32>
+    memref.dealloc %alloc_1 : memref<2x2x32x32xi32, 1 : i32>
+    memref.dealloc %alloc_2 : memref<128x128xi32>
+    return
+  }
+}
+
+// -----
+
+// Test splitting of logicalobjectfifo when offset of L3 source is a forall induction variable.
+
+//   CHECK-DAG: #map = affine_map<(d0) -> (d0 + 32)>
+//   CHECK-DAG: #map1 = affine_map<(d0) -> (d0)>
+//       CHECK: @block_argument_of_scf_forall_offset
+//   CHECK-DAG:   %[[C0:.*]] = arith.constant 0 : index
+//   CHECK-DAG:   %[[C1:.*]] = arith.constant 1 : index
+//   CHECK-DAG:   %[[C2:.*]] = arith.constant 2 : index
+//   CHECK-DAG:   %[[C3:.*]] = arith.constant 3 : index
+//   CHECK-DAG:   %[[L3_ALLOC:.*]] = memref.alloc() : memref<128x128xi32>
+//   CHECK-DAG:   %[[L2_ALLOC_0:.*]] = memref.alloc() : memref<1x1x32x32xi32, 1 : i32>
+//   CHECK-DAG:   %[[L2_ALLOC_1:.*]] = memref.alloc() : memref<1x1x32x32xi32, 1 : i32>
+//   CHECK-DAG:   %[[L1_ALLOC:.*]] = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
+//   CHECK-DAG:   %[[TILE_0:.*]] = amdaie.tile(%[[C1]], %[[C3]])
+//   CHECK-DAG:   %[[TILE_1:.*]] = amdaie.tile(%[[C0]], %[[C2]])
+//       CHECK:   %[[L2_OBJECTFIFO_0:.*]] = amdaie.logicalobjectfifo.from_memref %[[L2_ALLOC_0]], {%[[TILE_0]]} :
+//  CHECK-SAME:         memref<1x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x32x32xi32, 1 : i32>>
+//       CHECK:   %[[L2_OBJECTFIFO_1:.*]] = amdaie.logicalobjectfifo.from_memref %[[L2_ALLOC_1]], {%[[TILE_0]]} :
+//  CHECK-SAME:         memref<1x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x32x32xi32, 1 : i32>>
+//       CHECK:   %[[L3_OBJECTFIFO:.*]] = amdaie.logicalobjectfifo.from_memref %[[L3_ALLOC]], {%[[TILE_0]]} :
+//  CHECK-SAME:         memref<128x128xi32> -> !amdaie.logicalobjectfifo<memref<128x128xi32>>
+//       CHECK:   scf.forall (%[[IV0:.*]], %[[IV1:.*]]) in (2, 2)
+//   CHECK-DAG:       %[[IV1_32:.*]] = affine.apply #map(%[[IV1]])
+//   CHECK-DAG:       %[[IV1_0:.*]] = affine.apply #map1(%[[IV1]])
+//   CHECK-DAG:       %[[IV0_32:.*]] = affine.apply #map(%[[IV0]])
+//   CHECK-DAG:       %[[IV0_0:.*]] = affine.apply #map1(%[[IV0]])
+//       CHECK:       %[[DMA_CPY_ND_L3_TO_L2_0:.*]] = amdaie.dma_cpy_nd(
+//  CHECK-SAME:                                         %[[L2_OBJECTFIFO_0]][0, 0, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1]
+//  CHECK-SAME:                                         %[[L3_OBJECTFIFO]][0, 0, %[[IV0_0]], %[[IV1_0]]] [1, 1, 32, 32] [4096, 32, 128, 1]
+//       CHECK:       %[[DMA_CPY_ND_L3_TO_L2_1:.*]] = amdaie.dma_cpy_nd(
+//  CHECK-SAME:                                         %[[L2_OBJECTFIFO_1]][0, 0, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1]
+//  CHECK-SAME:                                         %[[L3_OBJECTFIFO]][0, 0, %[[IV0_32]], %[[IV1_32]]] [1, 1, 32, 32] [4096, 32, 128, 1]
+//       CHECK:       amdaie.logicalobjectfifo.from_memref
+//       CHECK:       amdaie.logicalobjectfifo.from_memref
+//       CHECK:       amdaie.dma_cpy_nd
+//       CHECK:       amdaie.dma_cpy_nd
+//       CHECK:       %[[L1_OBJECTFIFO_0:.*]] = amdaie.logicalobjectfifo.from_memref %[[L1_ALLOC]], {%[[TILE_0]]}
+//       CHECK:       %[[DMA_CPY_ND_L2_TO_L1_0:.*]] = amdaie.dma_cpy_nd(
+//  CHECK-SAME:                                         %[[L1_OBJECTFIFO_0]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1]
+//  CHECK-SAME:                                         %[[L2_OBJECTFIFO_0]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1]
+//       CHECK:       amdaie.core(%[[TILE_0]], in : [%{{.*}}, %{{.*}}, %[[DMA_CPY_ND_L2_TO_L1_0]]], out :
+//       CHECK:         linalg.generic
+//       CHECK:       }
+//       CHECK:       %[[L1_OBJECTFIFO_1:.*]] = amdaie.logicalobjectfifo.from_memref %[[L1_ALLOC]], {%[[TILE_1]]}
+//       CHECK:       %[[DMA_CPY_ND_L2_TO_L1_1:.*]] = amdaie.dma_cpy_nd(
+//  CHECK-SAME:                                         %[[L1_OBJECTFIFO_1]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1]
+//  CHECK-SAME:                                         %[[L2_OBJECTFIFO_1]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1]
+//       CHECK:       amdaie.core(%[[TILE_1]], in : [%{{.*}}, %{{.*}}, %[[DMA_CPY_ND_L2_TO_L1_1]]], out :
+//       CHECK:         linalg.generic
+//       CHECK:       }
+//       CHECK:   memref.dealloc %[[L2_ALLOC_0]] : memref<1x1x32x32xi32, 1 : i32>
+//       CHECK:   memref.dealloc %[[L2_ALLOC_1]] : memref<1x1x32x32xi32, 1 : i32>
+#map = affine_map<(d0) -> (d0 * 64)>  // NOTE: this alias is not referenced anywhere in this test case's input IR.
+#map1 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>
+#map2 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>
+#map3 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>
+#map4 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>
+module {  // Input IR: the L3->L2 DMA (%4) takes its two non-constant source offsets from the scf.forall induction variables %arg4 and %arg5.
+  func.func @block_argument_of_scf_forall_offset(%arg0: !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>, %arg1: !amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xi32, 2 : i32>>, %arg2: !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>, %arg3: !amdaie.logicalobjectfifo<memref<2x2x32x32xi32, 1 : i32>>) {
+    %c3 = arith.constant 3 : index
+    %c2 = arith.constant 2 : index
+    %c1 = arith.constant 1 : index
+    %c0 = arith.constant 0 : index
+    %alloc = memref.alloc() : memref<2x1x32x32xi32, 1 : i32>
+    %alloc_0 = memref.alloc() : memref<1x2x32x32xi32, 1 : i32>
+    %alloc_1 = memref.alloc() : memref<2x2x32x32xi32, 1 : i32>  // Backs %0, the source of both L2->L1 DMAs (%10, %14); the expected output above has two 1x1x32x32 buffers in memory space 1.
+    %alloc_2 = memref.alloc() : memref<128x128xi32>  // Backs %1, the source of the L3->L2 DMA %4.
+    %alloc_3 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>  // Backs both per-core logicalobjectfifos %9 and %13.
+    %tile = amdaie.tile(%c1, %c3)
+    %tile_4 = amdaie.tile(%c0, %c2)
+    %0 = amdaie.logicalobjectfifo.from_memref %alloc_1, {%tile} : memref<2x2x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x2x32x32xi32, 1 : i32>>
+    %1 = amdaie.logicalobjectfifo.from_memref %alloc_2, {%tile} : memref<128x128xi32> -> !amdaie.logicalobjectfifo<memref<128x128xi32>>
+    scf.forall (%arg4, %arg5) in (2, 2) {  // %arg4 and %arg5 are used directly as source offsets of %4 below.
+      %4 = amdaie.dma_cpy_nd(%0[0, 0, 0, 0] [2, 2, 32, 32] [2048, 1024, 32, 1], %1[0, 0, %arg4, %arg5] [2, 2, 32, 32] [4096, 32, 128, 1]) : (!amdaie.logicalobjectfifo<memref<2x2x32x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<128x128xi32>>)
+      %tile_5 = amdaie.tile(%c1, %c3)
+      %5 = amdaie.logicalobjectfifo.from_memref %alloc, {%tile} : memref<2x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x1x32x32xi32, 1 : i32>>
+      %6 = amdaie.logicalobjectfifo.from_memref %alloc_0, {%tile} : memref<1x2x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x2x32x32xi32, 1 : i32>>
+      %7 = amdaie.dma_cpy_nd(%arg0[0, 0, 0, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1024, 1024, 256, 32, 8, 1], %5[1, 0, 0, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1024, 1024, 8, 128, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<2x1x32x32xi32, 1 : i32>>)
+      %8 = amdaie.dma_cpy_nd(%arg1[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [1024, 1024, 128, 32, 4, 1], %6[0, 1, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [2048, 1024, 4, 256, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<1x2x32x32xi32, 1 : i32>>)
+      %9 = amdaie.logicalobjectfifo.from_memref %alloc_3, {%tile} : memref<1x1x8x8x4x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>
+      %10 = amdaie.dma_cpy_nd(%9[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1], %0[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<2x2x32x32xi32, 1 : i32>>)
+      %11 = amdaie.dma_cpy_nd(%arg3[1, 1, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1], %arg2[0, 0, 0, 0] [8, 4, 8, 4] [16, 4, 128, 1]) : (!amdaie.logicalobjectfifo<memref<2x2x32x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>)
+      %12 = amdaie.core(%tile_5, in : [%7, %8, %10], out : [%11]) {  // First core: its third input %10 reads %0 at offsets [0, 0, ...].
+        %16 = amdaie.logicalobjectfifo.access(%arg0, Read) : !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>> -> memref<1x1x4x8x4x8xi32, 2 : i32>
+        %17 = amdaie.logicalobjectfifo.access(%arg1, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xi32, 2 : i32>> -> memref<1x1x8x4x8x4xi32, 2 : i32>
+        %18 = amdaie.logicalobjectfifo.access(%arg2, None) : !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>> -> memref<1x1x8x8x4x4xi32, 2 : i32>
+        linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%16, %17 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%18 : memref<1x1x8x8x4x4xi32, 2 : i32>) {
+        ^bb0(%in: i32, %in_6: i32, %out: i32):
+          %21 = arith.muli %in, %in_6 : i32
+          %22 = arith.addi %out, %21 : i32
+          linalg.yield %22 : i32
+        }
+        %19 = amdaie.logicalobjectfifo.access(%arg2, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>> -> memref<1x1x8x8x4x4xi32, 2 : i32>
+        %20 = amdaie.logicalobjectfifo.access(%arg2, Write) : !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>> -> memref<1x1x8x8x4x4xi32, 2 : i32>
+        linalg.generic {indexing_maps = [#map4, #map4, #map4], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%18, %19 : memref<1x1x8x8x4x4xi32, 2 : i32>, memref<1x1x8x8x4x4xi32, 2 : i32>) outs(%20 : memref<1x1x8x8x4x4xi32, 2 : i32>) {
+        ^bb0(%in: i32, %in_6: i32, %out: i32):
+          %21 = arith.addi %in, %in_6 : i32
+          linalg.yield %21 : i32
+        }
+        amdaie.end
+      }
+      %13 = amdaie.logicalobjectfifo.from_memref %alloc_3, {%tile_4} : memref<1x1x8x8x4x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>
+      %14 = amdaie.dma_cpy_nd(%13[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1], %0[1, 1, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<2x2x32x32xi32, 1 : i32>>)
+      %15 = amdaie.core(%tile_4, in : [%7, %8, %14], out : [%11]) {  // Second core: its third input %14 reads %0 at offsets [1, 1, ...].
+        %16 = amdaie.logicalobjectfifo.access(%arg0, Read) : !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>> -> memref<1x1x4x8x4x8xi32, 2 : i32>
+        %17 = amdaie.logicalobjectfifo.access(%arg1, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xi32, 2 : i32>> -> memref<1x1x8x4x8x4xi32, 2 : i32>
+        %18 = amdaie.logicalobjectfifo.access(%arg2, None) : !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>> -> memref<1x1x8x8x4x4xi32, 2 : i32>
+        linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%16, %17 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%18 : memref<1x1x8x8x4x4xi32, 2 : i32>) {
+        ^bb0(%in: i32, %in_6: i32, %out: i32):
+          %21 = arith.muli %in, %in_6 : i32
+          %22 = arith.addi %out, %21 : i32
+          linalg.yield %22 : i32
+        }
+        %19 = amdaie.logicalobjectfifo.access(%arg2, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>> -> memref<1x1x8x8x4x4xi32, 2 : i32>
+        %20 = amdaie.logicalobjectfifo.access(%arg2, Write) : !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>> -> memref<1x1x8x8x4x4xi32, 2 : i32>
+        linalg.generic {indexing_maps = [#map4, #map4, #map4], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%18, %19 : memref<1x1x8x8x4x4xi32, 2 : i32>, memref<1x1x8x8x4x4xi32, 2 : i32>) outs(%20 : memref<1x1x8x8x4x4xi32, 2 : i32>) {
+        ^bb0(%in: i32, %in_6: i32, %out: i32):
+          %21 = arith.addi %in, %in_6 : i32
+          linalg.yield %21 : i32
+        }
+        amdaie.end
+      }
+    } {mapping = [#gpu.block<y>, #gpu.block<x>]}
+    memref.dealloc %alloc : memref<2x1x32x32xi32, 1 : i32>
+    memref.dealloc %alloc_3 : memref<1x1x8x8x4x4xi32, 2 : i32>
+    memref.dealloc %alloc_0 : memref<1x2x32x32xi32, 1 : i32>
+    memref.dealloc %alloc_1 : memref<2x2x32x32xi32, 1 : i32>
+    memref.dealloc %alloc_2 : memref<128x128xi32>
+    return
+  }
+}
+
+// -----
+
+// Since the L2->L1 DmaOps read from different source logicalobjectfifos, splitting won't take place.
+
+// CHECK-LABEL: @different_logical_objectfifo
+//   CHECK-NOT:     memref<1x1x32x32xi32, 1 : i32>
+#map = affine_map<(d0) -> (d0 * 64)>
+#map1 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>
+#map2 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>
+#map3 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>
+#map4 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>
+module {  // NOTE(review): the copies %10 and %14 below read from two different objectFifo values (%0 vs %00); presumably that is what the function name refers to.
+  func.func @different_logical_objectfifo(%arg0: !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>, %arg1: !amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xi32, 2 : i32>>, %arg2: !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>, %arg3: !amdaie.logicalobjectfifo<memref<2x2x32x32xi32, 1 : i32>>) {
+    %c3 = arith.constant 3 : index
+    %c2 = arith.constant 2 : index
+    %c1 = arith.constant 1 : index
+    %c0 = arith.constant 0 : index
+    %alloc = memref.alloc() : memref<2x1x32x32xi32, 1 : i32>
+    %alloc_0 = memref.alloc() : memref<1x2x32x32xi32, 1 : i32>
+    %alloc_1 = memref.alloc() : memref<2x2x32x32xi32, 1 : i32>
+    %alloc_2 = memref.alloc() : memref<128x128xi32>
+    %alloc_3 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
+    %tile = amdaie.tile(%c1, %c3)
+    %tile_4 = amdaie.tile(%c0, %c2)
+    %0 = amdaie.logicalobjectfifo.from_memref %alloc_1, {%tile} : memref<2x2x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x2x32x32xi32, 1 : i32>>
+    %00 = amdaie.logicalobjectfifo.from_memref %alloc_1, {%tile} : memref<2x2x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x2x32x32xi32, 1 : i32>>  // Second objectFifo built from the same %alloc_1 as %0; only %14 reads from it.
+    %1 = amdaie.logicalobjectfifo.from_memref %alloc_2, {%tile} : memref<128x128xi32> -> !amdaie.logicalobjectfifo<memref<128x128xi32>>
+    scf.forall (%arg4, %arg5) in (2, 2) {
+      %2 = affine.apply #map(%arg5)
+      %3 = affine.apply #map(%arg4)
+      %4 = amdaie.dma_cpy_nd(%0[0, 0, 0, 0] [2, 2, 32, 32] [2048, 1024, 32, 1], %1[0, 0, %3, %2] [2, 2, 32, 32] [4096, 32, 128, 1]) : (!amdaie.logicalobjectfifo<memref<2x2x32x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<128x128xi32>>)
+      %tile_5 = amdaie.tile(%c1, %c3)
+      %5 = amdaie.logicalobjectfifo.from_memref %alloc, {%tile} : memref<2x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x1x32x32xi32, 1 : i32>>
+      %6 = amdaie.logicalobjectfifo.from_memref %alloc_0, {%tile} : memref<1x2x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x2x32x32xi32, 1 : i32>>
+      %7 = amdaie.dma_cpy_nd(%arg0[0, 0, 0, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1024, 1024, 256, 32, 8, 1], %5[1, 0, 0, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1024, 1024, 8, 128, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<2x1x32x32xi32, 1 : i32>>)
+      %8 = amdaie.dma_cpy_nd(%arg1[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [1024, 1024, 128, 32, 4, 1], %6[0, 1, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [2048, 1024, 4, 256, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<1x2x32x32xi32, 1 : i32>>)
+      %9 = amdaie.logicalobjectfifo.from_memref %alloc_3, {%tile} : memref<1x1x8x8x4x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>
+      %10 = amdaie.dma_cpy_nd(%9[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1], %0[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<2x2x32x32xi32, 1 : i32>>)  // Source of this copy is %0.
+      %11 = amdaie.dma_cpy_nd(%arg3[1, 1, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1], %arg2[0, 0, 0, 0] [8, 4, 8, 4] [16, 4, 128, 1]) : (!amdaie.logicalobjectfifo<memref<2x2x32x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>)
+      %12 = amdaie.core(%tile_5, in : [%7, %8, %10], out : [%11]) {
+        %16 = amdaie.logicalobjectfifo.access(%arg0, Read) : !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>> -> memref<1x1x4x8x4x8xi32, 2 : i32>
+        %17 = amdaie.logicalobjectfifo.access(%arg1, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xi32, 2 : i32>> -> memref<1x1x8x4x8x4xi32, 2 : i32>
+        %18 = amdaie.logicalobjectfifo.access(%arg2, None) : !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>> -> memref<1x1x8x8x4x4xi32, 2 : i32>
+        linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%16, %17 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%18 : memref<1x1x8x8x4x4xi32, 2 : i32>) {
+        ^bb0(%in: i32, %in_6: i32, %out: i32):
+          %21 = arith.muli %in, %in_6 : i32
+          %22 = arith.addi %out, %21 : i32
+          linalg.yield %22 : i32
+        }
+        %19 = amdaie.logicalobjectfifo.access(%arg2, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>> -> memref<1x1x8x8x4x4xi32, 2 : i32>
+        %20 = amdaie.logicalobjectfifo.access(%arg2, Write) : !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>> -> memref<1x1x8x8x4x4xi32, 2 : i32>
+        linalg.generic {indexing_maps = [#map4, #map4, #map4], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%18, %19 : memref<1x1x8x8x4x4xi32, 2 : i32>, memref<1x1x8x8x4x4xi32, 2 : i32>) outs(%20 : memref<1x1x8x8x4x4xi32, 2 : i32>) {
+        ^bb0(%in: i32, %in_6: i32, %out: i32):
+          %21 = arith.addi %in, %in_6 : i32
+          linalg.yield %21 : i32
+        }
+        amdaie.end
+      }
+      %13 = amdaie.logicalobjectfifo.from_memref %alloc_3, {%tile_4} : memref<1x1x8x8x4x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>
+      %14 = amdaie.dma_cpy_nd(%13[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1], %00[1, 1, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<2x2x32x32xi32, 1 : i32>>)  // Source of this copy is %00, not %0 as in %10.
+      %15 = amdaie.core(%tile_4, in : [%7, %8, %14], out : [%11]) {
+        %16 = amdaie.logicalobjectfifo.access(%arg0, Read) : !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>> -> memref<1x1x4x8x4x8xi32, 2 : i32>
+        %17 = amdaie.logicalobjectfifo.access(%arg1, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xi32, 2 : i32>> -> memref<1x1x8x4x8x4xi32, 2 : i32>
+        %18 = amdaie.logicalobjectfifo.access(%arg2, None) : !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>> -> memref<1x1x8x8x4x4xi32, 2 : i32>
+        linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%16, %17 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%18 : memref<1x1x8x8x4x4xi32, 2 : i32>) {
+        ^bb0(%in: i32, %in_6: i32, %out: i32):
+          %21 = arith.muli %in, %in_6 : i32
+          %22 = arith.addi %out, %21 : i32
+          linalg.yield %22 : i32
+        }
+        %19 = amdaie.logicalobjectfifo.access(%arg2, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>> -> memref<1x1x8x8x4x4xi32, 2 : i32>
+        %20 = amdaie.logicalobjectfifo.access(%arg2, Write) : !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>> -> memref<1x1x8x8x4x4xi32, 2 : i32>
+        linalg.generic {indexing_maps = [#map4, #map4, #map4], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%18, %19 : memref<1x1x8x8x4x4xi32, 2 : i32>, memref<1x1x8x8x4x4xi32, 2 : i32>) outs(%20 : memref<1x1x8x8x4x4xi32, 2 : i32>) {
+        ^bb0(%in: i32, %in_6: i32, %out: i32):
+          %21 = arith.addi %in, %in_6 : i32
+          linalg.yield %21 : i32
+        }
+        amdaie.end
+      }
+    } {mapping = [#gpu.block<y>, #gpu.block<x>]}
+    memref.dealloc %alloc : memref<2x1x32x32xi32, 1 : i32>
+    memref.dealloc %alloc_3 : memref<1x1x8x8x4x4xi32, 2 : i32>
+    memref.dealloc %alloc_0 : memref<1x2x32x32xi32, 1 : i32>
+    memref.dealloc %alloc_1 : memref<2x2x32x32xi32, 1 : i32>
+    memref.dealloc %alloc_2 : memref<128x128xi32>
+    return
+  }
+}
+
+// -----
+
+// The L3 source offset at a given dim is computed as: L2 source offset * L2 target size at that dim.
+// In this test the L2 source offset is not a constant, so this computation cannot be performed.
+
+// CHECK-LABEL: @non_constant_source_l2_offset
+//   CHECK-NOT:     memref<1x1x32x32xi32, 1 : i32>
+#map = affine_map<(d0) -> (d0 * 64)>
+#map1 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>
+#map2 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>
+#map3 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>
+#map4 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>
+module {
+  func.func @non_constant_source_l2_offset(%dyn_offset: index, %arg0: !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>, %arg1: !amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xi32, 2 : i32>>, %arg2: !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>, %arg3: !amdaie.logicalobjectfifo<memref<2x2x32x32xi32, 1 : i32>>) {
+    %c3 = arith.constant 3 : index
+    %c2 = arith.constant 2 : index
+    %c1 = arith.constant 1 : index
+    %c0 = arith.constant 0 : index
+    %alloc = memref.alloc() : memref<2x1x32x32xi32, 1 : i32>
+    %alloc_0 = memref.alloc() : memref<1x2x32x32xi32, 1 : i32>
+    %alloc_1 = memref.alloc() : memref<2x2x32x32xi32, 1 : i32>
+    %alloc_2 = memref.alloc() : memref<128x128xi32>
+    %alloc_3 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
+    %tile = amdaie.tile(%c1, %c3)
+    %tile_4 = amdaie.tile(%c0, %c2)
+    %0 = amdaie.logicalobjectfifo.from_memref %alloc_1, {%tile} : memref<2x2x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x2x32x32xi32, 1 : i32>>
+    %1 = amdaie.logicalobjectfifo.from_memref %alloc_2, {%tile} : memref<128x128xi32> -> !amdaie.logicalobjectfifo<memref<128x128xi32>>
+    scf.forall (%arg4, %arg5) in (2, 2) {
+      %2 = affine.apply #map(%arg5)
+      %3 = affine.apply #map(%arg4)
+      %4 = amdaie.dma_cpy_nd(%0[0, 0, 0, 0] [2, 2, 32, 32] [2048, 1024, 32, 1], %1[0, 0, %3, %2] [2, 2, 32, 32] [4096, 32, 128, 1]) : (!amdaie.logicalobjectfifo<memref<2x2x32x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<128x128xi32>>)
+      %tile_5 = amdaie.tile(%c1, %c3)
+      %5 = amdaie.logicalobjectfifo.from_memref %alloc, {%tile} : memref<2x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x1x32x32xi32, 1 : i32>>
+      %6 = amdaie.logicalobjectfifo.from_memref %alloc_0, {%tile} : memref<1x2x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x2x32x32xi32, 1 : i32>>
+      %7 = amdaie.dma_cpy_nd(%arg0[0, 0, 0, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1024, 1024, 256, 32, 8, 1], %5[1, 0, 0, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1024, 1024, 8, 128, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<2x1x32x32xi32, 1 : i32>>)
+      %8 = amdaie.dma_cpy_nd(%arg1[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [1024, 1024, 128, 32, 4, 1], %6[0, 1, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [2048, 1024, 4, 256, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<1x2x32x32xi32, 1 : i32>>)
+      %9 = amdaie.logicalobjectfifo.from_memref %alloc_3, {%tile} : memref<1x1x8x8x4x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>
+      %10 = amdaie.dma_cpy_nd(%9[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1], %0[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<2x2x32x32xi32, 1 : i32>>)
+      %11 = amdaie.dma_cpy_nd(%arg3[1, 1, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1], %arg2[0, 0, 0, 0] [8, 4, 8, 4] [16, 4, 128, 1]) : (!amdaie.logicalobjectfifo<memref<2x2x32x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>)
+      %12 = amdaie.core(%tile_5, in : [%7, %8, %10], out : [%11]) {
+        %16 = amdaie.logicalobjectfifo.access(%arg0, Read) : !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>> -> memref<1x1x4x8x4x8xi32, 2 : i32>
+        %17 = amdaie.logicalobjectfifo.access(%arg1, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xi32, 2 : i32>> -> memref<1x1x8x4x8x4xi32, 2 : i32>
+        %18 = amdaie.logicalobjectfifo.access(%arg2, None) : !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>> -> memref<1x1x8x8x4x4xi32, 2 : i32>
+        linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%16, %17 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%18 : memref<1x1x8x8x4x4xi32, 2 : i32>) {
+        ^bb0(%in: i32, %in_6: i32, %out: i32):
+          %21 = arith.muli %in, %in_6 : i32
+          %22 = arith.addi %out, %21 : i32
+          linalg.yield %22 : i32
+        }
+        %19 = amdaie.logicalobjectfifo.access(%arg2, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>> -> memref<1x1x8x8x4x4xi32, 2 : i32>
+        %20 = amdaie.logicalobjectfifo.access(%arg2, Write) : !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>> -> memref<1x1x8x8x4x4xi32, 2 : i32>
+        linalg.generic {indexing_maps = [#map4, #map4, #map4], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%18, %19 : memref<1x1x8x8x4x4xi32, 2 : i32>, memref<1x1x8x8x4x4xi32, 2 : i32>) outs(%20 : memref<1x1x8x8x4x4xi32, 2 : i32>) {
+        ^bb0(%in: i32, %in_6: i32, %out: i32):
+          %21 = arith.addi %in, %in_6 : i32
+          linalg.yield %21 : i32
+        }
+        amdaie.end
+      }
+      %13 = amdaie.logicalobjectfifo.from_memref %alloc_3, {%tile_4} : memref<1x1x8x8x4x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>
+      %14 = amdaie.dma_cpy_nd(%13[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1], %0[1, %dyn_offset, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<2x2x32x32xi32, 1 : i32>>)  // Source offset at dim 1 is the function argument %dyn_offset, i.e. not a constant.
+      %15 = amdaie.core(%tile_4, in : [%7, %8, %14], out : [%11]) {
+        %16 = amdaie.logicalobjectfifo.access(%arg0, Read) : !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>> -> memref<1x1x4x8x4x8xi32, 2 : i32>
+        %17 = amdaie.logicalobjectfifo.access(%arg1, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xi32, 2 : i32>> -> memref<1x1x8x4x8x4xi32, 2 : i32>
+        %18 = amdaie.logicalobjectfifo.access(%arg2, None) : !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>> -> memref<1x1x8x8x4x4xi32, 2 : i32>
+        linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%16, %17 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%18 : memref<1x1x8x8x4x4xi32, 2 : i32>) {
+        ^bb0(%in: i32, %in_6: i32, %out: i32):
+          %21 = arith.muli %in, %in_6 : i32
+          %22 = arith.addi %out, %21 : i32
+          linalg.yield %22 : i32
+        }
+        %19 = amdaie.logicalobjectfifo.access(%arg2, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>> -> memref<1x1x8x8x4x4xi32, 2 : i32>
+        %20 = amdaie.logicalobjectfifo.access(%arg2, Write) : !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>> -> memref<1x1x8x8x4x4xi32, 2 : i32>
+        linalg.generic {indexing_maps = [#map4, #map4, #map4], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%18, %19 : memref<1x1x8x8x4x4xi32, 2 : i32>, memref<1x1x8x8x4x4xi32, 2 : i32>) outs(%20 : memref<1x1x8x8x4x4xi32, 2 : i32>) {
+        ^bb0(%in: i32, %in_6: i32, %out: i32):
+          %21 = arith.addi %in, %in_6 : i32
+          linalg.yield %21 : i32
+        }
+        amdaie.end
+      }
+    } {mapping = [#gpu.block<y>, #gpu.block<x>]}
+    memref.dealloc %alloc : memref<2x1x32x32xi32, 1 : i32>
+    memref.dealloc %alloc_3 : memref<1x1x8x8x4x4xi32, 2 : i32>
+    memref.dealloc %alloc_0 : memref<1x2x32x32xi32, 1 : i32>
+    memref.dealloc %alloc_1 : memref<2x2x32x32xi32, 1 : i32>
+    memref.dealloc %alloc_2 : memref<128x128xi32>
+    return
+  }
+}
+
+// -----
+
+// The L3 source offset at a given dim is computed as: L2 source offset * L2 target size at that dim.
+// In this test the L2 target size is not a constant, so this computation cannot be performed.
+
+// CHECK-LABEL: @non_constant_target_l2_size
+//   CHECK-NOT:     memref<1x1x32x32xi32, 1 : i32>
+#map = affine_map<(d0) -> (d0 * 64)>
+#map1 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>
+#map2 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>
+#map3 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>
+#map4 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>
+module {
+  func.func @non_constant_target_l2_size(%dyn_size: index, %arg0: !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>, %arg1: !amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xi32, 2 : i32>>, %arg2: !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>, %arg3: !amdaie.logicalobjectfifo<memref<2x2x32x32xi32, 1 : i32>>) {
+    %c3 = arith.constant 3 : index
+    %c2 = arith.constant 2 : index
+    %c1 = arith.constant 1 : index
+    %c0 = arith.constant 0 : index
+    %alloc = memref.alloc() : memref<2x1x32x32xi32, 1 : i32>
+    %alloc_0 = memref.alloc() : memref<1x2x32x32xi32, 1 : i32>
+    %alloc_1 = memref.alloc() : memref<2x2x32x32xi32, 1 : i32>
+    %alloc_2 = memref.alloc() : memref<128x128xi32>
+    %alloc_3 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
+    %tile = amdaie.tile(%c1, %c3)
+    %tile_4 = amdaie.tile(%c0, %c2)
+    %0 = amdaie.logicalobjectfifo.from_memref %alloc_1, {%tile} : memref<2x2x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x2x32x32xi32, 1 : i32>>
+    %1 = amdaie.logicalobjectfifo.from_memref %alloc_2, {%tile} : memref<128x128xi32> -> !amdaie.logicalobjectfifo<memref<128x128xi32>>
+    scf.forall (%arg4, %arg5) in (2, 2) {
+      %2 = affine.apply #map(%arg5)
+      %3 = affine.apply #map(%arg4)
+      %4 = amdaie.dma_cpy_nd(%0[0, 0, 0, 0] [2, 2, %dyn_size, 32] [2048, 1024, 32, 1], %1[0, 0, %3, %2] [2, 2, 32, 32] [4096, 32, 128, 1]) : (!amdaie.logicalobjectfifo<memref<2x2x32x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<128x128xi32>>)  // Target size at dim 2 is the function argument %dyn_size, i.e. not a constant.
+      %tile_5 = amdaie.tile(%c1, %c3)
+      %5 = amdaie.logicalobjectfifo.from_memref %alloc, {%tile} : memref<2x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x1x32x32xi32, 1 : i32>>
+      %6 = amdaie.logicalobjectfifo.from_memref %alloc_0, {%tile} : memref<1x2x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x2x32x32xi32, 1 : i32>>
+      %7 = amdaie.dma_cpy_nd(%arg0[0, 0, 0, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1024, 1024, 256, 32, 8, 1], %5[1, 0, 0, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1024, 1024, 8, 128, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<2x1x32x32xi32, 1 : i32>>)
+      %8 = amdaie.dma_cpy_nd(%arg1[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [1024, 1024, 128, 32, 4, 1], %6[0, 1, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [2048, 1024, 4, 256, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<1x2x32x32xi32, 1 : i32>>)
+      %9 = amdaie.logicalobjectfifo.from_memref %alloc_3, {%tile} : memref<1x1x8x8x4x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>
+      %10 = amdaie.dma_cpy_nd(%9[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1], %0[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<2x2x32x32xi32, 1 : i32>>)
+      %11 = amdaie.dma_cpy_nd(%arg3[1, 1, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1], %arg2[0, 0, 0, 0] [8, 4, 8, 4] [16, 4, 128, 1]) : (!amdaie.logicalobjectfifo<memref<2x2x32x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>)
+      %12 = amdaie.core(%tile_5, in : [%7, %8, %10], out : [%11]) {
+        %16 = amdaie.logicalobjectfifo.access(%arg0, Read) : !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>> -> memref<1x1x4x8x4x8xi32, 2 : i32>
+        %17 = amdaie.logicalobjectfifo.access(%arg1, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xi32, 2 : i32>> -> memref<1x1x8x4x8x4xi32, 2 : i32>
+        %18 = amdaie.logicalobjectfifo.access(%arg2, None) : !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>> -> memref<1x1x8x8x4x4xi32, 2 : i32>
+        linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%16, %17 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%18 : memref<1x1x8x8x4x4xi32, 2 : i32>) {
+        ^bb0(%in: i32, %in_6: i32, %out: i32):
+          %21 = arith.muli %in, %in_6 : i32
+          %22 = arith.addi %out, %21 : i32
+          linalg.yield %22 : i32
+        }
+        %19 = amdaie.logicalobjectfifo.access(%arg2, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>> -> memref<1x1x8x8x4x4xi32, 2 : i32>
+        %20 = amdaie.logicalobjectfifo.access(%arg2, Write) : !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>> -> memref<1x1x8x8x4x4xi32, 2 : i32>
+        linalg.generic {indexing_maps = [#map4, #map4, #map4], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%18, %19 : memref<1x1x8x8x4x4xi32, 2 : i32>, memref<1x1x8x8x4x4xi32, 2 : i32>) outs(%20 : memref<1x1x8x8x4x4xi32, 2 : i32>) {
+        ^bb0(%in: i32, %in_6: i32, %out: i32):
+          %21 = arith.addi %in, %in_6 : i32
+          linalg.yield %21 : i32
+        }
+        amdaie.end
+      }
+      %13 = amdaie.logicalobjectfifo.from_memref %alloc_3, {%tile_4} : memref<1x1x8x8x4x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>
+      %14 = amdaie.dma_cpy_nd(%13[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1], %0[1, 1, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<2x2x32x32xi32, 1 : i32>>)
+      %15 = amdaie.core(%tile_4, in : [%7, %8, %14], out : [%11]) {
+        %16 = amdaie.logicalobjectfifo.access(%arg0, Read) : !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>> -> memref<1x1x4x8x4x8xi32, 2 : i32>
+        %17 = amdaie.logicalobjectfifo.access(%arg1, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xi32, 2 : i32>> -> memref<1x1x8x4x8x4xi32, 2 : i32>
+        %18 = amdaie.logicalobjectfifo.access(%arg2, None) : !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>> -> memref<1x1x8x8x4x4xi32, 2 : i32>
+        linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%16, %17 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%18 : memref<1x1x8x8x4x4xi32, 2 : i32>) {
+        ^bb0(%in: i32, %in_6: i32, %out: i32):
+          %21 = arith.muli %in, %in_6 : i32
+          %22 = arith.addi %out, %21 : i32
+          linalg.yield %22 : i32
+        }
+        %19 = amdaie.logicalobjectfifo.access(%arg2, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>> -> memref<1x1x8x8x4x4xi32, 2 : i32>
+        %20 = amdaie.logicalobjectfifo.access(%arg2, Write) : !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>> -> memref<1x1x8x8x4x4xi32, 2 : i32>
+        linalg.generic {indexing_maps = [#map4, #map4, #map4], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%18, %19 : memref<1x1x8x8x4x4xi32, 2 : i32>, memref<1x1x8x8x4x4xi32, 2 : i32>) outs(%20 : memref<1x1x8x8x4x4xi32, 2 : i32>) {
+        ^bb0(%in: i32, %in_6: i32, %out: i32):
+          %21 = arith.addi %in, %in_6 : i32
+          linalg.yield %21 : i32
+        }
+        amdaie.end
+      }
+    } {mapping = [#gpu.block<y>, #gpu.block<x>]}
+    memref.dealloc %alloc : memref<2x1x32x32xi32, 1 : i32>
+    memref.dealloc %alloc_3 : memref<1x1x8x8x4x4xi32, 2 : i32>
+    memref.dealloc %alloc_0 : memref<1x2x32x32xi32, 1 : i32>
+    memref.dealloc %alloc_1 : memref<2x2x32x32xi32, 1 : i32>
+    memref.dealloc %alloc_2 : memref<128x128xi32>
+    return
+  }
+}
+
+// -----
+
+#map = affine_map<(d0) -> (d0 * 64)>
+#map1 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>
+#map2 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>
+#map3 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>
+#map4 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>
+module {  // NOTE(review): the function name below reads "unhadled"; presumably "unhandled" was intended.
+  func.func @unhadled_offset_expression(%arg0: !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>, %arg1: !amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xi32, 2 : i32>>, %arg2: !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>, %arg3: !amdaie.logicalobjectfifo<memref<2x2x32x32xi32, 1 : i32>>) {
+    %c3 = arith.constant 3 : index
+    %c2 = arith.constant 2 : index
+    %c1 = arith.constant 1 : index
+    %c0 = arith.constant 0 : index
+    %2 = arith.addi %c0, %c2 : index  // arith.addi result; used below as the source offset of %4 at dim 3.
+    %3 = arith.addi %c0, %c3 : index  // arith.addi result; used below as the source offset of %4 at dim 2, the dim named in the diagnostic on %4.
+    %alloc = memref.alloc() : memref<2x1x32x32xi32, 1 : i32>
+    %alloc_0 = memref.alloc() : memref<1x2x32x32xi32, 1 : i32>
+    %alloc_1 = memref.alloc() : memref<2x2x32x32xi32, 1 : i32>
+    %alloc_2 = memref.alloc() : memref<128x128xi32>
+    %alloc_3 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
+    %tile = amdaie.tile(%c1, %c3)
+    %tile_4 = amdaie.tile(%c0, %c2)
+    %0 = amdaie.logicalobjectfifo.from_memref %alloc_1, {%tile} : memref<2x2x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x2x32x32xi32, 1 : i32>>
+    %1 = amdaie.logicalobjectfifo.from_memref %alloc_2, {%tile} : memref<128x128xi32> -> !amdaie.logicalobjectfifo<memref<128x128xi32>>
+    scf.forall (%arg4, %arg5) in (2, 2) {
+      // expected-error @below {{Unhandled expression for source offset at dim 2}}
+      %4 = amdaie.dma_cpy_nd(%0[0, 0, 0, 0] [2, 2, 32, 32] [2048, 1024, 32, 1], %1[0, 0, %3, %2] [2, 2, 32, 32] [4096, 32, 128, 1]) : (!amdaie.logicalobjectfifo<memref<2x2x32x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<128x128xi32>>)
+      %tile_5 = amdaie.tile(%c1, %c3)
+      %5 = amdaie.logicalobjectfifo.from_memref %alloc, {%tile} : memref<2x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x1x32x32xi32, 1 : i32>>
+      %6 = amdaie.logicalobjectfifo.from_memref %alloc_0, {%tile} : memref<1x2x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x2x32x32xi32, 1 : i32>>
+      %7 = amdaie.dma_cpy_nd(%arg0[0, 0, 0, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1024, 1024, 256, 32, 8, 1], %5[1, 0, 0, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1024, 1024, 8, 128, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<2x1x32x32xi32, 1 : i32>>)
+      %8 = amdaie.dma_cpy_nd(%arg1[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [1024, 1024, 128, 32, 4, 1], %6[0, 1, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [2048, 1024, 4, 256, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<1x2x32x32xi32, 1 : i32>>)
+      %9 = amdaie.logicalobjectfifo.from_memref %alloc_3, {%tile} : memref<1x1x8x8x4x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>
+      %10 = amdaie.dma_cpy_nd(%9[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1], %0[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<2x2x32x32xi32, 1 : i32>>)
+      %11 = amdaie.dma_cpy_nd(%arg3[1, 1, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1], %arg2[0, 0, 0, 0] [8, 4, 8, 4] [16, 4, 128, 1]) : (!amdaie.logicalobjectfifo<memref<2x2x32x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>)
+      %12 = amdaie.core(%tile_5, in : [%7, %8, %10], out : [%11]) {
+        %16 = amdaie.logicalobjectfifo.access(%arg0, Read) : !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>> -> memref<1x1x4x8x4x8xi32, 2 : i32>
+        %17 = amdaie.logicalobjectfifo.access(%arg1, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xi32, 2 : i32>> -> memref<1x1x8x4x8x4xi32, 2 : i32>
+        %18 = amdaie.logicalobjectfifo.access(%arg2, None) : !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>> -> memref<1x1x8x8x4x4xi32, 2 : i32>
+        linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%16, %17 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%18 : memref<1x1x8x8x4x4xi32, 2 : i32>) {
+        ^bb0(%in: i32, %in_6: i32, %out: i32):
+          %21 = arith.muli %in, %in_6 : i32
+          %22 = arith.addi %out, %21 : i32
+          linalg.yield %22 : i32
+        }
+        %19 = amdaie.logicalobjectfifo.access(%arg2, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>> -> memref<1x1x8x8x4x4xi32, 2 : i32>
+        %20 = amdaie.logicalobjectfifo.access(%arg2, Write) : !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>> -> memref<1x1x8x8x4x4xi32, 2 : i32>
+        linalg.generic {indexing_maps = [#map4, #map4, #map4], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%18, %19 : memref<1x1x8x8x4x4xi32, 2 : i32>, memref<1x1x8x8x4x4xi32, 2 : i32>) outs(%20 : memref<1x1x8x8x4x4xi32, 2 : i32>) {
+        ^bb0(%in: i32, %in_6: i32, %out: i32):
+          %21 = arith.addi %in, %in_6 : i32
+          linalg.yield %21 : i32
+        }
+        amdaie.end
+      }
+      %13 = amdaie.logicalobjectfifo.from_memref %alloc_3, {%tile_4} : memref<1x1x8x8x4x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>
+      %14 = amdaie.dma_cpy_nd(%13[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1], %0[1, 1, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<2x2x32x32xi32, 1 : i32>>)
+      %15 = amdaie.core(%tile_4, in : [%7, %8, %14], out : [%11]) {
+        %16 = amdaie.logicalobjectfifo.access(%arg0, Read) : !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>> -> memref<1x1x4x8x4x8xi32, 2 : i32>
+        %17 = amdaie.logicalobjectfifo.access(%arg1, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xi32, 2 : i32>> -> memref<1x1x8x4x8x4xi32, 2 : i32>
+        %18 = amdaie.logicalobjectfifo.access(%arg2, None) : !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>> -> memref<1x1x8x8x4x4xi32, 2 : i32>
+        linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%16, %17 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%18 : memref<1x1x8x8x4x4xi32, 2 : i32>) {
+        ^bb0(%in: i32, %in_6: i32, %out: i32):
+          %21 = arith.muli %in, %in_6 : i32
+          %22 = arith.addi %out, %21 : i32
+          linalg.yield %22 : i32
+        }
+        %19 = amdaie.logicalobjectfifo.access(%arg2, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>> -> memref<1x1x8x8x4x4xi32, 2 : i32>
+        %20 = amdaie.logicalobjectfifo.access(%arg2, Write) : !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>> -> memref<1x1x8x8x4x4xi32, 2 : i32>
+        linalg.generic {indexing_maps = [#map4, #map4, #map4], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%18, %19 : memref<1x1x8x8x4x4xi32, 2 : i32>, memref<1x1x8x8x4x4xi32, 2 : i32>) outs(%20 : memref<1x1x8x8x4x4xi32, 2 : i32>) {
+        ^bb0(%in: i32, %in_6: i32, %out: i32):
+          %21 = arith.addi %in, %in_6 : i32
+          linalg.yield %21 : i32
+        }
+        amdaie.end
+      }
+    } {mapping = [#gpu.block<y>, #gpu.block<x>]}
+    memref.dealloc %alloc : memref<2x1x32x32xi32, 1 : i32>
+    memref.dealloc %alloc_3 : memref<1x1x8x8x4x4xi32, 2 : i32>
+    memref.dealloc %alloc_0 : memref<1x2x32x32xi32, 1 : i32>
+    memref.dealloc %alloc_1 : memref<2x2x32x32xi32, 1 : i32>
+    memref.dealloc %alloc_2 : memref<128x128xi32>
+    return
+  }
+}