From f9526c92f7201a08a427a4c2112c918718e99e1a Mon Sep 17 00:00:00 2001
From: Jorn Tuyls <jtuyls@users.noreply.github.com>
Date: Wed, 18 Sep 2024 15:59:44 +0200
Subject: [PATCH] Delete StatefulTransform and move logic into AMDAIE passes
 (#784)

This PR removes `AMDAIEObjectFifoStatefulTransform` and
`AMDAIEAssignLockIDs` by moving their logic into `AMDAIEBufferization`,
`AMDAIEAcquireReleaseToUseLock` and `AMDAIELowerToAIE`. This gets rid of
an intermediate layer of logic operating on `aie.objectfifo` by going
from logical objectFifos directly to `aie.buffer`, which reduces the
overall amount of code and complexity needed to get to the same result.
---
 .../AMD-AIE/aie/AMDAIEAssignLockIDs.cpp       |  118 --
 .../aie/AMDAIEObjectFifoStatefulTransform.cpp |  797 ----------
 .../plugins/target/AMD-AIE/aie/CMakeLists.txt |    2 -
 compiler/plugins/target/AMD-AIE/aie/Passes.h  |    2 -
 .../aie/test/AIE2_cyclostatic_dma.mlir        |  181 ---
 .../AMD-AIE/aie/test/AIE2_cyclostatic_l1.mlir |  182 ---
 .../AMD-AIE/aie/test/AIE2_cyclostatic_l2.mlir |  244 ---
 .../aie/test/AIE2_delayed_release.mlir        |  125 --
 .../AMD-AIE/aie/test/assign-lockIDs.mlir      |  129 --
 .../AMD-AIE/aie/test/base_test_AIE1.mlir      |  123 --
 .../AMD-AIE/aie/test/base_test_AIE2.mlir      |  123 --
 .../AMD-AIE/aie/test/broadcast_test.mlir      |  374 -----
 .../AMD-AIE/aie/test/link_test_AIE1.mlir      |   80 -
 .../AMD-AIE/aie/test/link_test_DDR_to_L1.mlir |   80 -
 .../AMD-AIE/aie/test/link_test_L1_to_DDR.mlir |   81 -
 .../AMD-AIE/aie/test/link_test_broadcast.mlir |  136 --
 .../aie/test/link_test_distribute.mlir        |  155 --
 .../AMD-AIE/aie/test/link_test_join.mlir      |  191 ---
 .../target/AMD-AIE/aie/test/matmul_test.mlir  |  188 ---
 .../target/AMD-AIE/aie/test/memTile_test.mlir |   55 -
 .../AMD-AIE/aie/test/nd_dma_base_AIE2.mlir    |  126 --
 .../aie/test/nd_dma_distribute_AIE2.mlir      |  123 --
 .../test/nd_dma_multiple_consumers_AIE2.mlir  |  201 ---
 .../AMD-AIE/aie/test/nested_loop_test.mlir    |  365 -----
 .../aie/test/non_adjacency_test_1.mlir        |  125 --
 .../aie/test/non_adjacency_test_2.mlir        |  139 --
 .../aie/test/non_adjacency_test_AIE2.mlir     |  122 --
 .../test/register_external_buffers_test.mlir  |   75 -
 .../same_core_producer_consumer_test.mlir     |  103 --
 .../AMD-AIE/aie/test/shimRow_mem_test.mlir    |   75 -
 .../AMD-AIE/aie/test/shim_AIE2_test.mlir      |   68 -
 .../AMD-AIE/aie/test/shim_broadcast_test.mlir |   88 --
 .../AMD-AIE/aie/test/subview_test_1.mlir      |  132 --
 .../AMD-AIE/iree-amd-aie/IR/AMDAIEAttrs.td    |   11 +
 .../AMD-AIE/iree-amd-aie/IR/AMDAIEOps.cpp     |   10 +-
 .../AMD-AIE/iree-amd-aie/IR/AMDAIEOps.td      |   40 +-
 .../iree-amd-aie/PluginRegistration.cpp       |    2 -
 .../AMDAIEAcquireReleaseToUseLock.cpp         |  234 +++
 .../Transforms/AMDAIECoreLoopUnroll.cpp       |   84 --
 .../Transforms/AMDAIELowerToAIE.cpp           | 1000 ++++++------
 .../Transforms/AMDAIELowerToAIE.h             |  132 ++
 .../iree-amd-aie/Transforms/CMakeLists.txt    |    3 +-
 .../iree-amd-aie/Transforms/PassDetail.h      |    2 +-
 .../iree-amd-aie/Transforms/Passes.cpp        |    9 +-
 .../AMD-AIE/iree-amd-aie/Transforms/Passes.h  |    8 +-
 .../AMD-AIE/iree-amd-aie/Transforms/Passes.td |   13 +-
 .../Transforms/test/CMakeLists.txt            |    2 +-
 .../test/acquire_release_to_use_lock.mlir     |  214 +++
 .../Transforms/test/core_loop_unroll.mlir     |  181 ---
 .../Transforms/test/lower_to_aie.mlir         | 1335 +++++++++++------
 .../aie_runtime/Utils/CMakeLists.txt          |    1 +
 51 files changed, 2092 insertions(+), 6297 deletions(-)
 delete mode 100644 compiler/plugins/target/AMD-AIE/aie/AMDAIEAssignLockIDs.cpp
 delete mode 100644 compiler/plugins/target/AMD-AIE/aie/AMDAIEObjectFifoStatefulTransform.cpp
 delete mode 100644 compiler/plugins/target/AMD-AIE/aie/test/AIE2_cyclostatic_dma.mlir
 delete mode 100644 compiler/plugins/target/AMD-AIE/aie/test/AIE2_cyclostatic_l1.mlir
 delete mode 100644 compiler/plugins/target/AMD-AIE/aie/test/AIE2_cyclostatic_l2.mlir
 delete mode 100644 compiler/plugins/target/AMD-AIE/aie/test/AIE2_delayed_release.mlir
 delete mode 100644 compiler/plugins/target/AMD-AIE/aie/test/assign-lockIDs.mlir
 delete mode 100644 compiler/plugins/target/AMD-AIE/aie/test/base_test_AIE1.mlir
 delete mode 100644 compiler/plugins/target/AMD-AIE/aie/test/base_test_AIE2.mlir
 delete mode 100644 compiler/plugins/target/AMD-AIE/aie/test/broadcast_test.mlir
 delete mode 100644 compiler/plugins/target/AMD-AIE/aie/test/link_test_AIE1.mlir
 delete mode 100644 compiler/plugins/target/AMD-AIE/aie/test/link_test_DDR_to_L1.mlir
 delete mode 100644 compiler/plugins/target/AMD-AIE/aie/test/link_test_L1_to_DDR.mlir
 delete mode 100644 compiler/plugins/target/AMD-AIE/aie/test/link_test_broadcast.mlir
 delete mode 100644 compiler/plugins/target/AMD-AIE/aie/test/link_test_distribute.mlir
 delete mode 100644 compiler/plugins/target/AMD-AIE/aie/test/link_test_join.mlir
 delete mode 100644 compiler/plugins/target/AMD-AIE/aie/test/matmul_test.mlir
 delete mode 100644 compiler/plugins/target/AMD-AIE/aie/test/memTile_test.mlir
 delete mode 100644 compiler/plugins/target/AMD-AIE/aie/test/nd_dma_base_AIE2.mlir
 delete mode 100644 compiler/plugins/target/AMD-AIE/aie/test/nd_dma_distribute_AIE2.mlir
 delete mode 100644 compiler/plugins/target/AMD-AIE/aie/test/nd_dma_multiple_consumers_AIE2.mlir
 delete mode 100644 compiler/plugins/target/AMD-AIE/aie/test/nested_loop_test.mlir
 delete mode 100644 compiler/plugins/target/AMD-AIE/aie/test/non_adjacency_test_1.mlir
 delete mode 100644 compiler/plugins/target/AMD-AIE/aie/test/non_adjacency_test_2.mlir
 delete mode 100644 compiler/plugins/target/AMD-AIE/aie/test/non_adjacency_test_AIE2.mlir
 delete mode 100644 compiler/plugins/target/AMD-AIE/aie/test/register_external_buffers_test.mlir
 delete mode 100644 compiler/plugins/target/AMD-AIE/aie/test/same_core_producer_consumer_test.mlir
 delete mode 100644 compiler/plugins/target/AMD-AIE/aie/test/shimRow_mem_test.mlir
 delete mode 100644 compiler/plugins/target/AMD-AIE/aie/test/shim_AIE2_test.mlir
 delete mode 100644 compiler/plugins/target/AMD-AIE/aie/test/shim_broadcast_test.mlir
 delete mode 100644 compiler/plugins/target/AMD-AIE/aie/test/subview_test_1.mlir
 create mode 100644 compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEAcquireReleaseToUseLock.cpp
 delete mode 100644 compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIECoreLoopUnroll.cpp
 create mode 100644 compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELowerToAIE.h
 create mode 100644 compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/acquire_release_to_use_lock.mlir
 delete mode 100644 compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/core_loop_unroll.mlir

diff --git a/compiler/plugins/target/AMD-AIE/aie/AMDAIEAssignLockIDs.cpp b/compiler/plugins/target/AMD-AIE/aie/AMDAIEAssignLockIDs.cpp
deleted file mode 100644
index 3d6efe6fc..000000000
--- a/compiler/plugins/target/AMD-AIE/aie/AMDAIEAssignLockIDs.cpp
+++ /dev/null
@@ -1,118 +0,0 @@
-// Copyright 2024 The IREE Authors
-//
-// Licensed under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-
-// This pass aims to assign lockIDs to AIE.lock operations. The lockID is
-// numbered from the most recent AIE.lock within the same tile. If the lockID
-// exceeds the number of locks on the tile, the pass generates an error and
-// terminates. AIE.lock operations for different tiles are numbered
-// independently. If there are existing lock IDs, this pass is idempotent
-// and only assigns lock IDs to locks without an ID.
-
-#include "AIEDialect.h"
-#include "Passes.h"
-#include "iree-amd-aie/aie_runtime/iree_aie_runtime.h"
-#include "llvm/ADT/DenseMap.h"
-#include "mlir/Pass/Pass.h"
-
-#define DEBUG_TYPE "amdaie-assign-lock-ids"
-
-using namespace mlir;
-using namespace xilinx;
-using namespace xilinx::AIE;
-
-namespace mlir::iree_compiler::AMDAIE {
-struct AMDAIEAssignLockIDsPass : mlir::OperationPass<DeviceOp> {
-  MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(AMDAIEAssignLockIDsPass)
-
-  AMDAIEAssignLockIDsPass() : mlir::OperationPass<DeviceOp>(resolveTypeID()) {}
-
-  llvm::StringRef getArgument() const override {
-    return "amdaie-assign-lock-ids";
-  }
-
-  llvm::StringRef getName() const override { return "AMDAIEAssignLockIDsPass"; }
-
-  std::unique_ptr<mlir::Pass> clonePass() const override {
-    return std::make_unique<AMDAIEAssignLockIDsPass>(
-        *static_cast<const AMDAIEAssignLockIDsPass *>(this));
-  }
-
-  void runOnOperation() override {
-    DeviceOp device = getOperation();
-    OpBuilder rewriter = OpBuilder::atBlockEnd(device.getBody());
-
-    // All of the lock ops on a tile, separated into ops which have been
-    // assigned to a lock, and ops which have not.
-    struct TileLockOps {
-      DenseSet<int> assigned;
-      SmallVector<LockOp> unassigned;
-    };
-
-    DenseMap<TileOp, TileLockOps> tileToLocks;
-
-    // Construct data structure storing locks by tile.
-    device.walk<WalkOrder::PreOrder>([&](LockOp lockOp) {
-      TileOp tileOp = xilinx::AIE::getTileOp(*lockOp);
-      if (lockOp.getLockID().has_value()) {
-        auto lockID = lockOp.getLockID().value();
-        auto iter = tileToLocks.find(tileOp);
-        if (iter == tileToLocks.end())
-          tileToLocks.insert({tileOp, {{lockID}, /* unassigned = */ {}}});
-        else {
-          if (iter->second.assigned.find(lockID) !=
-              iter->second.assigned.end()) {
-            auto diag = lockOp->emitOpError("is assigned to the same lock (")
-                        << lockID << ") as another op.";
-            diag.attachNote(tileOp.getLoc())
-                << "tile has lock ops assigned to same lock.";
-            return signalPassFailure();
-          }
-          iter->second.assigned.insert(lockID);
-        }
-      } else {
-        auto iter = tileToLocks.find(tileOp);
-        if (iter == tileToLocks.end())
-          tileToLocks.insert({tileOp, {/* assigned = */ {}, {lockOp}}});
-        else
-          iter->second.unassigned.push_back(lockOp);
-      }
-    });
-
-    AMDAIEDeviceModel deviceModel = mlir::iree_compiler::AMDAIE::getDeviceModel(
-        static_cast<AMDAIEDevice>(device.getDevice()));
-    // IR mutation: assign locks to all unassigned lock ops.
-    for (auto [tileOp, locks] : tileToLocks) {
-      uint32_t locksPerTile =
-          deviceModel.getNumLocks(tileOp.getCol(), tileOp.getRow());
-      uint32_t nextID = 0;
-      for (auto lockOp : locks.unassigned) {
-        while (nextID < locksPerTile &&
-               (locks.assigned.find(nextID) != locks.assigned.end())) {
-          ++nextID;
-        }
-        if (nextID == locksPerTile) {
-          mlir::InFlightDiagnostic diag =
-              lockOp->emitOpError("not allocated a lock.");
-          diag.attachNote(tileOp.getLoc()) << "because only " << locksPerTile
-                                           << " locks available in this tile.";
-          return signalPassFailure();
-        }
-        lockOp.setLockIDAttr(rewriter.getI8IntegerAttr(nextID));
-        ++nextID;
-      }
-    }
-  }
-};
-std::unique_ptr<OperationPass<DeviceOp>> createAMDAIEAssignLockIDsPass() {
-  return std::make_unique<AMDAIEAssignLockIDsPass>();
-}
-
-void registerAMDAIEAssignLockIDs() {
-  mlir::registerPass([]() -> std::unique_ptr<mlir::Pass> {
-    return createAMDAIEAssignLockIDsPass();
-  });
-}
-}  // namespace mlir::iree_compiler::AMDAIE
diff --git a/compiler/plugins/target/AMD-AIE/aie/AMDAIEObjectFifoStatefulTransform.cpp b/compiler/plugins/target/AMD-AIE/aie/AMDAIEObjectFifoStatefulTransform.cpp
deleted file mode 100644
index 5b4d3e6e3..000000000
--- a/compiler/plugins/target/AMD-AIE/aie/AMDAIEObjectFifoStatefulTransform.cpp
+++ /dev/null
@@ -1,797 +0,0 @@
-// Copyright 2024 The IREE Authors
-//
-// Licensed under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-
-#include "AIEDialect.h"
-#include "Passes.h"
-#include "iree-amd-aie/aie_runtime/iree_aie_runtime.h"
-#include "llvm/ADT/SetVector.h"
-#include "mlir/IR/PatternMatch.h"
-#include "mlir/Pass/Pass.h"
-
-#define DEBUG_TYPE "amdaie-objectFifo-stateful-transform"
-
-using namespace mlir;
-using namespace mlir::iree_compiler::AMDAIE;
-
-using xilinx::AIE::AIEObjectFifoType;
-using xilinx::AIE::BDDimLayoutArrayAttr;
-using xilinx::AIE::BufferOp;
-using xilinx::AIE::CoreOp;
-using xilinx::AIE::DeviceOp;
-using xilinx::AIE::DMABDOp;
-using xilinx::AIE::DMAStartOp;
-using xilinx::AIE::EndOp;
-using xilinx::AIE::FlowOp;
-using xilinx::AIE::LockAction;
-using xilinx::AIE::LockOp;
-using xilinx::AIE::MemOp;
-using xilinx::AIE::MemTileDMAOp;
-using xilinx::AIE::NextBDOp;
-using xilinx::AIE::ObjectFifoAcquireOp;
-using xilinx::AIE::ObjectFifoCreateOp;
-using xilinx::AIE::ObjectFifoLinkOp;
-using xilinx::AIE::ObjectFifoPort;
-using xilinx::AIE::ObjectFifoReleaseOp;
-using xilinx::AIE::ObjectFifoSubviewAccessOp;
-using xilinx::AIE::ShimDMAAllocationOp;
-using xilinx::AIE::TileOp;
-using xilinx::AIE::UseLockOp;
-
-namespace {
-
-struct LockResources {
-  // Reference to the producer and consumer lock ops created for this resource.
-  std::pair<LockOp, LockOp> locks;
-  // The acquire and release values to be used for producer and consumer locks
-  // for this resource.
-  std::pair<uint8_t, uint8_t> locksAcqRel;
-  LockResources() {}
-  LockResources(const std::pair<LockOp, LockOp> &locks,
-                const std::pair<uint8_t, uint8_t> &locksAcqRel)
-      : locks(locks), locksAcqRel(locksAcqRel) {}
-};
-
-struct ObjectFifoEndpointResource {
-  // The buffers used for this objectFifo endpoint (multiple: double buffering).
-  SmallVector<BufferOp> buffers;
-  // The lock resources used for this objectFifo endpoint.
-  LockResources lockResources;
-  ObjectFifoEndpointResource() {}
-  ObjectFifoEndpointResource(const SmallVector<BufferOp> &buffers,
-                             LockResources &&lockResources)
-      : buffers(buffers), lockResources(std::move(lockResources)) {}
-};
-
-struct ObjectFifoResources {
-  // Offset on the producer's side of the objectFifo.
-  uint32_t producerOffset{0};
-  ObjectFifoEndpointResource producerResource;
-  // Offset on the consumers' side of the objectFifo.
-  uint32_t consumersOffset{0};
-  DenseMap<TileOp, ObjectFifoEndpointResource> consumerResources;
-  ObjectFifoResources() {}
-  ObjectFifoResources(uint32_t producerOffset, uint32_t consumersOffset)
-      : producerOffset(producerOffset), consumersOffset(consumersOffset) {}
-};
-
-SmallVector<ObjectFifoCreateOp> getInputObjectFifos(ObjectFifoLinkOp &op) {
-  SmallVector<ObjectFifoCreateOp> inputObjFifos;
-  Operation *parent = op.getOperation();
-  while ((parent = parent->getParentOp())) {
-    if (parent->hasTrait<OpTrait::SymbolTable>()) {
-      for (auto sym : op.getFifoIns()) {
-        auto name = dyn_cast<FlatSymbolRefAttr>(sym);
-        if (auto *st = SymbolTable::lookupSymbolIn(parent, name);
-            isa_and_nonnull<ObjectFifoCreateOp>(st))
-          inputObjFifos.push_back(dyn_cast<ObjectFifoCreateOp>(st));
-      }
-    }
-  }
-  return inputObjFifos;
-}
-
-SmallVector<ObjectFifoCreateOp> getOutputObjectFifos(ObjectFifoLinkOp &op) {
-  SmallVector<ObjectFifoCreateOp> outputObjFifos;
-  Operation *parent = op.getOperation();
-  while ((parent = parent->getParentOp())) {
-    if (parent->hasTrait<OpTrait::SymbolTable>()) {
-      for (auto sym : op.getFifoOuts()) {
-        auto name = dyn_cast<FlatSymbolRefAttr>(sym);
-        if (auto *st = SymbolTable::lookupSymbolIn(parent, name);
-            isa_and_nonnull<ObjectFifoCreateOp>(st))
-          outputObjFifos.push_back(dyn_cast<ObjectFifoCreateOp>(st));
-      }
-    }
-  }
-  return outputObjFifos;
-}
-
-int objFifoSize(ObjectFifoCreateOp op, int index = 0) {
-  if (llvm::isa<mlir::ArrayAttr>(op.getElemNumber())) {
-    return llvm::dyn_cast<mlir::IntegerAttr>(
-               llvm::dyn_cast<mlir::ArrayAttr>(op.getElemNumber())[index])
-        .getInt();
-  } else {
-    return llvm::dyn_cast<mlir::IntegerAttr>(op.getElemNumber()).getInt();
-  }
-}
-
-template <typename T>
-ObjectFifoCreateOp getObjectFifo(T op) {
-  Operation *parent = op.getOperation();
-  while ((parent = parent->getParentOp())) {
-    if (parent->hasTrait<OpTrait::SymbolTable>()) {
-      if (auto *st = SymbolTable::lookupSymbolIn(parent, op.getObjFifoName());
-          isa_and_nonnull<ObjectFifoCreateOp>(st))
-        return dyn_cast<ObjectFifoCreateOp>(st);
-    }
-  }
-  return {};
-}
-
-bool isJoin(ObjectFifoLinkOp op) {
-  return op.getFifoIns().size() > 1 && op.getFifoOuts().size() == 1;
-}
-
-bool isDistribute(ObjectFifoLinkOp op) {
-  return op.getFifoOuts().size() > 1 && op.getFifoIns().size() == 1;
-}
-
-bool isOneToOne(ObjectFifoLinkOp op) {
-  return op.getFifoIns().size() == 1 && op.getFifoOuts().size() == 1;
-}
-
-/// Retrieve ObjectFifoLinkOp of ObjectFifoCreateOp,
-/// if it belongs to one.
-std::optional<ObjectFifoLinkOp> getOptionalLinkOp(ObjectFifoCreateOp op) {
-  auto device = op->getParentOfType<DeviceOp>();
-  for (ObjectFifoLinkOp linkOp : device.getOps<ObjectFifoLinkOp>()) {
-    for (ObjectFifoCreateOp in : getInputObjectFifos(linkOp))
-      if (in == op) return {linkOp};
-    for (ObjectFifoCreateOp out : getOutputObjectFifos(linkOp))
-      if (out == op) return {linkOp};
-  }
-  return {};
-}
-
-}  // namespace
-
-template <typename MemOp>
-void createDMA(DeviceOp &device, OpBuilder &builder, TileOp tileOp,
-               DMAChannelDir channelDir, int channelIndex,
-               BDDimLayoutArrayAttr dims, size_t acqNum, size_t relNum,
-               int64_t len, int64_t offset,
-               const SmallVector<BufferOp> &bufferOps,
-               const std::pair<LockOp, LockOp> &locks) {
-  OpBuilder::InsertionGuard g(builder);
-  Operation *producer = nullptr;
-  for (auto memOp : device.getOps<MemOp>()) {
-    if (memOp.getTile() == tileOp.getResult()) {
-      producer = memOp.getOperation();
-      break;
-    }
-  }
-
-  // if none exists, create one
-  if (!producer) {
-    if (device->getNumRegions() != 1)
-      llvm::report_fatal_error("expected num regions for device op");
-    OpBuilder::InsertionGuard gg(builder);
-    builder.setInsertionPointToEnd(device.getBody());
-    auto newMemOp = builder.create<MemOp>(builder.getUnknownLoc(), tileOp);
-    {
-      OpBuilder::InsertionGuard ggg(builder);
-      builder.setInsertionPointToStart(&newMemOp.getRegion().emplaceBlock());
-      builder.create<EndOp>(builder.getUnknownLoc());
-    }
-    producer = newMemOp.getOperation();
-  }
-
-  Block &endBlock = producer->getRegion(0).getBlocks().back();
-  assert(!endBlock.getOps<EndOp>().empty() &&
-         "expected last block to have aie.end");
-  Block *lastDmaBlock = endBlock.getSinglePredecessor(),
-        *dmaBlock = builder.createBlock(&endBlock),
-        *bdBlock = builder.createBlock(&endBlock);
-
-  // create DMA channel
-  {
-    OpBuilder::InsertionGuard gg(builder);
-    builder.setInsertionPointToStart(dmaBlock);
-    builder.create<DMAStartOp>(builder.getUnknownLoc(), channelDir,
-                               channelIndex, /*repeatCount*/ 0, bdBlock,
-                               &endBlock);
-  }
-  if (lastDmaBlock) lastDmaBlock->getTerminator()->setSuccessor(dmaBlock, 1);
-
-  auto createBdBlockOps = [&](BufferOp buff, Block *succ) {
-    LockOp acqLock = locks.first, relLock = locks.second;
-    builder.create<UseLockOp>(builder.getUnknownLoc(), acqLock,
-                              LockAction::AcquireGreaterEqual, acqNum);
-    if (!dims.getValue().empty()) {
-      builder.create<DMABDOp>(builder.getUnknownLoc(), buff, offset, len, dims);
-    } else {
-      builder.create<DMABDOp>(builder.getUnknownLoc(), buff, offset, len);
-    }
-    builder.create<UseLockOp>(builder.getUnknownLoc(), relLock,
-                              LockAction::Release, relNum);
-    builder.create<NextBDOp>(builder.getUnknownLoc(), succ);
-  };
-
-  // create Bd blocks
-  Block *succ = nullptr, *curr = bdBlock;
-  for (size_t blockIndex = 0; blockIndex < bufferOps.size(); ++blockIndex) {
-    if (blockIndex == bufferOps.size() - 1) {
-      succ = bdBlock;
-    } else {
-      succ = builder.createBlock(&endBlock);
-    }
-
-    OpBuilder::InsertionGuard gg(builder);
-    builder.setInsertionPointToStart(curr);
-    createBdBlockOps(bufferOps[blockIndex], succ);
-    curr = succ;
-  }
-}
-
-template <typename MemOp>
-void createTileDMA(DeviceOp &device, OpBuilder &builder, TileOp tileOp,
-                   DMAChannelDir channelDir, uint8_t channelIndex, size_t size,
-                   BDDimLayoutArrayAttr dims, uint32_t offset,
-                   const ObjectFifoEndpointResource &endpointResource) {
-  std::pair<LockOp, LockOp> locks = endpointResource.lockResources.locks;
-  uint8_t acqNum = endpointResource.lockResources.locksAcqRel.first;
-  uint8_t relNum = endpointResource.lockResources.locksAcqRel.second;
-  createDMA<MemOp>(device, builder, tileOp, channelDir, channelIndex, dims,
-                   acqNum, relNum, size, offset, endpointResource.buffers,
-                   locks);
-}
-
-LogicalResult createUseLocks(
-    OpBuilder &builder, ObjectFifoCreateOp op, ObjectFifoPort port,
-    size_t numLocks, LockAction lockAction,
-    const ObjectFifoEndpointResource &endpointResource) {
-  if (numLocks == 0) return failure();
-  LockOp lock;
-  if (lockAction == LockAction::AcquireGreaterEqual) {
-    lock = endpointResource.lockResources.locks.second;
-  } else if (lockAction == LockAction::Release) {
-    lock = endpointResource.lockResources.locks.first;
-  } else {
-    return op.emitOpError() << "unsupported lock action on this resource: "
-                            << stringifyEnum(lockAction);
-  }
-  builder.create<UseLockOp>(builder.getUnknownLoc(), lock, lockAction,
-                            numLocks);
-  return success();
-}
-
-LogicalResult replaceReleaseOp(
-    OpBuilder &builder, ObjectFifoReleaseOp releaseOp, TileOp tileOp,
-    const DenseMap<ObjectFifoCreateOp, ObjectFifoResources> &resourceMap) {
-  OpBuilder::InsertionGuard g(builder);
-  ObjectFifoCreateOp op = getObjectFifo(releaseOp);
-  auto port = releaseOp.getPort();
-  const ObjectFifoEndpointResource &endpointResource =
-      port == ObjectFifoPort::Produce
-          ? resourceMap.at(op).producerResource
-          : resourceMap.at(op).consumerResources.at(tileOp);
-  builder.setInsertionPointAfter(releaseOp);
-  return createUseLocks(builder, op, port, releaseOp.getSize(),
-                        LockAction::Release, endpointResource);
-}
-
-LogicalResult replaceObjectAcquireOp(
-    OpBuilder &builder, ObjectFifoAcquireOp acquireOp, TileOp tileOp,
-    DenseMap<ObjectFifoCreateOp, size_t> &createOpToIndex,
-    const DenseMap<ObjectFifoCreateOp, ObjectFifoResources> &resourceMap) {
-  OpBuilder::InsertionGuard g(builder);
-  ObjectFifoCreateOp op = getObjectFifo(acquireOp);
-  if (!createOpToIndex.contains(op)) createOpToIndex[op] = 0;
-  auto port = acquireOp.getPort();
-  const ObjectFifoEndpointResource &endpointResource =
-      port == ObjectFifoPort::Produce
-          ? resourceMap.at(op).producerResource
-          : resourceMap.at(op).consumerResources.at(tileOp);
-
-  builder.setInsertionPointAfter(acquireOp);
-  if (failed(createUseLocks(builder, op, port, acquireOp.getSize(),
-                            LockAction::AcquireGreaterEqual,
-                            endpointResource))) {
-    return failure();
-  }
-
-  for (Operation *userOp : acquireOp->getUsers()) {
-    auto subviewAccessOp = dyn_cast<ObjectFifoSubviewAccessOp>(userOp);
-    if (!subviewAccessOp) {
-      return acquireOp.emitOpError()
-             << "currently only supports `aie.objectfifo.subview.access` users";
-    }
-    size_t index = subviewAccessOp.getIndex();
-    size_t bufferIndex =
-        (createOpToIndex[op] + index) % endpointResource.buffers.size();
-    BufferOp bufferOp = endpointResource.buffers[bufferIndex];
-    subviewAccessOp.getResult().replaceAllUsesWith(bufferOp.getResult());
-  }
-  // Increment index to rotate through available buffers objectFifo acquires.
-  createOpToIndex[op] += acquireOp.getSize();
-  return success();
-}
-
-/// Utility to create a vector of buffer ops for an objectFifo.
-SmallVector<BufferOp> createBuffers(OpBuilder &builder,
-                                    const AMDAIEDeviceModel &deviceModel,
-                                    ObjectFifoCreateOp createOp,
-                                    size_t numBuffers, TileOp tile,
-                                    const std::string &prefix, size_t index) {
-  SmallVector<BufferOp> buffers;
-  if (deviceModel.isShimTile(tile.getCol(), tile.getRow())) return buffers;
-  auto fifoType = cast<AIEObjectFifoType>(createOp.getElemType());
-  auto elemType = cast<MemRefType>(fifoType.getElementType());
-  for (int ofElemIndex = 0; ofElemIndex < numBuffers; ofElemIndex++) {
-    auto buff = builder.create<BufferOp>(
-        builder.getUnknownLoc(), elemType, tile,
-        builder.getStringAttr(prefix + "_buff_" + std::to_string(index) + "_" +
-                              std::to_string(ofElemIndex)),
-        /*address*/ nullptr,
-        /*mem_bank*/ nullptr);
-    buffers.push_back(buff);
-  }
-  return buffers;
-}
-
-std::pair<LockOp, LockOp> createLockPair(OpBuilder &builder,
-                                         const AMDAIEDeviceModel &deviceModel,
-                                         TileOp tile, int depth,
-                                         const std::string &prefix,
-                                         size_t index) {
-  // TODO(jornt): make this more extensible towards different lock
-  // schemes.
-  int producerInitValue{depth};
-  int consumerInitValue{0};
-  // Use no lock value for shim tiles as the shim DMAs don't need to be
-  // synchronized. TODO(jornt): we might be able to just not create any locks
-  // for shims, see buffers.
-  if (deviceModel.isShimTile(tile.getCol(), tile.getRow()))
-    producerInitValue = 0;
-  LockOp producerLock = builder.create<LockOp>(
-      builder.getUnknownLoc(), tile, IntegerAttr{},
-      builder.getI8IntegerAttr(producerInitValue),
-      builder.getStringAttr(prefix + "_prod_lock_" + std::to_string(index)));
-  LockOp consumerLock = builder.create<LockOp>(
-      builder.getUnknownLoc(), tile, IntegerAttr{},
-      builder.getI8IntegerAttr(consumerInitValue),
-      builder.getStringAttr(prefix + "_cons_lock_" + std::to_string(index)));
-  return std::make_pair(producerLock, consumerLock);
-}
-
-/// Utility to create buffers and locks for the objectFifo producer side.
-LogicalResult createProducerBuffersAndLocks(
-    OpBuilder &builder, const AMDAIEDeviceModel &deviceModel,
-    ObjectFifoCreateOp createOp, size_t index,
-    DenseMap<ObjectFifoCreateOp, ObjectFifoResources> &resourceMap) {
-  OpBuilder::InsertionGuard g(builder);
-  TileOp producerTileOp =
-      dyn_cast_if_present<TileOp>(createOp.getProducerTile().getDefiningOp());
-  if (!producerTileOp) {
-    return createOp.emitOpError() << "expected a producer tile op, but got: "
-                                  << createOp.getProducerTile();
-  }
-  size_t depth = objFifoSize(createOp);
-  SmallVector<BufferOp> producerBuffers =
-      createBuffers(builder, deviceModel, createOp, depth, producerTileOp,
-                    name(createOp).str() + "_prod", index);
-  std::pair<LockOp, LockOp> lockPair =
-      createLockPair(builder, deviceModel, producerTileOp, depth,
-                     name(createOp).str() + "_prod", index);
-  // Swap for producers to synchronize with potential consumers on the other
-  // side.
-  std::swap(lockPair.first, lockPair.second);
-  std::pair<uint8_t, uint8_t> lockAcqRel = std::make_pair(1, 1);
-  resourceMap[createOp].producerResource = ObjectFifoEndpointResource(
-      producerBuffers, LockResources(lockPair, lockAcqRel));
-  return success();
-}
-
-/// Utility to create buffers and locks for the objectFifo consumer side.
-LogicalResult createConsumerBuffersAndLocks(
-    OpBuilder &builder, const AMDAIEDeviceModel &deviceModel,
-    ObjectFifoCreateOp createOp, size_t external_idx,
-    DenseMap<ObjectFifoCreateOp, ObjectFifoResources> &resourceMap) {
-  OpBuilder::InsertionGuard g(builder);
-  resourceMap[createOp].consumerResources.clear();
-  size_t depth = objFifoSize(createOp);
-  for (auto &&[idx1, consumerTile] :
-       llvm::enumerate(createOp.getConsumerTiles())) {
-    size_t idx = external_idx * createOp.getConsumerTiles().size() + idx1;
-    TileOp consumerTileOp =
-        dyn_cast_if_present<TileOp>(consumerTile.getDefiningOp());
-    if (!consumerTileOp) {
-      return createOp.emitOpError()
-             << "expected a consumer tile op, but got: " << consumerTile;
-    }
-    SmallVector<BufferOp> consumerBuffers =
-        createBuffers(builder, deviceModel, createOp, depth, consumerTileOp,
-                      name(createOp).str() + "_cons", idx);
-    std::pair<LockOp, LockOp> lockPair =
-        createLockPair(builder, deviceModel, consumerTileOp, depth,
-                       name(createOp).str() + "_cons", idx);
-    std::pair<uint8_t, uint8_t> lockAcqRel = std::make_pair(1, 1);
-    resourceMap[createOp].consumerResources[consumerTileOp] =
-        ObjectFifoEndpointResource(consumerBuffers,
-                                   LockResources(lockPair, lockAcqRel));
-  }
-  return success();
-}
-
-LogicalResult createBuffersAndLocks(
-    OpBuilder &builder, DeviceOp device, ObjectFifoLinkOp linkOp,
-    DenseMap<ObjectFifoCreateOp, ObjectFifoResources> &resourceMap) {
-  OpBuilder::InsertionGuard g(builder);
-  AMDAIEDeviceModel deviceModel =
-      getDeviceModel(static_cast<AMDAIEDevice>(device.getDevice()));
-
-  SmallVector<ObjectFifoCreateOp> inputs = getInputObjectFifos(linkOp);
-  SmallVector<ObjectFifoCreateOp> outputs = getOutputObjectFifos(linkOp);
-  assert(inputs.size() > 0 && "there should be inputs in the link op");
-  assert(outputs.size() > 0 && "there should be outputs in the link op");
-  uint32_t inputsOffset{0};
-  for (ObjectFifoCreateOp input : inputs) {
-    resourceMap[input] = ObjectFifoResources(0, inputsOffset);
-    auto fifoType = cast<AIEObjectFifoType>(input.getElemType());
-    auto fifoElemType = cast<MemRefType>(fifoType.getElementType());
-    inputsOffset += fifoElemType.getNumElements();
-  }
-  uint32_t outputsOffset{0};
-  for (ObjectFifoCreateOp output : outputs) {
-    resourceMap[output] = ObjectFifoResources(outputsOffset, 0);
-    auto fifoType = cast<AIEObjectFifoType>(output.getElemType());
-    auto fifoElemType = cast<MemRefType>(fifoType.getElementType());
-    outputsOffset += fifoElemType.getNumElements();
-  }
-
-  ObjectFifoCreateOp linkCreateOp;
-  SmallVector<ObjectFifoCreateOp> linkOtherOps;
-  TileOp linkTileOp;
-  if (isJoin(linkOp)) {
-    assert(outputs.size() == 1 && "single output expected");
-    linkCreateOp = outputs[0];
-    linkOtherOps = inputs;
-    linkTileOp = dyn_cast_if_present<TileOp>(
-        linkCreateOp.getProducerTile().getDefiningOp());
-  } else if (isDistribute(linkOp)) {
-    assert(inputs.size() == 1 && "single input expected");
-    linkCreateOp = inputs[0];
-    linkOtherOps = outputs;
-    linkTileOp = dyn_cast_if_present<TileOp>(
-        linkCreateOp.getConsumerTiles()[0].getDefiningOp());
-  } else if (isOneToOne(linkOp)) {
-    auto inFifoType = cast<AIEObjectFifoType>(inputs[0].getElemType());
-    auto inFifoElemType = cast<MemRefType>(inFifoType.getElementType());
-    auto outFifoType = cast<AIEObjectFifoType>(outputs[0].getElemType());
-    auto outFifoElemType = cast<MemRefType>(outFifoType.getElementType());
-    if (inFifoElemType.getNumElements() >= outFifoElemType.getNumElements()) {
-      linkCreateOp = inputs[0];
-      linkOtherOps = outputs;
-      linkTileOp = dyn_cast_if_present<TileOp>(
-          linkCreateOp.getConsumerTiles()[0].getDefiningOp());
-    } else {
-      linkCreateOp = outputs[0];
-      linkOtherOps = inputs;
-      linkTileOp = dyn_cast_if_present<TileOp>(
-          linkCreateOp.getProducerTile().getDefiningOp());
-    }
-  } else {
-    return linkOp.emitOpError()
-           << "only join or distribute link supported currently";
-  }
-  if (!linkTileOp) {
-    return linkCreateOp.emitOpError() << "expected a tile op";
-  }
-
-  size_t depth = objFifoSize(linkCreateOp);
-  if (!depth) return linkCreateOp.emitOpError() << "doesn't have a size";
-
-  // Reset opbuilder location to after the last tile declaration
-  auto tiles = device.getBody()->getOps<TileOp>();
-  assert(!tiles.empty() && "no tiles in device");
-  builder.setInsertionPointAfter(*std::prev(tiles.end(), 1));
-
-  {
-    SmallVector<BufferOp> linkBuffers =
-        createBuffers(builder, deviceModel, linkCreateOp, depth, linkTileOp,
-                      name(linkCreateOp).str() + "_link", 0);
-    size_t linkDepth = depth * linkOtherOps.size();
-    std::pair<LockOp, LockOp> linkLockPair =
-        createLockPair(builder, deviceModel, linkTileOp, linkDepth,
-                       name(linkCreateOp).str() + "_link", 0);
-    uint8_t inputAcqRelValue = linkDepth / depth / inputs.size();
-    std::pair<uint8_t, uint8_t> inputLockAcqRel =
-        std::make_pair(inputAcqRelValue, inputAcqRelValue);
-    for (ObjectFifoCreateOp input : inputs) {
-      resourceMap[input].consumerResources[linkTileOp] =
-          ObjectFifoEndpointResource(
-              linkBuffers, LockResources(linkLockPair, inputLockAcqRel));
-    }
-    // Swap locks for outputs to synchronize link inputs and outputs.
-    std::swap(linkLockPair.first, linkLockPair.second);
-    uint8_t outputAcqRelValue = linkDepth / depth / outputs.size();
-    std::pair<uint8_t, uint8_t> outputLockAcqRel =
-        std::make_pair(outputAcqRelValue, outputAcqRelValue);
-    for (ObjectFifoCreateOp output : outputs) {
-      resourceMap[output].producerResource = ObjectFifoEndpointResource(
-          linkBuffers, LockResources(linkLockPair, outputLockAcqRel));
-    }
-  }
-
-  for (auto &&[idx, input] : llvm::enumerate(inputs)) {
-    if (failed(createProducerBuffersAndLocks(builder, deviceModel, input, idx,
-                                             resourceMap))) {
-      return failure();
-    }
-  }
-
-  for (auto &&[idx, output] : llvm::enumerate(outputs)) {
-    if (failed(createConsumerBuffersAndLocks(builder, deviceModel, output, idx,
-                                             resourceMap))) {
-      return failure();
-    }
-  }
-  return success();
-}
-
-LogicalResult createBuffersAndLocksForNonLinkOps(
-    OpBuilder &builder, DeviceOp device, ObjectFifoCreateOp createOp,
-    DenseMap<ObjectFifoCreateOp, ObjectFifoResources> &resourceMap) {
-  // Skip objectFifoCreateOps in links.
-  if (getOptionalLinkOp(createOp)) return success();
-  OpBuilder::InsertionGuard g(builder);
-  AMDAIEDeviceModel deviceModel =
-      getDeviceModel(static_cast<AMDAIEDevice>(device.getDevice()));
-  resourceMap[createOp] = ObjectFifoResources(0, 0);
-  size_t depth = objFifoSize(createOp);
-  if (!depth) return createOp.emitOpError() << "doesn't have a depth size";
-
-  // Reset opbuilder location to after the last tile declaration
-  auto tiles = device.getBody()->getOps<TileOp>();
-  assert(!tiles.empty() && "no tiles in device");
-  builder.setInsertionPointAfter(*std::prev(tiles.end(), 1));
-  if (failed(createProducerBuffersAndLocks(builder, deviceModel, createOp, 0,
-                                           resourceMap))) {
-    return failure();
-  }
-  if (failed(createConsumerBuffersAndLocks(builder, deviceModel, createOp, 0,
-                                           resourceMap))) {
-    return failure();
-  }
-  return success();
-}
-
-LogicalResult createTileDMAs(
-    OpBuilder &builder, DeviceOp device, ObjectFifoCreateOp createOp,
-    DenseMap<ObjectFifoCreateOp, ObjectFifoResources> &resourceMap,
-    const DenseMap<StringRef, SmallVector<FlowOp>> &symbolToFlowOps) {
-  OpBuilder::InsertionGuard g(builder);
-  AMDAIEDeviceModel deviceModel =
-      getDeviceModel(static_cast<AMDAIEDevice>(device.getDevice()));
-
-  auto createDMA = [&deviceModel, &device, &builder](
-                       TileOp tileOp, DMAChannelDir channelDir,
-                       uint8_t channelIndex, size_t size,
-                       BDDimLayoutArrayAttr dims, StringRef name,
-                       uint32_t offset,
-                       const ObjectFifoEndpointResource &endpointResource) {
-    if (deviceModel.isShimTile(tileOp.getCol(), tileOp.getRow())) {
-      builder.create<ShimDMAAllocationOp>(builder.getUnknownLoc(), name,
-                                          channelDir, channelIndex,
-                                          tileOp.getCol());
-    } else if (deviceModel.isMemTile(tileOp.getCol(), tileOp.getRow())) {
-      createTileDMA<MemTileDMAOp>(device, builder, tileOp, channelDir,
-                                  channelIndex, size, dims, offset,
-                                  endpointResource);
-    } else {
-      createTileDMA<MemOp>(device, builder, tileOp, channelDir, channelIndex,
-                           size, dims, offset, endpointResource);
-    }
-  };
-
-  // Collect producer and consumer DMA channels
-  if (!symbolToFlowOps.contains(createOp.getSymName())) {
-    return createOp.emitOpError()
-           << "symbol name not found in symbol to flow ops map";
-  }
-  SmallVector<FlowOp> flowOps = symbolToFlowOps.at(createOp.getSymName());
-  SmallVector<uint8_t> producerChannelsVec = llvm::map_to_vector(
-      flowOps, [](FlowOp flowOp) { return flowOp.getSourceChannel(); });
-  llvm::SmallSetVector<uint8_t, 1> producerChannels(producerChannelsVec.begin(),
-                                                    producerChannelsVec.end());
-  if (producerChannels.size() != 1)
-    return createOp.emitOpError() << "expected a single producer channel";
-  DenseMap<Value, uint8_t> consumerChannelsMap;
-  for (FlowOp flowOp : flowOps)
-    consumerChannelsMap[flowOp.getDest()] = flowOp.getDestChannel();
-  if (consumerChannelsMap.size() != createOp.getConsumerTiles().size()) {
-    return createOp.emitOpError() << "expected same number of consumers as the "
-                                     "number of objectFifo consumers";
-  }
-
-  auto fifo = cast<AIEObjectFifoType>(createOp.getElemType());
-  auto elemType = cast<MemRefType>(fifo.getElementType());
-  size_t size = elemType.getNumElements();
-
-  // create producer tile DMA
-  builder.setInsertionPoint(&device.getBody()->back());
-  TileOp producerTileOp =
-      dyn_cast_if_present<TileOp>(createOp.getProducerTile().getDefiningOp());
-  if (!producerTileOp)
-    return createOp.emitOpError() << "expected a producer TileOp";
-  const ObjectFifoResources &opResource = resourceMap[createOp];
-  const ObjectFifoEndpointResource &producerEndpointResource =
-      opResource.producerResource;
-  uint32_t producerOffset = opResource.producerOffset;
-  createDMA(producerTileOp, DMAChannelDir::MM2S, producerChannels[0], size,
-            createOp.getDimensionsToStreamAttr(), createOp.getName(),
-            producerOffset, producerEndpointResource);
-
-  assert(opResource.consumerResources.size() ==
-             createOp.getConsumerTiles().size() &&
-         "same number of consumer resources expected as the number of consumer "
-         "tiles on the objectFifo");
-  for (auto &&[idx, consumerTile] :
-       llvm::enumerate(createOp.getConsumerTiles())) {
-    TileOp consumerTileOp =
-        dyn_cast_if_present<TileOp>(consumerTile.getDefiningOp());
-    if (!consumerTileOp) {
-      return createOp.emitOpError()
-             << "expected a consumer TileOp, but got: " << consumerTile;
-    }
-    if (!consumerChannelsMap.contains(consumerTile)) {
-      return createOp.emitOpError()
-             << "did not find consumer tile (" << consumerTile
-             << ") in consumerChannelsMap";
-    }
-    uint8_t consumerChannel = consumerChannelsMap[consumerTile];
-
-    // create consumer tile DMA
-    BDDimLayoutArrayAttr consumerDims =
-        createOp.getDimensionsFromStreamPerConsumer()[idx];
-    uint32_t consumersOffset = opResource.consumersOffset;
-    const ObjectFifoEndpointResource &consumerEndpointResource =
-        opResource.consumerResources.at(consumerTileOp);
-    createDMA(consumerTileOp, DMAChannelDir::S2MM, consumerChannel, size,
-              consumerDims, createOp.getName(), consumersOffset,
-              consumerEndpointResource);
-  }
-  return success();
-}
-
-namespace mlir::iree_compiler::AMDAIE {
-struct AMDAIEObjectFifoStatefulTransformPass : mlir::OperationPass<DeviceOp> {
-  MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(
-      AMDAIEObjectFifoStatefulTransformPass)
-
-  AMDAIEObjectFifoStatefulTransformPass()
-      : mlir::OperationPass<DeviceOp>(resolveTypeID()) {}
-
-  llvm::StringRef getArgument() const override {
-    return "amdaie-objectFifo-stateful-transform";
-  }
-
-  llvm::StringRef getName() const override {
-    return " AMDAIEObjectFifoStatefulTransformPass";
-  }
-
-  std::unique_ptr<mlir::Pass> clonePass() const override {
-    return std::make_unique<AMDAIEObjectFifoStatefulTransformPass>(
-        *static_cast<const AMDAIEObjectFifoStatefulTransformPass *>(this));
-  }
-
-  void getDependentDialects(::mlir::DialectRegistry &registry) const override {
-    registry.insert<mlir::memref::MemRefDialect>();
-    registry.insert<xilinx::AIE::AIEDialect>();
-  }
-
-  void runOnOperation() override {
-    DeviceOp device = getOperation();
-    OpBuilder builder = OpBuilder::atBlockEnd(device.getBody());
-
-    // Flow ops contain the DMA information, so create a map for easy lookup
-    // based on a global symbol.
-    DenseMap<StringRef, SmallVector<FlowOp>> symbolToFlowOps;
-    device.walk([&](FlowOp op) {
-      std::optional<StringRef> symbolAttr = op.getSymbol();
-      if (symbolAttr) symbolToFlowOps[symbolAttr.value()].push_back(op);
-    });
-
-    DenseMap<ObjectFifoCreateOp, ObjectFifoResources> resourceMap;
-    for (ObjectFifoLinkOp linkOp : device.getOps<ObjectFifoLinkOp>()) {
-      if (failed(createBuffersAndLocks(builder, device, linkOp, resourceMap))) {
-        return signalPassFailure();
-      }
-    }
-
-    // Handle objectFifos that are not inside a link.
-    for (ObjectFifoCreateOp createOp : device.getOps<ObjectFifoCreateOp>()) {
-      if (failed(createBuffersAndLocksForNonLinkOps(builder, device, createOp,
-                                                    resourceMap))) {
-        return signalPassFailure();
-      }
-    }
-
-    for (ObjectFifoCreateOp createOp : device.getOps<ObjectFifoCreateOp>()) {
-      if (failed(createTileDMAs(builder, device, createOp, resourceMap,
-                                symbolToFlowOps))) {
-        return signalPassFailure();
-      }
-    }
-
-    // Replace ops
-    for (auto coreOp : device.getOps<CoreOp>()) {
-      TileOp tileOp =
-          dyn_cast_if_present<TileOp>(coreOp.getTile().getDefiningOp());
-      if (!tileOp) {
-        coreOp.emitOpError()
-            << "expected a TileOp, but got: " << coreOp.getTile();
-        return signalPassFailure();
-      }
-      WalkResult res = coreOp.walk([&](ObjectFifoReleaseOp releaseOp) {
-        if (failed(replaceReleaseOp(builder, releaseOp, tileOp, resourceMap))) {
-          return WalkResult::interrupt();
-        }
-        return WalkResult::advance();
-      });
-      if (res.wasInterrupted()) return signalPassFailure();
-      // Use a map from objectFifos to indices to rotate through available
-      // buffers for double buffering purposes.
-      DenseMap<ObjectFifoCreateOp, size_t> createOpToIndex;
-      res = coreOp.walk([&](ObjectFifoAcquireOp acquireOp) {
-        if (failed(replaceObjectAcquireOp(builder, acquireOp, tileOp,
-                                          createOpToIndex, resourceMap))) {
-          return WalkResult::interrupt();
-        }
-        return WalkResult::advance();
-      });
-      if (res.wasInterrupted()) return signalPassFailure();
-    }
-
-    // make global symbols to replace the to be erased ObjectFifoCreateOps
-    for (auto createOp : device.getOps<ObjectFifoCreateOp>()) {
-      OpBuilder::InsertionGuard gg(builder);
-      builder.setInsertionPointToStart(&device.getBodyRegion().front());
-      auto symName = createOp.getName();
-      createOp->setAttr(SymbolTable::getSymbolAttrName(),
-                        builder.getStringAttr("__erase_" + symName));
-      auto memrefType =
-          cast<AIEObjectFifoType>(createOp.getElemType()).getElementType();
-      builder.create<memref::GlobalOp>(builder.getUnknownLoc(), symName,
-                                       builder.getStringAttr("public"),
-                                       memrefType, nullptr, false, nullptr);
-    }
-
-    // Remove old ops
-    IRRewriter rewriter(&getContext());
-    device.walk([&](Operation *op) {
-      if (isa<ObjectFifoCreateOp, ObjectFifoLinkOp, ObjectFifoAcquireOp,
-              ObjectFifoSubviewAccessOp, ObjectFifoReleaseOp>(op)) {
-        op->dropAllUses();
-        rewriter.eraseOp(op);
-      }
-    });
-  }
-};
-
-std::unique_ptr<OperationPass<DeviceOp>>
-createAMDAIEObjectFifoStatefulTransformPass() {
-  return std::make_unique<AMDAIEObjectFifoStatefulTransformPass>();
-}
-
-void registerAMDAIEObjectFifoStatefulTransform() {
-  mlir::registerPass([]() -> std::unique_ptr<mlir::Pass> {
-    return createAMDAIEObjectFifoStatefulTransformPass();
-  });
-}
-
-}  // namespace mlir::iree_compiler::AMDAIE
diff --git a/compiler/plugins/target/AMD-AIE/aie/CMakeLists.txt b/compiler/plugins/target/AMD-AIE/aie/CMakeLists.txt
index 52244c48a..21167e9b1 100644
--- a/compiler/plugins/target/AMD-AIE/aie/CMakeLists.txt
+++ b/compiler/plugins/target/AMD-AIE/aie/CMakeLists.txt
@@ -135,13 +135,11 @@ iree_cc_library(
   SRCS
     AMDAIEAssignBufferAddressesBasic.cpp
     AMDAIEAssignBufferDescriptorIDs.cpp
-    AMDAIEAssignLockIDs.cpp
     AMDAIECoreToStandard.cpp
     AMDAIECreatePathFindFlows.cpp
     AMDAIEDmaToNpu.cpp
     AMDAIELocalizeLocks.cpp
     AMDAIENormalizeAddressSpaces.cpp
-    AMDAIEObjectFifoStatefulTransform.cpp
   DEPS
     iree-amd-aie::aie_runtime::iree_aie_runtime_static
     ::AIEDialectIR
diff --git a/compiler/plugins/target/AMD-AIE/aie/Passes.h b/compiler/plugins/target/AMD-AIE/aie/Passes.h
index bf9e64477..a78c14c50 100644
--- a/compiler/plugins/target/AMD-AIE/aie/Passes.h
+++ b/compiler/plugins/target/AMD-AIE/aie/Passes.h
@@ -37,11 +37,9 @@ createAMDAIEDmaToNpuPass();
 
 void registerAMDAIEAssignBufferAddressesBasic();
 void registerAMDAIEAssignBufferDescriptorIDs();
-void registerAMDAIEAssignLockIDs();
 void registerAMDAIECoreToStandard();
 void registerAMDAIELocalizeLocks();
 void registerAMDAIENormalizeAddressSpaces();
-void registerAMDAIEObjectFifoStatefulTransform();
 void registerAMDAIERoutePathfinderFlows();
 void registerAMDAIEDmaToNpu();
 
diff --git a/compiler/plugins/target/AMD-AIE/aie/test/AIE2_cyclostatic_dma.mlir b/compiler/plugins/target/AMD-AIE/aie/test/AIE2_cyclostatic_dma.mlir
deleted file mode 100644
index e703e57dd..000000000
--- a/compiler/plugins/target/AMD-AIE/aie/test/AIE2_cyclostatic_dma.mlir
+++ /dev/null
@@ -1,181 +0,0 @@
-
-// RUN: iree-opt --amdaie-objectFifo-stateful-transform %s | FileCheck %s
-
-// CHECK-LABEL:   aie.device(xcve2302) {
-// CHECK:           memref.global "public" @fifo : memref<i32>
-// CHECK-DAG:       %[[TILE_2_2:.*]] = aie.tile(2, 2)
-// CHECK-DAG:       %[[TILE_8_3:.*]] = aie.tile(8, 3)
-// CHECK-DAG:       %[[BUFFER_2_2:.*]] = aie.buffer(%[[TILE_2_2]]) {sym_name = "fifo_prod_buff_0_0"} : memref<i32>
-// CHECK-DAG:       %[[BUFFER_2_2_0:.*]] = aie.buffer(%[[TILE_2_2]]) {sym_name = "fifo_prod_buff_0_1"} : memref<i32>
-// CHECK-DAG:       %[[BUFFER_2_2_1:.*]] = aie.buffer(%[[TILE_2_2]]) {sym_name = "fifo_prod_buff_0_2"} : memref<i32>
-// CHECK-DAG:       %[[BUFFER_2_2_2:.*]] = aie.buffer(%[[TILE_2_2]]) {sym_name = "fifo_prod_buff_0_3"} : memref<i32>
-// CHECK-DAG:       %[[LOCK_2_2:.*]] = aie.lock(%[[TILE_2_2]]) {init = 4 : i8, sym_name = "fifo_prod_prod_lock_0"}
-// CHECK-DAG:       %[[LOCK_2_2_3:.*]] = aie.lock(%[[TILE_2_2]]) {init = 0 : i8, sym_name = "fifo_prod_cons_lock_0"}
-// CHECK-DAG:       %[[BUFFER_8_3:.*]] = aie.buffer(%[[TILE_8_3]]) {sym_name = "fifo_cons_buff_0_0"} : memref<i32>
-// CHECK-DAG:       %[[BUFFER_8_3_4:.*]] = aie.buffer(%[[TILE_8_3]]) {sym_name = "fifo_cons_buff_0_1"} : memref<i32>
-// CHECK-DAG:       %[[BUFFER_8_3_5:.*]] = aie.buffer(%[[TILE_8_3]]) {sym_name = "fifo_cons_buff_0_2"} : memref<i32>
-// CHECK-DAG:       %[[BUFFER_8_3_6:.*]] = aie.buffer(%[[TILE_8_3]]) {sym_name = "fifo_cons_buff_0_3"} : memref<i32>
-// CHECK-DAG:       %[[LOCK_8_3:.*]] = aie.lock(%[[TILE_8_3]]) {init = 4 : i8, sym_name = "fifo_cons_prod_lock_0"}
-// CHECK-DAG:       %[[LOCK_8_3_7:.*]] = aie.lock(%[[TILE_8_3]]) {init = 0 : i8, sym_name = "fifo_cons_cons_lock_0"}
-// CHECK-DAG:       %[[BUFFER_8_3_8:.*]] = aie.buffer(%[[TILE_8_3]]) {sym_name = "buf83"} : memref<4xi32>
-// CHECK-DAG:       aie.flow(%[[TILE_2_2]], DMA : 0, %[[TILE_8_3]], DMA : 0) {symbol = @fifo}
-// CHECK:           %[[CORE_2_2:.*]] = aie.core(%[[TILE_2_2]]) {
-// CHECK:             %[[C55_I32:.*]] = arith.constant 55 : i32
-// CHECK:             %[[C66_I32:.*]] = arith.constant 66 : i32
-// CHECK:             %[[C77_I32:.*]] = arith.constant 77 : i32
-// CHECK:             %[[C88_I32:.*]] = arith.constant 88 : i32
-// CHECK:             aie.use_lock(%[[LOCK_2_2]], AcquireGreaterEqual, 1)
-// CHECK:             memref.store %[[C55_I32]], %[[BUFFER_2_2]][] : memref<i32>
-// CHECK:             aie.use_lock(%[[LOCK_2_2_3]], Release, 1)
-// CHECK:             aie.use_lock(%[[LOCK_2_2]], AcquireGreaterEqual, 1)
-// CHECK:             memref.store %[[C66_I32]], %[[BUFFER_2_2_0]][] : memref<i32>
-// CHECK:             aie.use_lock(%[[LOCK_2_2_3]], Release, 1)
-// CHECK:             aie.use_lock(%[[LOCK_2_2]], AcquireGreaterEqual, 1)
-// CHECK:             memref.store %[[C77_I32]], %[[BUFFER_2_2_1]][] : memref<i32>
-// CHECK:             aie.use_lock(%[[LOCK_2_2_3]], Release, 1)
-// CHECK:             aie.use_lock(%[[LOCK_2_2]], AcquireGreaterEqual, 1)
-// CHECK:             memref.store %[[C88_I32]], %[[BUFFER_2_2_2]][] : memref<i32>
-// CHECK:             aie.use_lock(%[[LOCK_2_2_3]], Release, 1)
-// CHECK:             aie.end
-// CHECK:           }
-// CHECK:           %[[CORE_8_3:.*]] = aie.core(%[[TILE_8_3]]) {
-// CHECK:             %[[C0:.*]] = arith.constant 0 : index
-// CHECK:             %[[C1:.*]] = arith.constant 1 : index
-// CHECK:             %[[C2:.*]] = arith.constant 2 : index
-// CHECK:             %[[C3:.*]] = arith.constant 3 : index
-// CHECK:             aie.use_lock(%[[LOCK_8_3_7]], AcquireGreaterEqual, 1)
-// CHECK:             %[[VAL_0:.*]] = memref.load %[[BUFFER_8_3]][] : memref<i32>
-// CHECK:             memref.store %[[VAL_0]], %[[BUFFER_8_3_8]]{{\[}}%[[C0]]] : memref<4xi32>
-// CHECK:             aie.use_lock(%[[LOCK_8_3]], Release, 1)
-// CHECK:             aie.use_lock(%[[LOCK_8_3_7]], AcquireGreaterEqual, 2)
-// CHECK:             %[[VAL_1:.*]] = memref.load %[[BUFFER_8_3_4]][] : memref<i32>
-// CHECK:             %[[VAL_2:.*]] = memref.load %[[BUFFER_8_3_5]][] : memref<i32>
-// CHECK:             memref.store %[[VAL_1]], %[[BUFFER_8_3_8]]{{\[}}%[[C1]]] : memref<4xi32>
-// CHECK:             memref.store %[[VAL_2]], %[[BUFFER_8_3_8]]{{\[}}%[[C2]]] : memref<4xi32>
-// CHECK:             aie.use_lock(%[[LOCK_8_3]], Release, 2)
-// CHECK:             aie.use_lock(%[[LOCK_8_3_7]], AcquireGreaterEqual, 1)
-// CHECK:             %[[VAL_3:.*]] = memref.load %[[BUFFER_8_3_6]][] : memref<i32>
-// CHECK:             memref.store %[[VAL_3]], %[[BUFFER_8_3_8]]{{\[}}%[[C3]]] : memref<4xi32>
-// CHECK:             aie.use_lock(%[[LOCK_8_3]], Release, 1)
-// CHECK:             aie.end
-// CHECK:           }
-// CHECK:           %[[MEM_2_2:.*]] = aie.mem(%[[TILE_2_2]]) {
-// CHECK:             %[[VAL_4:.*]] = aie.dma_start(MM2S, 0, ^bb1, ^bb5)
-// CHECK:           ^bb1:
-// CHECK:             aie.use_lock(%[[LOCK_2_2_3]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[BUFFER_2_2]] : memref<i32>) {len = 1 : i32}
-// CHECK:             aie.use_lock(%[[LOCK_2_2]], Release, 1)
-// CHECK:             aie.next_bd ^bb2
-// CHECK:           ^bb2:
-// CHECK:             aie.use_lock(%[[LOCK_2_2_3]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[BUFFER_2_2_0]] : memref<i32>) {len = 1 : i32}
-// CHECK:             aie.use_lock(%[[LOCK_2_2]], Release, 1)
-// CHECK:             aie.next_bd ^bb3
-// CHECK:           ^bb3:
-// CHECK:             aie.use_lock(%[[LOCK_2_2_3]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[BUFFER_2_2_1]] : memref<i32>) {len = 1 : i32}
-// CHECK:             aie.use_lock(%[[LOCK_2_2]], Release, 1)
-// CHECK:             aie.next_bd ^bb4
-// CHECK:           ^bb4:
-// CHECK:             aie.use_lock(%[[LOCK_2_2_3]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[BUFFER_2_2_2]] : memref<i32>) {len = 1 : i32}
-// CHECK:             aie.use_lock(%[[LOCK_2_2]], Release, 1)
-// CHECK:             aie.next_bd ^bb1
-// CHECK:           ^bb5:
-// CHECK:             aie.end
-// CHECK:           }
-// CHECK:           %[[MEM_8_3:.*]] = aie.mem(%[[TILE_8_3]]) {
-// CHECK:             %[[VAL_5:.*]] = aie.dma_start(S2MM, 0, ^bb1, ^bb5)
-// CHECK:           ^bb1:
-// CHECK:             aie.use_lock(%[[LOCK_8_3]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[BUFFER_8_3]] : memref<i32>) {len = 1 : i32}
-// CHECK:             aie.use_lock(%[[LOCK_8_3_7]], Release, 1)
-// CHECK:             aie.next_bd ^bb2
-// CHECK:           ^bb2:
-// CHECK:             aie.use_lock(%[[LOCK_8_3]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[BUFFER_8_3_4]] : memref<i32>) {len = 1 : i32}
-// CHECK:             aie.use_lock(%[[LOCK_8_3_7]], Release, 1)
-// CHECK:             aie.next_bd ^bb3
-// CHECK:           ^bb3:
-// CHECK:             aie.use_lock(%[[LOCK_8_3]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[BUFFER_8_3_5]] : memref<i32>) {len = 1 : i32}
-// CHECK:             aie.use_lock(%[[LOCK_8_3_7]], Release, 1)
-// CHECK:             aie.next_bd ^bb4
-// CHECK:           ^bb4:
-// CHECK:             aie.use_lock(%[[LOCK_8_3]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[BUFFER_8_3_6]] : memref<i32>) {len = 1 : i32}
-// CHECK:             aie.use_lock(%[[LOCK_8_3_7]], Release, 1)
-// CHECK:             aie.next_bd ^bb1
-// CHECK:           ^bb5:
-// CHECK:             aie.end
-// CHECK:           }
-// CHECK:         }
-module @aie2_cyclostatic_dma {
-    aie.device(xcve2302) {
-        %tile22 = aie.tile(2, 2)  // producer tile
-        %tile83 = aie.tile(8, 3)  // consumer tile
-        %buf83  = aie.buffer(%tile83) {sym_name = "buf83"} : memref<4xi32>
-        aie.flow(%tile22, DMA : 0, %tile83, DMA : 0) {symbol = @fifo}
-        // ObjectFifo that can hold 4 memref<i32>s, populated by tile22 and
-        // consumed by tile83
-        aie.objectfifo @fifo (%tile22, {%tile83}, 4 : i32) : !aie.objectfifo<memref<i32>>
-        // Producer core
-        %core22 = aie.core(%tile22) {
-            %c55 = arith.constant 55 : i32
-            %c66 = arith.constant 66 : i32
-            %c77 = arith.constant 77 : i32
-            %c88 = arith.constant 88 : i32
-            // Push 55
-            %subview0 = aie.objectfifo.acquire @fifo (Produce, 1) : !aie.objectfifosubview<memref<i32>>
-            %subview0_obj = aie.objectfifo.subview.access %subview0[0] : !aie.objectfifosubview<memref<i32>> -> memref<i32>
-            memref.store %c55, %subview0_obj[] : memref<i32>
-            aie.objectfifo.release @fifo (Produce, 1)
-            // Push 66
-            %subview1 = aie.objectfifo.acquire @fifo (Produce, 1) : !aie.objectfifosubview<memref<i32>>
-            %subview1_obj = aie.objectfifo.subview.access %subview1[0] : !aie.objectfifosubview<memref<i32>> -> memref<i32>
-            memref.store %c66, %subview1_obj[] : memref<i32>
-            aie.objectfifo.release @fifo (Produce, 1)
-            // Push 77
-            %subview2 = aie.objectfifo.acquire @fifo (Produce, 1) : !aie.objectfifosubview<memref<i32>>
-            %subview2_obj = aie.objectfifo.subview.access %subview2[0] : !aie.objectfifosubview<memref<i32>> -> memref<i32>
-            memref.store %c77, %subview2_obj[] : memref<i32>
-            aie.objectfifo.release @fifo (Produce, 1)
-            // Push 88
-            %subview3 = aie.objectfifo.acquire @fifo (Produce, 1) : !aie.objectfifosubview<memref<i32>>
-            %subview3_obj = aie.objectfifo.subview.access %subview3[0] : !aie.objectfifosubview<memref<i32>> -> memref<i32>
-            memref.store %c88, %subview3_obj[] : memref<i32>
-            aie.objectfifo.release @fifo (Produce, 1)
-            aie.end
-        }
-        // Consumer core
-        %core28 = aie.core(%tile83) {
-            // Consumer pattern: {1, 2, 1}
-            %i0 = arith.constant 0 : index
-            %i1 = arith.constant 1 : index
-            %i2 = arith.constant 2 : index
-            %i3 = arith.constant 3 : index
-            // Pop 1 object off queue
-            %subview0 = aie.objectfifo.acquire @fifo (Consume, 1) : !aie.objectfifosubview<memref<i32>>
-            %subview0_obj = aie.objectfifo.subview.access %subview0[0] : !aie.objectfifosubview<memref<i32>> -> memref<i32>
-            %v55 = memref.load %subview0_obj[] : memref<i32>
-            memref.store %v55, %buf83[%i0] : memref<4xi32>
-            aie.objectfifo.release @fifo (Consume, 1)
-            // Pop 2 objects off queue
-            %subview1 = aie.objectfifo.acquire @fifo (Consume, 2) : !aie.objectfifosubview<memref<i32>>
-            %subview1_obj0 = aie.objectfifo.subview.access %subview1[0] : !aie.objectfifosubview<memref<i32>> -> memref<i32>
-            %subview1_obj1 = aie.objectfifo.subview.access %subview1[1] : !aie.objectfifosubview<memref<i32>> -> memref<i32>
-            %v66 = memref.load %subview1_obj0[] : memref<i32>
-            %v77 = memref.load %subview1_obj1[] : memref<i32>
-            memref.store %v66, %buf83[%i1] : memref<4xi32>
-            memref.store %v77, %buf83[%i2] : memref<4xi32>
-            aie.objectfifo.release @fifo (Consume, 2)
-            // Pop 1 object off queue
-            %subview2 = aie.objectfifo.acquire @fifo (Consume, 1) : !aie.objectfifosubview<memref<i32>>
-            %subview2_obj = aie.objectfifo.subview.access %subview2[0] : !aie.objectfifosubview<memref<i32>> -> memref<i32>
-            %v88 = memref.load %subview2_obj[] : memref<i32>
-            memref.store %v88, %buf83[%i3] : memref<4xi32>
-            aie.objectfifo.release @fifo (Consume, 1)
-            aie.end
-        }
-    }
-}
diff --git a/compiler/plugins/target/AMD-AIE/aie/test/AIE2_cyclostatic_l1.mlir b/compiler/plugins/target/AMD-AIE/aie/test/AIE2_cyclostatic_l1.mlir
deleted file mode 100644
index 61091228a..000000000
--- a/compiler/plugins/target/AMD-AIE/aie/test/AIE2_cyclostatic_l1.mlir
+++ /dev/null
@@ -1,182 +0,0 @@
-
-// RUN: iree-opt --amdaie-objectFifo-stateful-transform %s | FileCheck %s
-
-
-// CHECK-LABEL:   aie.device(xcve2302) {
-// CHECK:           memref.global "public" @fifo : memref<i32>
-// CHECK-DAG:       %[[TILE_2_2:.*]] = aie.tile(2, 2)
-// CHECK-DAG:       %[[TILE_2_3:.*]] = aie.tile(2, 3)
-// CHECK-DAG:       %[[BUFFER_2_2:.*]] = aie.buffer(%[[TILE_2_2]]) {sym_name = "fifo_prod_buff_0_0"} : memref<i32>
-// CHECK-DAG:       %[[BUFFER_2_2_0:.*]] = aie.buffer(%[[TILE_2_2]]) {sym_name = "fifo_prod_buff_0_1"} : memref<i32>
-// CHECK-DAG:       %[[BUFFER_2_2_1:.*]] = aie.buffer(%[[TILE_2_2]]) {sym_name = "fifo_prod_buff_0_2"} : memref<i32>
-// CHECK-DAG:       %[[BUFFER_2_2_2:.*]] = aie.buffer(%[[TILE_2_2]]) {sym_name = "fifo_prod_buff_0_3"} : memref<i32>
-// CHECK-DAG:       %[[LOCK_2_2:.*]] = aie.lock(%[[TILE_2_2]]) {init = 4 : i8, sym_name = "fifo_prod_prod_lock_0"}
-// CHECK-DAG:       %[[LOCK_2_2_3:.*]] = aie.lock(%[[TILE_2_2]]) {init = 0 : i8, sym_name = "fifo_prod_cons_lock_0"}
-// CHECK-DAG:       %[[BUFFER_2_3:.*]] = aie.buffer(%[[TILE_2_3]]) {sym_name = "fifo_cons_buff_0_0"} : memref<i32>
-// CHECK-DAG:       %[[BUFFER_2_3_4:.*]] = aie.buffer(%[[TILE_2_3]]) {sym_name = "fifo_cons_buff_0_1"} : memref<i32>
-// CHECK-DAG:       %[[BUFFER_2_3_5:.*]] = aie.buffer(%[[TILE_2_3]]) {sym_name = "fifo_cons_buff_0_2"} : memref<i32>
-// CHECK-DAG:       %[[BUFFER_2_3_6:.*]] = aie.buffer(%[[TILE_2_3]]) {sym_name = "fifo_cons_buff_0_3"} : memref<i32>
-// CHECK-DAG:       %[[LOCK_2_3:.*]] = aie.lock(%[[TILE_2_3]]) {init = 4 : i8, sym_name = "fifo_cons_prod_lock_0"}
-// CHECK-DAG:       %[[LOCK_2_3_7:.*]] = aie.lock(%[[TILE_2_3]]) {init = 0 : i8, sym_name = "fifo_cons_cons_lock_0"}
-// CHECK-DAG:       %[[BUFFER_2_3_8:.*]] = aie.buffer(%[[TILE_2_3]]) {sym_name = "buf23"} : memref<4xi32>
-// CHECK-DAG:       aie.flow(%[[TILE_2_2]], DMA : 0, %[[TILE_2_3]], DMA : 0) {symbol = @fifo}
-// CHECK:           %[[CORE_2_2:.*]] = aie.core(%[[TILE_2_2]]) {
-// CHECK:             %[[C55_I32:.*]] = arith.constant 55 : i32
-// CHECK:             %[[C66_I32:.*]] = arith.constant 66 : i32
-// CHECK:             %[[C77_I32:.*]] = arith.constant 77 : i32
-// CHECK:             %[[C88_I32:.*]] = arith.constant 88 : i32
-// CHECK:             aie.use_lock(%[[LOCK_2_2]], AcquireGreaterEqual, 1)
-// CHECK:             memref.store %[[C55_I32]], %[[BUFFER_2_2]][] : memref<i32>
-// CHECK:             aie.use_lock(%[[LOCK_2_2_3]], Release, 1)
-// CHECK:             aie.use_lock(%[[LOCK_2_2]], AcquireGreaterEqual, 1)
-// CHECK:             memref.store %[[C66_I32]], %[[BUFFER_2_2_0]][] : memref<i32>
-// CHECK:             aie.use_lock(%[[LOCK_2_2_3]], Release, 1)
-// CHECK:             aie.use_lock(%[[LOCK_2_2]], AcquireGreaterEqual, 1)
-// CHECK:             memref.store %[[C77_I32]], %[[BUFFER_2_2_1]][] : memref<i32>
-// CHECK:             aie.use_lock(%[[LOCK_2_2_3]], Release, 1)
-// CHECK:             aie.use_lock(%[[LOCK_2_2]], AcquireGreaterEqual, 1)
-// CHECK:             memref.store %[[C88_I32]], %[[BUFFER_2_2_2]][] : memref<i32>
-// CHECK:             aie.use_lock(%[[LOCK_2_2_3]], Release, 1)
-// CHECK:             aie.end
-// CHECK:           }
-// CHECK:           %[[CORE_2_3:.*]] = aie.core(%[[TILE_2_3]]) {
-// CHECK:             %[[C0:.*]] = arith.constant 0 : index
-// CHECK:             %[[C1:.*]] = arith.constant 1 : index
-// CHECK:             %[[C2:.*]] = arith.constant 2 : index
-// CHECK:             %[[C3:.*]] = arith.constant 3 : index
-// CHECK:             aie.use_lock(%[[LOCK_2_3_7]], AcquireGreaterEqual, 1)
-// CHECK:             %[[VAL_0:.*]] = memref.load %[[BUFFER_2_3]][] : memref<i32>
-// CHECK:             memref.store %[[VAL_0]], %[[BUFFER_2_3_8]]{{\[}}%[[C0]]] : memref<4xi32>
-// CHECK:             aie.use_lock(%[[LOCK_2_3]], Release, 1)
-// CHECK:             aie.use_lock(%[[LOCK_2_3_7]], AcquireGreaterEqual, 2)
-// CHECK:             %[[VAL_1:.*]] = memref.load %[[BUFFER_2_3_4]][] : memref<i32>
-// CHECK:             %[[VAL_2:.*]] = memref.load %[[BUFFER_2_3_5]][] : memref<i32>
-// CHECK:             memref.store %[[VAL_1]], %[[BUFFER_2_3_8]]{{\[}}%[[C1]]] : memref<4xi32>
-// CHECK:             memref.store %[[VAL_2]], %[[BUFFER_2_3_8]]{{\[}}%[[C2]]] : memref<4xi32>
-// CHECK:             aie.use_lock(%[[LOCK_2_3]], Release, 2)
-// CHECK:             aie.use_lock(%[[LOCK_2_3_7]], AcquireGreaterEqual, 1)
-// CHECK:             %[[VAL_3:.*]] = memref.load %[[BUFFER_2_3_6]][] : memref<i32>
-// CHECK:             memref.store %[[VAL_3]], %[[BUFFER_2_3_8]]{{\[}}%[[C3]]] : memref<4xi32>
-// CHECK:             aie.use_lock(%[[LOCK_2_3]], Release, 1)
-// CHECK:             aie.end
-// CHECK:           }
-// CHECK:           %[[MEM_2_2:.*]] = aie.mem(%[[TILE_2_2]]) {
-// CHECK:             %[[VAL_4:.*]] = aie.dma_start(MM2S, 0, ^bb1, ^bb5)
-// CHECK:           ^bb1:
-// CHECK:             aie.use_lock(%[[LOCK_2_2_3]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[BUFFER_2_2]] : memref<i32>) {len = 1 : i32}
-// CHECK:             aie.use_lock(%[[LOCK_2_2]], Release, 1)
-// CHECK:             aie.next_bd ^bb2
-// CHECK:           ^bb2:
-// CHECK:             aie.use_lock(%[[LOCK_2_2_3]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[BUFFER_2_2_0]] : memref<i32>) {len = 1 : i32}
-// CHECK:             aie.use_lock(%[[LOCK_2_2]], Release, 1)
-// CHECK:             aie.next_bd ^bb3
-// CHECK:           ^bb3:
-// CHECK:             aie.use_lock(%[[LOCK_2_2_3]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[BUFFER_2_2_1]] : memref<i32>) {len = 1 : i32}
-// CHECK:             aie.use_lock(%[[LOCK_2_2]], Release, 1)
-// CHECK:             aie.next_bd ^bb4
-// CHECK:           ^bb4:
-// CHECK:             aie.use_lock(%[[LOCK_2_2_3]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[BUFFER_2_2_2]] : memref<i32>) {len = 1 : i32}
-// CHECK:             aie.use_lock(%[[LOCK_2_2]], Release, 1)
-// CHECK:             aie.next_bd ^bb1
-// CHECK:           ^bb5:
-// CHECK:             aie.end
-// CHECK:           }
-// CHECK:           %[[MEM_2_3:.*]] = aie.mem(%[[TILE_2_3]]) {
-// CHECK:             %[[VAL_5:.*]] = aie.dma_start(S2MM, 0, ^bb1, ^bb5)
-// CHECK:           ^bb1:
-// CHECK:             aie.use_lock(%[[LOCK_2_3]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[BUFFER_2_3]] : memref<i32>) {len = 1 : i32}
-// CHECK:             aie.use_lock(%[[LOCK_2_3_7]], Release, 1)
-// CHECK:             aie.next_bd ^bb2
-// CHECK:           ^bb2:
-// CHECK:             aie.use_lock(%[[LOCK_2_3]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[BUFFER_2_3_4]] : memref<i32>) {len = 1 : i32}
-// CHECK:             aie.use_lock(%[[LOCK_2_3_7]], Release, 1)
-// CHECK:             aie.next_bd ^bb3
-// CHECK:           ^bb3:
-// CHECK:             aie.use_lock(%[[LOCK_2_3]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[BUFFER_2_3_5]] : memref<i32>) {len = 1 : i32}
-// CHECK:             aie.use_lock(%[[LOCK_2_3_7]], Release, 1)
-// CHECK:             aie.next_bd ^bb4
-// CHECK:           ^bb4:
-// CHECK:             aie.use_lock(%[[LOCK_2_3]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[BUFFER_2_3_6]] : memref<i32>) {len = 1 : i32}
-// CHECK:             aie.use_lock(%[[LOCK_2_3_7]], Release, 1)
-// CHECK:             aie.next_bd ^bb1
-// CHECK:           ^bb5:
-// CHECK:             aie.end
-// CHECK:           }
-// CHECK:         }
-module @aie2_cyclostatic_l1 {
-    aie.device(xcve2302) {
-        %tile22 = aie.tile(2, 2)  // producer tile
-        %tile23 = aie.tile(2, 3)  // consumer tile
-        %buf23  = aie.buffer(%tile23) {sym_name = "buf23"} : memref<4xi32>
-        aie.flow(%tile22, DMA : 0, %tile23, DMA : 0) {symbol = @fifo}
-        // ObjectFifo that can hold 4 memref<i32>s, populated by tile22 and
-        // consumed by tile23
-        aie.objectfifo @fifo (%tile22, {%tile23}, 4 : i32) : !aie.objectfifo<memref<i32>>
-        // Producer core
-        %core22 = aie.core(%tile22) {
-            %c55 = arith.constant 55 : i32
-            %c66 = arith.constant 66 : i32
-            %c77 = arith.constant 77 : i32
-            %c88 = arith.constant 88 : i32
-            // Push 55
-            %subview0 = aie.objectfifo.acquire @fifo (Produce, 1) : !aie.objectfifosubview<memref<i32>>
-            %subview0_obj = aie.objectfifo.subview.access %subview0[0] : !aie.objectfifosubview<memref<i32>> -> memref<i32>
-            memref.store %c55, %subview0_obj[] : memref<i32>
-            aie.objectfifo.release @fifo (Produce, 1)
-            // Push 66
-            %subview1 = aie.objectfifo.acquire @fifo (Produce, 1) : !aie.objectfifosubview<memref<i32>>
-            %subview1_obj = aie.objectfifo.subview.access %subview1[0] : !aie.objectfifosubview<memref<i32>> -> memref<i32>
-            memref.store %c66, %subview1_obj[] : memref<i32>
-            aie.objectfifo.release @fifo (Produce, 1)
-            // Push 77
-            %subview2 = aie.objectfifo.acquire @fifo (Produce, 1) : !aie.objectfifosubview<memref<i32>>
-            %subview2_obj = aie.objectfifo.subview.access %subview2[0] : !aie.objectfifosubview<memref<i32>> -> memref<i32>
-            memref.store %c77, %subview2_obj[] : memref<i32>
-            aie.objectfifo.release @fifo (Produce, 1)
-            // Push 88
-            %subview3 = aie.objectfifo.acquire @fifo (Produce, 1) : !aie.objectfifosubview<memref<i32>>
-            %subview3_obj = aie.objectfifo.subview.access %subview3[0] : !aie.objectfifosubview<memref<i32>> -> memref<i32>
-            memref.store %c88, %subview3_obj[] : memref<i32>
-            aie.objectfifo.release @fifo (Produce, 1)
-            aie.end
-        }
-        // Consumer core
-        %core23 = aie.core(%tile23) {
-            // Consumer pattern: {1, 2, 1}
-            %i0 = arith.constant 0 : index
-            %i1 = arith.constant 1 : index
-            %i2 = arith.constant 2 : index
-            %i3 = arith.constant 3 : index
-            // Pop 1 object off queue
-            %subview0 = aie.objectfifo.acquire @fifo (Consume, 1) : !aie.objectfifosubview<memref<i32>>
-            %subview0_obj = aie.objectfifo.subview.access %subview0[0] : !aie.objectfifosubview<memref<i32>> -> memref<i32>
-            %v55 = memref.load %subview0_obj[] : memref<i32>
-            memref.store %v55, %buf23[%i0] : memref<4xi32>
-            aie.objectfifo.release @fifo (Consume, 1)
-            // Pop 2 objects off queue
-            %subview1 = aie.objectfifo.acquire @fifo (Consume, 2) : !aie.objectfifosubview<memref<i32>>
-            %subview1_obj0 = aie.objectfifo.subview.access %subview1[0] : !aie.objectfifosubview<memref<i32>> -> memref<i32>
-            %subview1_obj1 = aie.objectfifo.subview.access %subview1[1] : !aie.objectfifosubview<memref<i32>> -> memref<i32>
-            %v66 = memref.load %subview1_obj0[] : memref<i32>
-            %v77 = memref.load %subview1_obj1[] : memref<i32>
-            memref.store %v66, %buf23[%i1] : memref<4xi32>
-            memref.store %v77, %buf23[%i2] : memref<4xi32>
-            aie.objectfifo.release @fifo (Consume, 2)
-            // Pop 1 object off queue
-            %subview2 = aie.objectfifo.acquire @fifo (Consume, 1) : !aie.objectfifosubview<memref<i32>>
-            %subview2_obj = aie.objectfifo.subview.access %subview2[0] : !aie.objectfifosubview<memref<i32>> -> memref<i32>
-            %v88 = memref.load %subview2_obj[] : memref<i32>
-            memref.store %v88, %buf23[%i3] : memref<4xi32>
-            aie.objectfifo.release @fifo (Consume, 1)
-            aie.end
-        }
-    }
-}
diff --git a/compiler/plugins/target/AMD-AIE/aie/test/AIE2_cyclostatic_l2.mlir b/compiler/plugins/target/AMD-AIE/aie/test/AIE2_cyclostatic_l2.mlir
deleted file mode 100644
index 9cd02f03c..000000000
--- a/compiler/plugins/target/AMD-AIE/aie/test/AIE2_cyclostatic_l2.mlir
+++ /dev/null
@@ -1,244 +0,0 @@
-
-// RUN: iree-opt --amdaie-objectFifo-stateful-transform %s | FileCheck %s
-
-// CHECK-LABEL:   aie.device(xcve2302) {
-// CHECK:           memref.global "public" @fifo1 : memref<1xi32>
-// CHECK:           memref.global "public" @fifo0 : memref<1xi32>
-// CHECK-DAG:       %[[TILE_2_2:.*]] = aie.tile(2, 2)
-// CHECK-DAG:       %[[TILE_2_1:.*]] = aie.tile(2, 1)
-// CHECK-DAG:       %[[TILE_8_3:.*]] = aie.tile(8, 3)
-// CHECK-DAG:       %[[FIFO1_CONS_BUFF_0:.*]] = aie.buffer(%[[TILE_8_3]]) {sym_name = "fifo1_cons_buff_0_0"} : memref<1xi32>
-// CHECK-DAG:       %[[FIFO1_CONS_BUFF_1:.*]] = aie.buffer(%[[TILE_8_3]]) {sym_name = "fifo1_cons_buff_0_1"} : memref<1xi32>
-// CHECK-DAG:       %[[FIFO1_CONS_BUFF_2:.*]] = aie.buffer(%[[TILE_8_3]]) {sym_name = "fifo1_cons_buff_0_2"} : memref<1xi32>
-// CHECK-DAG:       %[[FIFO1_CONS_BUFF_3:.*]] = aie.buffer(%[[TILE_8_3]]) {sym_name = "fifo1_cons_buff_0_3"} : memref<1xi32>
-// CHECK-DAG:       %[[FIFO1_CONS_PROD_LOCK:.*]] = aie.lock(%[[TILE_8_3]]) {init = 4 : i8, sym_name = "fifo1_cons_prod_lock_0"}
-// CHECK-DAG:       %[[FIFO1_CONS_CONS_LOCK:.*]] = aie.lock(%[[TILE_8_3]]) {init = 0 : i8, sym_name = "fifo1_cons_cons_lock_0"}
-// CHECK-DAG:       %[[FIFO0_CONS_BUFF_0:.*]] = aie.buffer(%[[TILE_2_1]]) {sym_name = "fifo0_link_buff_0_0"} : memref<1xi32>
-// CHECK-DAG:       %[[FIFO0_CONS_BUFF_1:.*]] = aie.buffer(%[[TILE_2_1]]) {sym_name = "fifo0_link_buff_0_1"} : memref<1xi32>
-// CHECK-DAG:       %[[FIFO0_CONS_BUFF_2:.*]] = aie.buffer(%[[TILE_2_1]]) {sym_name = "fifo0_link_buff_0_2"} : memref<1xi32>
-// CHECK-DAG:       %[[FIFO0_CONS_BUFF_3:.*]] = aie.buffer(%[[TILE_2_1]]) {sym_name = "fifo0_link_buff_0_3"} : memref<1xi32>
-// CHECK-DAG:       %[[FIFO0_CONS_PROD_LOCK:.*]] = aie.lock(%[[TILE_2_1]]) {init = 4 : i8, sym_name = "fifo0_link_prod_lock_0"}
-// CHECK-DAG:       %[[FIFO0_CONS_CONS_LOCK:.*]] = aie.lock(%[[TILE_2_1]]) {init = 0 : i8, sym_name = "fifo0_link_cons_lock_0"}
-// CHECK-DAG:       %[[FIFO0_BUFF_0:.*]] = aie.buffer(%[[TILE_2_2]]) {sym_name = "fifo0_prod_buff_0_0"} : memref<1xi32>
-// CHECK-DAG:       %[[FIFO0_BUFF_1:.*]] = aie.buffer(%[[TILE_2_2]]) {sym_name = "fifo0_prod_buff_0_1"} : memref<1xi32>
-// CHECK-DAG:       %[[FIFO0_BUFF_2:.*]] = aie.buffer(%[[TILE_2_2]]) {sym_name = "fifo0_prod_buff_0_2"} : memref<1xi32>
-// CHECK-DAG:       %[[FIFO0_BUFF_3:.*]] = aie.buffer(%[[TILE_2_2]]) {sym_name = "fifo0_prod_buff_0_3"} : memref<1xi32>
-// CHECK-DAG:       %[[FIFO0_PROD_LOCK:.*]] = aie.lock(%[[TILE_2_2]]) {init = 4 : i8, sym_name = "fifo0_prod_prod_lock_0"}
-// CHECK-DAG:       %[[FIFO0_CONS_LOCK:.*]] = aie.lock(%[[TILE_2_2]]) {init = 0 : i8, sym_name = "fifo0_prod_cons_lock_0"}
-// CHECK-DAG:       %[[BUF83:.*]] = aie.buffer(%[[TILE_8_3]]) {sym_name = "buf83"} : memref<1xi32>
-// CHECK-DAG:       aie.flow(%[[TILE_2_2]], DMA : 0, %[[TILE_2_1]], DMA : 0)
-// CHECK-DAG:       aie.flow(%[[TILE_2_1]], DMA : 0, %[[TILE_8_3]], DMA : 0)
-// CHECK:           %[[CORE_2_2:.*]] = aie.core(%[[TILE_2_2]]) {
-// CHECK:             %[[C0:.*]] = arith.constant 0 : index
-// CHECK:             %[[C55_I32:.*]] = arith.constant 55 : i32
-// CHECK:             %[[C66_I32:.*]] = arith.constant 66 : i32
-// CHECK:             %[[C77_I32:.*]] = arith.constant 77 : i32
-// CHECK:             %[[C88_I32:.*]] = arith.constant 88 : i32
-// CHECK:             aie.use_lock(%[[FIFO0_PROD_LOCK]], AcquireGreaterEqual, 1)
-// CHECK:             memref.store %[[C55_I32]], %[[FIFO0_BUFF_0]]{{\[}}%[[C0]]] : memref<1xi32>
-// CHECK:             aie.use_lock(%[[FIFO0_CONS_LOCK]], Release, 1)
-// CHECK:             aie.use_lock(%[[FIFO0_PROD_LOCK]], AcquireGreaterEqual, 1)
-// CHECK:             memref.store %[[C66_I32]], %[[FIFO0_BUFF_1]]{{\[}}%[[C0]]] : memref<1xi32>
-// CHECK:             aie.use_lock(%[[FIFO0_CONS_LOCK]], Release, 1)
-// CHECK:             aie.use_lock(%[[FIFO0_PROD_LOCK]], AcquireGreaterEqual, 1)
-// CHECK:             memref.store %[[C77_I32]], %[[FIFO0_BUFF_2]]{{\[}}%[[C0]]] : memref<1xi32>
-// CHECK:             aie.use_lock(%[[FIFO0_CONS_LOCK]], Release, 1)
-// CHECK:             aie.use_lock(%[[FIFO0_PROD_LOCK]], AcquireGreaterEqual, 1)
-// CHECK:             memref.store %[[C88_I32]], %[[FIFO0_BUFF_3]]{{\[}}%[[C0]]] : memref<1xi32>
-// CHECK:             aie.use_lock(%[[FIFO0_CONS_LOCK]], Release, 1)
-// CHECK:             aie.end
-// CHECK:           }
-// CHECK:           %[[CORE_8_3:.*]] = aie.core(%[[TILE_8_3]]) {
-// CHECK:             %[[C0:.*]] = arith.constant 0 : index
-// CHECK:             %[[C1:.*]] = arith.constant 1 : index
-// CHECK:             %[[C2:.*]] = arith.constant 2 : index
-// CHECK:             %[[C3:.*]] = arith.constant 3 : index
-// CHECK:             aie.use_lock(%[[FIFO1_CONS_CONS_LOCK]], AcquireGreaterEqual, 1)
-// CHECK:             %[[VAL_0:.*]] = memref.load %[[FIFO1_CONS_BUFF_0]]{{\[}}%[[C0]]] : memref<1xi32>
-// CHECK:             memref.store %[[VAL_0]], %[[BUF83]]{{\[}}%[[C0]]] : memref<1xi32>
-// CHECK:             aie.use_lock(%[[FIFO1_CONS_PROD_LOCK]], Release, 1)
-// CHECK:             aie.use_lock(%[[FIFO1_CONS_CONS_LOCK]], AcquireGreaterEqual, 2)
-// CHECK:             %[[VAL_1:.*]] = memref.load %[[FIFO1_CONS_BUFF_1]]{{\[}}%[[C0]]] : memref<1xi32>
-// CHECK:             %[[VAL_2:.*]] = memref.load %[[FIFO1_CONS_BUFF_2]]{{\[}}%[[C0]]] : memref<1xi32>
-// CHECK:             memref.store %[[VAL_1]], %[[BUF83]]{{\[}}%[[C1]]] : memref<1xi32>
-// CHECK:             memref.store %[[VAL_2]], %[[BUF83]]{{\[}}%[[C2]]] : memref<1xi32>
-// CHECK:             aie.use_lock(%[[FIFO1_CONS_PROD_LOCK]], Release, 2)
-// CHECK:             aie.use_lock(%[[FIFO1_CONS_CONS_LOCK]], AcquireGreaterEqual, 1)
-// CHECK:             %[[VAL_3:.*]] = memref.load %[[FIFO1_CONS_BUFF_3]]{{\[}}%[[C0]]] : memref<1xi32>
-// CHECK:             memref.store %[[VAL_3]], %[[BUF83]]{{\[}}%[[C3]]] : memref<1xi32>
-// CHECK:             aie.use_lock(%[[FIFO1_CONS_PROD_LOCK]], Release, 1)
-// CHECK:             aie.end
-// CHECK:           }
-// CHECK:           %[[MEM_2_2:.*]] = aie.mem(%[[TILE_2_2]]) {
-// CHECK:             %[[VAL_4:.*]] = aie.dma_start(MM2S, 0, ^bb1, ^bb5)
-// CHECK:           ^bb1:
-// CHECK:             aie.use_lock(%[[FIFO0_CONS_LOCK]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[FIFO0_BUFF_0]] : memref<1xi32>) {len = 1 : i32}
-// CHECK:             aie.use_lock(%[[FIFO0_PROD_LOCK]], Release, 1)
-// CHECK:             aie.next_bd ^bb2
-// CHECK:           ^bb2:
-// CHECK:             aie.use_lock(%[[FIFO0_CONS_LOCK]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[FIFO0_BUFF_1]] : memref<1xi32>) {len = 1 : i32}
-// CHECK:             aie.use_lock(%[[FIFO0_PROD_LOCK]], Release, 1)
-// CHECK:             aie.next_bd ^bb3
-// CHECK:            ^bb3:
-// CHECK:             aie.use_lock(%[[FIFO0_CONS_LOCK]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[FIFO0_BUFF_2]] : memref<1xi32>) {len = 1 : i32}
-// CHECK:             aie.use_lock(%[[FIFO0_PROD_LOCK]], Release, 1)
-// CHECK:             aie.next_bd ^bb4
-// CHECK:           ^bb4:
-// CHECK:             aie.use_lock(%[[FIFO0_CONS_LOCK]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[FIFO0_BUFF_3]] : memref<1xi32>) {len = 1 : i32}
-// CHECK:             aie.use_lock(%[[FIFO0_PROD_LOCK]], Release, 1)
-// CHECK:             aie.next_bd ^bb1
-// CHECK:           ^bb5:
-// CHECK:             aie.end
-// CHECK:           }
-// CHECK:           %[[MEMTILE_DMA_2_1:.*]] = aie.memtile_dma(%[[TILE_2_1]]) {
-// CHECK:             %[[VAL_5:.*]] = aie.dma_start(S2MM, 0, ^bb1, ^bb5)
-// CHECK:           ^bb1:
-// CHECK:             aie.use_lock(%[[FIFO0_CONS_PROD_LOCK]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[FIFO0_CONS_BUFF_0]] : memref<1xi32>) {len = 1 : i32}
-// CHECK:             aie.use_lock(%[[FIFO0_CONS_CONS_LOCK]], Release, 1)
-// CHECK:             aie.next_bd ^bb2
-// CHECK:           ^bb2:
-// CHECK:             aie.use_lock(%[[FIFO0_CONS_PROD_LOCK]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[FIFO0_CONS_BUFF_1]] : memref<1xi32>) {len = 1 : i32}
-// CHECK:             aie.use_lock(%[[FIFO0_CONS_CONS_LOCK]], Release, 1)
-// CHECK:             aie.next_bd ^bb3
-// CHECK:           ^bb3:
-// CHECK:             aie.use_lock(%[[FIFO0_CONS_PROD_LOCK]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[FIFO0_CONS_BUFF_2]] : memref<1xi32>) {len = 1 : i32}
-// CHECK:             aie.use_lock(%[[FIFO0_CONS_CONS_LOCK]], Release, 1)
-// CHECK:             aie.next_bd ^bb4
-// CHECK:           ^bb4:
-// CHECK:             aie.use_lock(%[[FIFO0_CONS_PROD_LOCK]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[FIFO0_CONS_BUFF_3]] : memref<1xi32>) {len = 1 : i32}
-// CHECK:             aie.use_lock(%[[FIFO0_CONS_CONS_LOCK]], Release, 1)
-// CHECK:             aie.next_bd ^bb1
-// CHECK:           ^bb5:
-// CHECK:             %[[VAL_6:.*]] = aie.dma_start(MM2S, 0, ^bb6, ^bb10)
-// CHECK:           ^bb6:
-// CHECK:             aie.use_lock(%[[FIFO0_CONS_CONS_LOCK]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[FIFO0_CONS_BUFF_0]] : memref<1xi32>) {len = 1 : i32}
-// CHECK:             aie.use_lock(%[[FIFO0_CONS_PROD_LOCK]], Release, 1)
-// CHECK:             aie.next_bd ^bb7
-// CHECK:           ^bb7:
-// CHECK:             aie.use_lock(%[[FIFO0_CONS_CONS_LOCK]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[FIFO0_CONS_BUFF_1]] : memref<1xi32>) {len = 1 : i32}
-// CHECK:             aie.use_lock(%[[FIFO0_CONS_PROD_LOCK]], Release, 1)
-// CHECK:             aie.next_bd ^bb8
-// CHECK:           ^bb8:
-// CHECK:             aie.use_lock(%[[FIFO0_CONS_CONS_LOCK]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[FIFO0_CONS_BUFF_2]] : memref<1xi32>) {len = 1 : i32}
-// CHECK:             aie.use_lock(%[[FIFO0_CONS_PROD_LOCK]], Release, 1)
-// CHECK:             aie.next_bd ^bb9
-// CHECK:           ^bb9:
-// CHECK:             aie.use_lock(%[[FIFO0_CONS_CONS_LOCK]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[FIFO0_CONS_BUFF_3]] : memref<1xi32>) {len = 1 : i32}
-// CHECK:             aie.use_lock(%[[FIFO0_CONS_PROD_LOCK]], Release, 1)
-// CHECK:             aie.next_bd ^bb6
-// CHECK:           ^bb10:
-// CHECK:             aie.end
-// CHECK:           }
-// CHECK:           %[[MEM_8_3:.*]] = aie.mem(%[[TILE_8_3]]) {
-// CHECK:             %[[VAL_7:.*]] = aie.dma_start(S2MM, 0, ^bb1, ^bb5)
-// CHECK:           ^bb1:
-// CHECK:             aie.use_lock(%[[FIFO1_CONS_PROD_LOCK]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[FIFO1_CONS_BUFF_0]] : memref<1xi32>) {len = 1 : i32}
-// CHECK:             aie.use_lock(%[[FIFO1_CONS_CONS_LOCK]], Release, 1)
-// CHECK:             aie.next_bd ^bb2
-// CHECK:           ^bb2:
-// CHECK:             aie.use_lock(%[[FIFO1_CONS_PROD_LOCK]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[FIFO1_CONS_BUFF_1]] : memref<1xi32>) {len = 1 : i32}
-// CHECK:             aie.use_lock(%[[FIFO1_CONS_CONS_LOCK]], Release, 1)
-// CHECK:             aie.next_bd ^bb3
-// CHECK:           ^bb3:
-// CHECK:             aie.use_lock(%[[FIFO1_CONS_PROD_LOCK]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[FIFO1_CONS_BUFF_2]] : memref<1xi32>) {len = 1 : i32}
-// CHECK:             aie.use_lock(%[[FIFO1_CONS_CONS_LOCK]], Release, 1)
-// CHECK:             aie.next_bd ^bb4
-// CHECK:           ^bb4:
-// CHECK:             aie.use_lock(%[[FIFO1_CONS_PROD_LOCK]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[FIFO1_CONS_BUFF_3]] : memref<1xi32>) {len = 1 : i32}
-// CHECK:             aie.use_lock(%[[FIFO1_CONS_CONS_LOCK]], Release, 1)
-// CHECK:             aie.next_bd ^bb1
-// CHECK:           ^bb5:
-// CHECK:             aie.end
-// CHECK:           }
-// CHECK:         }
-
-module @aie2_cyclostatic_l2 {
-    aie.device(xcve2302) {
-        %tile22 = aie.tile(2, 2)  // producer tile
-        %memtile = aie.tile(2, 1) // mem tile
-        %tile83 = aie.tile(8, 3)  // consumer tile
-        %buf83  = aie.buffer(%tile83) {sym_name = "buf83"} : memref<1xi32>
-        aie.flow(%tile22, DMA : 0, %memtile, DMA : 0) {symbol = @fifo0}
-        aie.flow(%memtile, DMA : 0, %tile83, DMA : 0) {symbol = @fifo1}
-        // ObjectFifo that can hold 4 memref<1xi32>s, populated by tile22 and
-        // consumed by tile23
-        aie.objectfifo @fifo0 (%tile22, {%memtile}, 4 : i32) : !aie.objectfifo<memref<1xi32>>
-        aie.objectfifo @fifo1 (%memtile, {%tile83}, [4, 4]) : !aie.objectfifo<memref<1xi32>>
-        aie.objectfifo.link [@fifo0] -> [@fifo1] ([] [])
-        // Producer core
-        %core22 = aie.core(%tile22) {
-            %i0 = arith.constant 0 : index
-            %c55 = arith.constant 55 : i32
-            %c66 = arith.constant 66 : i32
-            %c77 = arith.constant 77 : i32
-            %c88 = arith.constant 88 : i32
-            // Push 55
-            %subview0 = aie.objectfifo.acquire @fifo0 (Produce, 1) : !aie.objectfifosubview<memref<1xi32>>
-            %subview0_obj = aie.objectfifo.subview.access %subview0[0] : !aie.objectfifosubview<memref<1xi32>> -> memref<1xi32>
-            memref.store %c55, %subview0_obj[%i0] : memref<1xi32>
-            aie.objectfifo.release @fifo0 (Produce, 1)
-            // Push 66
-            %subview1 = aie.objectfifo.acquire @fifo0 (Produce, 1) : !aie.objectfifosubview<memref<1xi32>>
-            %subview1_obj = aie.objectfifo.subview.access %subview1[0] : !aie.objectfifosubview<memref<1xi32>> -> memref<1xi32>
-            memref.store %c66, %subview1_obj[%i0] : memref<1xi32>
-            aie.objectfifo.release @fifo0 (Produce, 1)
-            // Push 77
-            %subview2 = aie.objectfifo.acquire @fifo0 (Produce, 1) : !aie.objectfifosubview<memref<1xi32>>
-            %subview2_obj = aie.objectfifo.subview.access %subview2[0] : !aie.objectfifosubview<memref<1xi32>> -> memref<1xi32>
-            memref.store %c77, %subview2_obj[%i0] : memref<1xi32>
-            aie.objectfifo.release @fifo0 (Produce, 1)
-            // Push 88
-            %subview3 = aie.objectfifo.acquire @fifo0 (Produce, 1) : !aie.objectfifosubview<memref<1xi32>>
-            %subview3_obj = aie.objectfifo.subview.access %subview3[0] : !aie.objectfifosubview<memref<1xi32>> -> memref<1xi32>
-            memref.store %c88, %subview3_obj[%i0] : memref<1xi32>
-            aie.objectfifo.release @fifo0 (Produce, 1)
-            aie.end
-        }
-        // Consumer core
-        %core28 = aie.core(%tile83) {
-            // Consumer pattern: {1, 2, 1}
-            %i0 = arith.constant 0 : index
-            %i1 = arith.constant 1 : index
-            %i2 = arith.constant 2 : index
-            %i3 = arith.constant 3 : index
-            // Pop 1 object off queue
-            %subview0 = aie.objectfifo.acquire @fifo1 (Consume, 1) : !aie.objectfifosubview<memref<1xi32>>
-            %subview0_obj = aie.objectfifo.subview.access %subview0[0] : !aie.objectfifosubview<memref<1xi32>> -> memref<1xi32>
-            %v55 = memref.load %subview0_obj[%i0] : memref<1xi32>
-            memref.store %v55, %buf83[%i0] : memref<1xi32>
-            aie.objectfifo.release @fifo1 (Consume, 1)
-            // Pop 2 objects off queue
-            %subview1 = aie.objectfifo.acquire @fifo1 (Consume, 2) : !aie.objectfifosubview<memref<1xi32>>
-            %subview1_obj0 = aie.objectfifo.subview.access %subview1[0] : !aie.objectfifosubview<memref<1xi32>> -> memref<1xi32>
-            %subview1_obj1 = aie.objectfifo.subview.access %subview1[1] : !aie.objectfifosubview<memref<1xi32>> -> memref<1xi32>
-            %v66 = memref.load %subview1_obj0[%i0] : memref<1xi32>
-            %v77 = memref.load %subview1_obj1[%i0] : memref<1xi32>
-            memref.store %v66, %buf83[%i1] : memref<1xi32>
-            memref.store %v77, %buf83[%i2] : memref<1xi32>
-            aie.objectfifo.release @fifo1 (Consume, 2)
-            // Pop 1 object off queue
-            %subview2 = aie.objectfifo.acquire @fifo1 (Consume, 1) : !aie.objectfifosubview<memref<1xi32>>
-            %subview2_obj = aie.objectfifo.subview.access %subview2[0] : !aie.objectfifosubview<memref<1xi32>> -> memref<1xi32>
-            %v88 = memref.load %subview2_obj[%i0] : memref<1xi32>
-            memref.store %v88, %buf83[%i3] : memref<1xi32>
-            aie.objectfifo.release @fifo1 (Consume, 1)
-            aie.end
-        }
-    }
-}
diff --git a/compiler/plugins/target/AMD-AIE/aie/test/AIE2_delayed_release.mlir b/compiler/plugins/target/AMD-AIE/aie/test/AIE2_delayed_release.mlir
deleted file mode 100644
index 902ae6250..000000000
--- a/compiler/plugins/target/AMD-AIE/aie/test/AIE2_delayed_release.mlir
+++ /dev/null
@@ -1,125 +0,0 @@
-
-// RUN: iree-opt --amdaie-objectFifo-stateful-transform %s | FileCheck %s
-
-// Tests objectFifo between cores, xfailing for now.
-// XFAIL: *
-// CHECK-LABEL:   aie.device(xcve2302) {
-// CHECK:           memref.global "public" @fifo : memref<i32>
-// CHECK:           %[[TILE_2_2:.*]] = aie.tile(2, 2)
-// CHECK:           %[[TILE_2_3:.*]] = aie.tile(2, 3)
-// CHECK:           %[[FIFO_BUFF_0:.*]] = aie.buffer(%[[TILE_2_2]]) {sym_name = "fifo_buff_0"} : memref<i32>
-// CHECK:           %[[FIFO_BUFF_1:.*]] = aie.buffer(%[[TILE_2_2]]) {sym_name = "fifo_buff_1"} : memref<i32>
-// CHECK:           %[[FIFO_BUFF_2:.*]] = aie.buffer(%[[TILE_2_2]]) {sym_name = "fifo_buff_2"} : memref<i32>
-// CHECK:           %[[FIFO_BUFF_3:.*]] = aie.buffer(%[[TILE_2_2]]) {sym_name = "fifo_buff_3"} : memref<i32>
-// CHECK:           %[[FIFO_PROD_LOCK:.*]] = aie.lock(%[[TILE_2_2]]) {init = 4 : i8, sym_name = "fifo_prod_lock"}
-// CHECK:           %[[FIFO_CONS_LOCK:.*]] = aie.lock(%[[TILE_2_2]]) {init = 0 : i8, sym_name = "fifo_cons_lock"}
-// CHECK:           %[[BUF23:.*]] = aie.buffer(%[[TILE_2_3]]) {sym_name = "buf23"} : memref<4xi32>
-// CHECK:           %[[CORE_2_2:.*]] = aie.core(%[[TILE_2_2]]) {
-// CHECK:             %[[C99_I32:.*]] = arith.constant 99 : i32
-// CHECK:             %[[C0:.*]] = arith.constant 0 : index
-// CHECK:             %[[C1:.*]] = arith.constant 1 : index
-// CHECK:             %[[C4:.*]] = arith.constant 4 : index
-// CHECK:             aie.use_lock(%[[FIFO_PROD_LOCK]], AcquireGreaterEqual, 1)
-// CHECK:             memref.store %[[C99_I32]], %[[FIFO_BUFF_0]][] : memref<i32>
-// CHECK:             aie.use_lock(%[[FIFO_CONS_LOCK]], Release, 1)
-// CHECK:             aie.use_lock(%[[FIFO_PROD_LOCK]], AcquireGreaterEqual, 1)
-// CHECK:             memref.store %[[C99_I32]], %[[FIFO_BUFF_1]][] : memref<i32>
-// CHECK:             aie.use_lock(%[[FIFO_CONS_LOCK]], Release, 1)
-// CHECK:             aie.use_lock(%[[FIFO_PROD_LOCK]], AcquireGreaterEqual, 1)
-// CHECK:             memref.store %[[C99_I32]], %[[FIFO_BUFF_2]][] : memref<i32>
-// CHECK:             aie.use_lock(%[[FIFO_CONS_LOCK]], Release, 1)
-// CHECK:             aie.use_lock(%[[FIFO_PROD_LOCK]], AcquireGreaterEqual, 1)
-// CHECK:             memref.store %[[C99_I32]], %[[FIFO_BUFF_3]][] : memref<i32>
-// CHECK:             aie.use_lock(%[[FIFO_CONS_LOCK]], Release, 1)
-// CHECK:             aie.end
-// CHECK:           }
-// CHECK:           %[[CORE_2_3:.*]] = aie.core(%[[TILE_2_3]]) {
-// CHECK:             %[[C0:.*]] = arith.constant 0 : index
-// CHECK:             %[[C1:.*]] = arith.constant 1 : index
-// CHECK:             %[[C2:.*]] = arith.constant 2 : index
-// CHECK:             %[[C3:.*]] = arith.constant 3 : index
-// CHECK:             aie.use_lock(%[[FIFO_CONS_LOCK]], AcquireGreaterEqual, 2)
-// CHECK:             %[[VAL_0:.*]] = memref.load %[[FIFO_BUFF_0]][] : memref<i32>
-// CHECK:             memref.store %[[VAL_0]], %[[BUF23]]{{\[}}%[[C0]]] : memref<4xi32>
-// CHECK:             %[[VAL_1:.*]] = memref.load %[[FIFO_BUFF_0]][] : memref<i32>
-// CHECK:             memref.store %[[VAL_1]], %[[BUF23]]{{\[}}%[[C1]]] : memref<4xi32>
-// CHECK:             aie.use_lock(%[[FIFO_CONS_LOCK]], AcquireGreaterEqual, 1)
-// CHECK:             %[[VAL_2:.*]] = memref.load %[[FIFO_BUFF_0]][] : memref<i32>
-// CHECK:             memref.store %[[VAL_2]], %[[BUF23]]{{\[}}%[[C2]]] : memref<4xi32>
-// CHECK:             %[[VAL_3:.*]] = memref.load %[[FIFO_BUFF_0]][] : memref<i32>
-// CHECK:             memref.store %[[VAL_3]], %[[BUF23]]{{\[}}%[[C3]]] : memref<4xi32>
-// CHECK:             aie.use_lock(%[[FIFO_PROD_LOCK]], Release, 3)
-// CHECK:             aie.end
-// CHECK:           }
-// CHECK:         }
-module @AIE2_delayed_release {
-    aie.device(xcve2302) {
-        %tile22 = aie.tile(2, 2)
-        %tile23 = aie.tile(2, 3)
-        %buf23 = aie.buffer(%tile23) {sym_name = "buf23"} : memref<4xi32>
-        aie.flow(%tile22, DMA : 0, %tile23, DMA : 0) {symbol = @fifo}
-        aie.objectfifo @fifo (%tile22, {%tile23}, 4 : i32) : !aie.objectfifo<memref<i32>>
-        // Producer -- produces one element at a time
-        %core22 = aie.core(%tile22) {
-            %c99 = arith.constant 99 : i32
-            %i0 = arith.constant 0 : index
-            %i1 = arith.constant 1 : index
-            %i4 = arith.constant 4 : index
-            // Produce one 1 element (acquire producer lock) ...
-            %subview0 = aie.objectfifo.acquire @fifo (Produce, 1) : !aie.objectfifosubview<memref<i32>>
-            %subview_obj0 = aie.objectfifo.subview.access %subview0[0] : !aie.objectfifosubview<memref<i32>> -> memref<i32>
-            memref.store %c99, %subview_obj0[] : memref<i32>
-            aie.objectfifo.release @fifo (Produce, 1)
-            // ... done producing (release consumer lock)
-            // Produce one 1 element (acquire producer lock) ...
-            %subview1 = aie.objectfifo.acquire @fifo (Produce, 1) : !aie.objectfifosubview<memref<i32>>
-            %subview_obj1 = aie.objectfifo.subview.access %subview1[0] : !aie.objectfifosubview<memref<i32>> -> memref<i32>
-            memref.store %c99, %subview_obj1[] : memref<i32>
-            aie.objectfifo.release @fifo (Produce, 1)
-            // ... done producing (release consumer lock)
-            // Produce one 1 element (acquire producer lock) ...
-            %subview2 = aie.objectfifo.acquire @fifo (Produce, 1) : !aie.objectfifosubview<memref<i32>>
-            %subview_obj2 = aie.objectfifo.subview.access %subview2[0] : !aie.objectfifosubview<memref<i32>> -> memref<i32>
-            memref.store %c99, %subview_obj2[] : memref<i32>
-            aie.objectfifo.release @fifo (Produce, 1)
-            // ... done producing (release consumer lock)
-            // Produce one 1 element (acquire producer lock) ...
-            %subview3 = aie.objectfifo.acquire @fifo (Produce, 1) : !aie.objectfifosubview<memref<i32>>
-            %subview_obj3 = aie.objectfifo.subview.access %subview3[0] : !aie.objectfifosubview<memref<i32>> -> memref<i32>
-            memref.store %c99, %subview_obj3[] : memref<i32>
-            aie.objectfifo.release @fifo (Produce, 1)
-            // ... done producing (release consumer lock)
-            aie.end
-        }
-        // Consumer -- consumes {2, 1, 3, 1}; releases {0, 0, 0, 2}
-        %core23 = aie.core(%tile23) {
-            %i0 = arith.constant 0 : index
-            %i1 = arith.constant 1 : index
-            %i2 = arith.constant 2 : index
-            %i3 = arith.constant 3 : index
-            // Begin consuming 2 elements (acquire consumer lock with value 2)
-            %subview0 = aie.objectfifo.acquire @fifo (Consume, 2) : !aie.objectfifosubview<memref<i32>>
-            %subview0_obj = aie.objectfifo.subview.access %subview0[0] : !aie.objectfifosubview<memref<i32>> -> memref<i32>
-            %v0 = memref.load %subview0_obj[] : memref<i32>
-            memref.store %v0, %buf23[%i0] : memref<4xi32>
-            // For the next step, we only need one element (this could be a subroutine that acquires 1, not knowing that we already acquired 2)
-            %subview1 = aie.objectfifo.acquire @fifo (Consume, 1) : !aie.objectfifosubview<memref<i32>>
-            %subview1_obj = aie.objectfifo.subview.access %subview1[0] : !aie.objectfifosubview<memref<i32>> -> memref<i32>
-            %v1 = memref.load %subview1_obj[] : memref<i32>
-            memref.store %v1, %buf23[%i1] : memref<4xi32>
-            // Actually, give us the two from before and one more for three objects total (consumer lock should increase by one)
-            %subview2 = aie.objectfifo.acquire @fifo (Consume, 3) : !aie.objectfifosubview<memref<i32>>
-            %subview2_obj = aie.objectfifo.subview.access %subview2[0] : !aie.objectfifosubview<memref<i32>> -> memref<i32>
-            %v2 = memref.load %subview2_obj[] : memref<i32>
-            memref.store %v2, %buf23[%i2] : memref<4xi32>
-            // Now let's just work on one element (consumer lock should not change value)
-            %subview3 = aie.objectfifo.acquire @fifo (Consume, 1) : !aie.objectfifosubview<memref<i32>>
-            %subview3_obj = aie.objectfifo.subview.access %subview3[0] : !aie.objectfifosubview<memref<i32>> -> memref<i32>
-            %v3 = memref.load %subview3_obj[] : memref<i32>
-            memref.store %v3, %buf23[%i3] : memref<4xi32>
-            // Done, let's release everything we hold (we hold 3 objects from our max acquire)
-            aie.objectfifo.release @fifo (Consume, 3)
-            aie.end
-        }
-    }
-}
diff --git a/compiler/plugins/target/AMD-AIE/aie/test/assign-lockIDs.mlir b/compiler/plugins/target/AMD-AIE/aie/test/assign-lockIDs.mlir
deleted file mode 100644
index 3bcf4ce7b..000000000
--- a/compiler/plugins/target/AMD-AIE/aie/test/assign-lockIDs.mlir
+++ /dev/null
@@ -1,129 +0,0 @@
-
-// RUN: iree-opt --amdaie-assign-lock-ids --split-input-file %s | FileCheck %s
-
-// CHECK-LABEL:   aie.device(xcvc1902) {
-// CHECK:           %[[TILE_2_2:.*]] = aie.tile(2, 2)
-// CHECK:           %[[TILE_2_3:.*]] = aie.tile(2, 3)
-// CHECK:           %[[TILE_3_3:.*]] = aie.tile(3, 3)
-// CHECK:           %[[TILE_3_4:.*]] = aie.tile(3, 4)
-// CHECK:           %[[LOCK_2_2:.*]] = aie.lock(%[[TILE_2_2]], 0)
-// CHECK:           %[[LOCK_2_2_0:.*]] = aie.lock(%[[TILE_2_2]], 2)
-// CHECK:           %[[LOCK_2_2_1:.*]] = aie.lock(%[[TILE_2_2]], 1)
-// CHECK:           %[[LOCK_2_3:.*]] = aie.lock(%[[TILE_2_3]], 0)
-// CHECK:           %[[LOCK_2_3_2:.*]] = aie.lock(%[[TILE_2_3]], 1)
-// CHECK:           %[[LOCK_2_3_3:.*]] = aie.lock(%[[TILE_2_3]], 4)
-// CHECK:           %[[LOCK_2_3_4:.*]] = aie.lock(%[[TILE_2_3]], 2)
-// CHECK:           %[[LOCK_2_3_5:.*]] = aie.lock(%[[TILE_2_3]], 3)
-// CHECK:           %[[LOCK_2_3_6:.*]] = aie.lock(%[[TILE_2_3]], 5)
-// CHECK:           %[[LOCK_2_3_7:.*]] = aie.lock(%[[TILE_2_3]], 6)
-// CHECK:           %[[LOCK_2_3_8:.*]] = aie.lock(%[[TILE_2_3]], 7)
-// CHECK:           %[[LOCK_2_3_9:.*]] = aie.lock(%[[TILE_2_3]], 10)
-// CHECK:           %[[LOCK_2_3_10:.*]] = aie.lock(%[[TILE_2_3]], 11)
-// CHECK:           %[[LOCK_2_3_11:.*]] = aie.lock(%[[TILE_2_3]], 8)
-// CHECK:           %[[LOCK_2_3_12:.*]] = aie.lock(%[[TILE_2_3]], 9)
-// CHECK:           %[[LOCK_2_3_13:.*]] = aie.lock(%[[TILE_2_3]], 12)
-// CHECK:           %[[LOCK_2_3_14:.*]] = aie.lock(%[[TILE_2_3]], 13)
-// CHECK:           %[[LOCK_2_3_15:.*]] = aie.lock(%[[TILE_2_3]], 14)
-// CHECK:           %[[LOCK_2_3_16:.*]] = aie.lock(%[[TILE_2_3]], 15)
-// CHECK:           %[[LOCK_3_3:.*]] = aie.lock(%[[TILE_3_3]], 0)
-// CHECK:           %[[LOCK_3_3_17:.*]] = aie.lock(%[[TILE_3_3]], 1)
-// CHECK:           %[[LOCK_3_3_18:.*]] = aie.lock(%[[TILE_3_3]], 9)
-// CHECK:           %[[LOCK_3_3_19:.*]] = aie.lock(%[[TILE_3_3]], 2)
-// CHECK:           %[[LOCK_3_4:.*]] = aie.lock(%[[TILE_3_4]], 0)
-// CHECK:           %[[LOCK_3_4_20:.*]] = aie.lock(%[[TILE_3_4]], 1)
-// CHECK:           %[[LOCK_3_4_21:.*]] = aie.lock(%[[TILE_3_4]], 2)
-// CHECK:           %[[LOCK_3_4_22:.*]] = aie.lock(%[[TILE_3_4]], 3)
-// CHECK:           %[[TILE_6_0:.*]] = aie.tile(6, 0)
-// CHECK:           %[[LOCK_6_0:.*]] = aie.lock(%[[TILE_6_0]], 0)
-// CHECK:         }
-
-module @test_assign_lockIDs {
- aie.device(xcvc1902) {
-  %t22 = aie.tile(2, 2)
-  %t23 = aie.tile(2, 3)
-  %t33 = aie.tile(3, 3)
-  %t34 = aie.tile(3, 4)
-  %l22_0 = aie.lock(%t22, 0)
-  %l22_2 = aie.lock(%t22, 2)
-  %l22_1 = aie.lock(%t22)
-  %l23_0 = aie.lock(%t23)
-  %l23_1 = aie.lock(%t23)
-  %l23_4 = aie.lock(%t23, 4)
-  %l23_2 = aie.lock(%t23)
-  %l23_3 = aie.lock(%t23)
-  %l23_5 = aie.lock(%t23)
-  %l23_6 = aie.lock(%t23)
-  %l23_7 = aie.lock(%t23)
-  %l23_10 = aie.lock(%t23)
-  %l23_11 = aie.lock(%t23)
-  %l23_8 = aie.lock(%t23, 8)
-  %l23_9 = aie.lock(%t23, 9)
-  %l23_12 = aie.lock(%t23)
-  %l23_13 = aie.lock(%t23)
-  %l23_14 = aie.lock(%t23)
-  %l23_15 = aie.lock(%t23)
-  %l33_0 = aie.lock(%t33, 0)
-  %l33_1 = aie.lock(%t33)
-  %l33_9 = aie.lock(%t33, 9)
-  %l33_2 = aie.lock(%t33)
-  %l34_0 = aie.lock(%t34)
-  %l34_1 = aie.lock(%t34)
-  %l34_2 = aie.lock(%t34)
-  %l34_3 = aie.lock(%t34)
-  %t60 = aie.tile(6, 0)
-  %l60 = aie.lock(%t60)
- }
-}
-
-// -----
-
-// CHECK-LABEL:   aie.device(xcve2802) {
-// CHECK:           %[[TILE_1_1:.*]] = aie.tile(1, 1)
-// CHECK:           %[[LOCK_1_1:.*]] = aie.lock(%[[TILE_1_1]], 1)
-// CHECK:           %[[LOCK_1_1_0:.*]] = aie.lock(%[[TILE_1_1]], 0)
-// CHECK:           %[[LOCK_1_1_1:.*]] = aie.lock(%[[TILE_1_1]], 3)
-// CHECK:           %[[LOCK_1_1_2:.*]] = aie.lock(%[[TILE_1_1]], 4)
-// CHECK:           %[[LOCK_1_1_3:.*]] = aie.lock(%[[TILE_1_1]], 5)
-// CHECK:           %[[LOCK_1_1_4:.*]] = aie.lock(%[[TILE_1_1]], 6)
-// CHECK:           %[[LOCK_1_1_5:.*]] = aie.lock(%[[TILE_1_1]], 7)
-// CHECK:           %[[LOCK_1_1_6:.*]] = aie.lock(%[[TILE_1_1]], 8)
-// CHECK:           %[[LOCK_1_1_7:.*]] = aie.lock(%[[TILE_1_1]], 9)
-// CHECK:           %[[LOCK_1_1_8:.*]] = aie.lock(%[[TILE_1_1]], 10)
-// CHECK:           %[[LOCK_1_1_9:.*]] = aie.lock(%[[TILE_1_1]], 11)
-// CHECK:           %[[LOCK_1_1_10:.*]] = aie.lock(%[[TILE_1_1]], 12)
-// CHECK:           %[[LOCK_1_1_11:.*]] = aie.lock(%[[TILE_1_1]], 13)
-// CHECK:           %[[LOCK_1_1_12:.*]] = aie.lock(%[[TILE_1_1]], 14)
-// CHECK:           %[[LOCK_1_1_13:.*]] = aie.lock(%[[TILE_1_1]], 33)
-// CHECK:           %[[LOCK_1_1_14:.*]] = aie.lock(%[[TILE_1_1]], 15)
-// CHECK:           %[[LOCK_1_1_15:.*]] = aie.lock(%[[TILE_1_1]], 16)
-// CHECK:           %[[LOCK_1_1_16:.*]] = aie.lock(%[[TILE_1_1]], 17)
-// CHECK:           %[[LOCK_1_1_17:.*]] = aie.lock(%[[TILE_1_1]], 18)
-// CHECK:           %[[LOCK_1_1_18:.*]] = aie.lock(%[[TILE_1_1]], 2)
-// CHECK:         }
-
-module @memTileTest {
-  aie.device(xcve2802) {
-    // Memory tiles on xcve have 64 locks.
-    %tmemtile = aie.tile(1,1)
-    %l0 = aie.lock(%tmemtile, 1)
-    %l1 = aie.lock(%tmemtile, 0)
-    %l2 = aie.lock(%tmemtile)
-    %l3 = aie.lock(%tmemtile)
-    %l4 = aie.lock(%tmemtile)
-    %l5 = aie.lock(%tmemtile)
-    %l6 = aie.lock(%tmemtile)
-    %l7 = aie.lock(%tmemtile)
-    %l8 = aie.lock(%tmemtile)
-    %l9 = aie.lock(%tmemtile)
-    %l10 = aie.lock(%tmemtile)
-    %l11 = aie.lock(%tmemtile)
-    %l12 = aie.lock(%tmemtile)
-    %l13 = aie.lock(%tmemtile)
-    %l14 = aie.lock(%tmemtile,33)
-    %l15 = aie.lock(%tmemtile)
-    %l16 = aie.lock(%tmemtile)
-    %l17 = aie.lock(%tmemtile)
-    %l18 = aie.lock(%tmemtile)
-    %l19 = aie.lock(%tmemtile,2)
-  }
-}
diff --git a/compiler/plugins/target/AMD-AIE/aie/test/base_test_AIE1.mlir b/compiler/plugins/target/AMD-AIE/aie/test/base_test_AIE1.mlir
deleted file mode 100644
index 205bfeea0..000000000
--- a/compiler/plugins/target/AMD-AIE/aie/test/base_test_AIE1.mlir
+++ /dev/null
@@ -1,123 +0,0 @@
-
-// RUN: iree-opt --amdaie-objectFifo-stateful-transform %s | FileCheck %s
-
-// CHECK-LABEL:   aie.device(npu1_4col) {
-// CHECK:           memref.global "public" @of1 : memref<16xi32>
-// CHECK:           memref.global "public" @of0 : memref<16xi32>
-// CHECK-DAG:       %[[TILE_1_2:.*]] = aie.tile(1, 2)
-// CHECK-DAG:       %[[TILE_1_3:.*]] = aie.tile(1, 3)
-// CHECK-DAG:       %[[TILE_3_3:.*]] = aie.tile(3, 3)
-// CHECK-DAG:       %[[BUFFER_1_2:.*]] = aie.buffer(%[[TILE_1_2]]) {sym_name = "of1_prod_buff_0_0"} : memref<16xi32>
-// CHECK-DAG:       %[[BUFFER_1_2_0:.*]] = aie.buffer(%[[TILE_1_2]]) {sym_name = "of1_prod_buff_0_1"} : memref<16xi32>
-// CHECK-DAG:       %[[LOCK_1_2:.*]] = aie.lock(%[[TILE_1_2]]) {init = 2 : i8, sym_name = "of1_prod_prod_lock_0"}
-// CHECK-DAG:       %[[LOCK_1_2_1:.*]] = aie.lock(%[[TILE_1_2]]) {init = 0 : i8, sym_name = "of1_prod_cons_lock_0"}
-// CHECK-DAG:       %[[BUFFER_3_3:.*]] = aie.buffer(%[[TILE_3_3]]) {sym_name = "of1_cons_buff_0_0"} : memref<16xi32>
-// CHECK-DAG:       %[[BUFFER_3_3_2:.*]] = aie.buffer(%[[TILE_3_3]]) {sym_name = "of1_cons_buff_0_1"} : memref<16xi32>
-// CHECK-DAG:       %[[LOCK_3_3:.*]] = aie.lock(%[[TILE_3_3]]) {init = 2 : i8, sym_name = "of1_cons_prod_lock_0"}
-// CHECK-DAG:       %[[LOCK_3_3_3:.*]] = aie.lock(%[[TILE_3_3]]) {init = 0 : i8, sym_name = "of1_cons_cons_lock_0"}
-// CHECK-DAG:       %[[BUFFER_1_2_4:.*]] = aie.buffer(%[[TILE_1_2]]) {sym_name = "of0_prod_buff_0_0"} : memref<16xi32>
-// CHECK-DAG:       %[[BUFFER_1_2_5:.*]] = aie.buffer(%[[TILE_1_2]]) {sym_name = "of0_prod_buff_0_1"} : memref<16xi32>
-// CHECK-DAG:       %[[BUFFER_1_2_6:.*]] = aie.buffer(%[[TILE_1_2]]) {sym_name = "of0_prod_buff_0_2"} : memref<16xi32>
-// CHECK-DAG:       %[[BUFFER_1_2_7:.*]] = aie.buffer(%[[TILE_1_2]]) {sym_name = "of0_prod_buff_0_3"} : memref<16xi32>
-// CHECK-DAG:       %[[LOCK_1_2_8:.*]] = aie.lock(%[[TILE_1_2]]) {init = 4 : i8, sym_name = "of0_prod_prod_lock_0"}
-// CHECK-DAG:       %[[LOCK_1_2_9:.*]] = aie.lock(%[[TILE_1_2]]) {init = 0 : i8, sym_name = "of0_prod_cons_lock_0"}
-// CHECK-DAG:       %[[BUFFER_1_3:.*]] = aie.buffer(%[[TILE_1_3]]) {sym_name = "of0_cons_buff_0_0"} : memref<16xi32>
-// CHECK-DAG:       %[[BUFFER_1_3_10:.*]] = aie.buffer(%[[TILE_1_3]]) {sym_name = "of0_cons_buff_0_1"} : memref<16xi32>
-// CHECK-DAG:       %[[BUFFER_1_3_11:.*]] = aie.buffer(%[[TILE_1_3]]) {sym_name = "of0_cons_buff_0_2"} : memref<16xi32>
-// CHECK-DAG:       %[[BUFFER_1_3_12:.*]] = aie.buffer(%[[TILE_1_3]]) {sym_name = "of0_cons_buff_0_3"} : memref<16xi32>
-// CHECK-DAG:       %[[LOCK_1_3:.*]] = aie.lock(%[[TILE_1_3]]) {init = 4 : i8, sym_name = "of0_cons_prod_lock_0"}
-// CHECK-DAG:       %[[LOCK_1_3_13:.*]] = aie.lock(%[[TILE_1_3]]) {init = 0 : i8, sym_name = "of0_cons_cons_lock_0"}
-// CHECK-DAG:       aie.flow(%[[TILE_1_2]], DMA : 0, %[[TILE_3_3]], DMA : 0) {symbol = @of1}
-// CHECK-DAG:       aie.flow(%[[TILE_1_2]], DMA : 1, %[[TILE_1_3]], DMA : 0) {symbol = @of0}
-// CHECK:           %[[MEM_1_2:.*]] = aie.mem(%[[TILE_1_2]]) {
-// CHECK:             %[[VAL_0:.*]] = aie.dma_start(MM2S, 1, ^bb1, ^bb5)
-// CHECK:           ^bb1:
-// CHECK:             aie.use_lock(%[[LOCK_1_2_9]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[BUFFER_1_2_4]] : memref<16xi32>) {len = 16 : i32}
-// CHECK:             aie.use_lock(%[[LOCK_1_2_8]], Release, 1)
-// CHECK:             aie.next_bd ^bb2
-// CHECK:           ^bb2:
-// CHECK:             aie.use_lock(%[[LOCK_1_2_9]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[BUFFER_1_2_5]] : memref<16xi32>) {len = 16 : i32}
-// CHECK:             aie.use_lock(%[[LOCK_1_2_8]], Release, 1)
-// CHECK:             aie.next_bd ^bb3
-// CHECK:           ^bb3:
-// CHECK:             aie.use_lock(%[[LOCK_1_2_9]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[BUFFER_1_2_6]] : memref<16xi32>) {len = 16 : i32}
-// CHECK:             aie.use_lock(%[[LOCK_1_2_8]], Release, 1)
-// CHECK:             aie.next_bd ^bb4
-// CHECK:           ^bb4:
-// CHECK:             aie.use_lock(%[[LOCK_1_2_9]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[BUFFER_1_2_7]] : memref<16xi32>) {len = 16 : i32}
-// CHECK:             aie.use_lock(%[[LOCK_1_2_8]], Release, 1)
-// CHECK:             aie.next_bd ^bb1
-// CHECK:           ^bb5:
-// CHECK:             %[[VAL_1:.*]] = aie.dma_start(MM2S, 0, ^bb6, ^bb8)
-// CHECK:           ^bb6:
-// CHECK:             aie.use_lock(%[[LOCK_1_2_1]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[BUFFER_1_2]] : memref<16xi32>) {len = 16 : i32}
-// CHECK:             aie.use_lock(%[[LOCK_1_2]], Release, 1)
-// CHECK:             aie.next_bd ^bb7
-// CHECK:           ^bb7:
-// CHECK:             aie.use_lock(%[[LOCK_1_2_1]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[BUFFER_1_2_0]] : memref<16xi32>) {len = 16 : i32}
-// CHECK:             aie.use_lock(%[[LOCK_1_2]], Release, 1)
-// CHECK:             aie.next_bd ^bb6
-// CHECK:           ^bb8:
-// CHECK:             aie.end
-// CHECK:           }
-// CHECK:           %[[MEM_1_3:.*]] = aie.mem(%[[TILE_1_3]]) {
-// CHECK:             %[[VAL_2:.*]] = aie.dma_start(S2MM, 0, ^bb1, ^bb5)
-// CHECK:           ^bb1:
-// CHECK:             aie.use_lock(%[[LOCK_1_3]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[BUFFER_1_3]] : memref<16xi32>) {len = 16 : i32}
-// CHECK:             aie.use_lock(%[[LOCK_1_3_13]], Release, 1)
-// CHECK:             aie.next_bd ^bb2
-// CHECK:           ^bb2:
-// CHECK:             aie.use_lock(%[[LOCK_1_3]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[BUFFER_1_3_10]] : memref<16xi32>) {len = 16 : i32}
-// CHECK:             aie.use_lock(%[[LOCK_1_3_13]], Release, 1)
-// CHECK:             aie.next_bd ^bb3
-// CHECK:           ^bb3:
-// CHECK:             aie.use_lock(%[[LOCK_1_3]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[BUFFER_1_3_11]] : memref<16xi32>) {len = 16 : i32}
-// CHECK:             aie.use_lock(%[[LOCK_1_3_13]], Release, 1)
-// CHECK:             aie.next_bd ^bb4
-// CHECK:           ^bb4:
-// CHECK:             aie.use_lock(%[[LOCK_1_3]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[BUFFER_1_3_12]] : memref<16xi32>) {len = 16 : i32}
-// CHECK:             aie.use_lock(%[[LOCK_1_3_13]], Release, 1)
-// CHECK:             aie.next_bd ^bb1
-// CHECK:           ^bb5:
-// CHECK:             aie.end
-// CHECK:           }
-// CHECK:           %[[MEM_3_3:.*]] = aie.mem(%[[TILE_3_3]]) {
-// CHECK:             %[[VAL_3:.*]] = aie.dma_start(S2MM, 0, ^bb1, ^bb3)
-// CHECK:           ^bb1:
-// CHECK:             aie.use_lock(%[[LOCK_3_3]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[BUFFER_3_3]] : memref<16xi32>) {len = 16 : i32}
-// CHECK:             aie.use_lock(%[[LOCK_3_3_3]], Release, 1)
-// CHECK:             aie.next_bd ^bb2
-// CHECK:           ^bb2:
-// CHECK:             aie.use_lock(%[[LOCK_3_3]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[BUFFER_3_3_2]] : memref<16xi32>) {len = 16 : i32}
-// CHECK:             aie.use_lock(%[[LOCK_3_3_3]], Release, 1)
-// CHECK:             aie.next_bd ^bb1
-// CHECK:           ^bb3:
-// CHECK:             aie.end
-// CHECK:           }
-// CHECK:         }
-module @elementGenerationAIE1 {
-   aie.device(npu1_4col) {
-      %tile12 = aie.tile(1, 2)
-      %tile13 = aie.tile(1, 3)
-      %tile33 = aie.tile(3, 3)
-      aie.flow(%tile12, DMA : 0, %tile33, DMA : 0) {symbol = @of1}
-      aie.flow(%tile12, DMA : 1, %tile13, DMA : 0) {symbol = @of0}
-      // In the shared memory case, the number of elements does not change.
-      aie.objectfifo @of0 (%tile12, {%tile13}, 4 : i32) : !aie.objectfifo<memref<16xi32>>
-      // In the non-adjacent memory case, the number of elements depends on the max amount acquired by
-      // the processes running on each core (here nothing is specified so it cannot be derived).
-      aie.objectfifo @of1 (%tile12, {%tile33}, 2 : i32) : !aie.objectfifo<memref<16xi32>>
-   }
-}
diff --git a/compiler/plugins/target/AMD-AIE/aie/test/base_test_AIE2.mlir b/compiler/plugins/target/AMD-AIE/aie/test/base_test_AIE2.mlir
deleted file mode 100644
index 160c2b596..000000000
--- a/compiler/plugins/target/AMD-AIE/aie/test/base_test_AIE2.mlir
+++ /dev/null
@@ -1,123 +0,0 @@
-
-// RUN: iree-opt --amdaie-objectFifo-stateful-transform %s | FileCheck %s
-
-// CHECK-LABEL:   aie.device(xcve2302) {
-// CHECK:           memref.global "public" @of1 : memref<16xi32>
-// CHECK:           memref.global "public" @of0 : memref<16xi32>
-// CHECK-DAG:       %[[TILE_1_2:.*]] = aie.tile(1, 2)
-// CHECK-DAG:       %[[TILE_1_3:.*]] = aie.tile(1, 3)
-// CHECK-DAG:       %[[TILE_3_3:.*]] = aie.tile(3, 3)
-// CHECK-DAG:       %[[BUFFER_1_2:.*]] = aie.buffer(%[[TILE_1_2]]) {sym_name = "of1_prod_buff_0_0"} : memref<16xi32>
-// CHECK-DAG:       %[[BUFFER_1_2_0:.*]] = aie.buffer(%[[TILE_1_2]]) {sym_name = "of1_prod_buff_0_1"} : memref<16xi32>
-// CHECK-DAG:       %[[LOCK_1_2:.*]] = aie.lock(%[[TILE_1_2]]) {init = 2 : i8, sym_name = "of1_prod_prod_lock_0"}
-// CHECK-DAG:       %[[LOCK_1_2_1:.*]] = aie.lock(%[[TILE_1_2]]) {init = 0 : i8, sym_name = "of1_prod_cons_lock_0"}
-// CHECK-DAG:       %[[BUFFER_3_3:.*]] = aie.buffer(%[[TILE_3_3]]) {sym_name = "of1_cons_buff_0_0"} : memref<16xi32>
-// CHECK-DAG:       %[[BUFFER_3_3_2:.*]] = aie.buffer(%[[TILE_3_3]]) {sym_name = "of1_cons_buff_0_1"} : memref<16xi32>
-// CHECK-DAG:       %[[LOCK_3_3:.*]] = aie.lock(%[[TILE_3_3]]) {init = 2 : i8, sym_name = "of1_cons_prod_lock_0"}
-// CHECK-DAG:       %[[LOCK_3_3_3:.*]] = aie.lock(%[[TILE_3_3]]) {init = 0 : i8, sym_name = "of1_cons_cons_lock_0"}
-// CHECK-DAG:       %[[BUFFER_1_2_4:.*]] = aie.buffer(%[[TILE_1_2]]) {sym_name = "of0_prod_buff_0_0"} : memref<16xi32>
-// CHECK-DAG:       %[[BUFFER_1_2_5:.*]] = aie.buffer(%[[TILE_1_2]]) {sym_name = "of0_prod_buff_0_1"} : memref<16xi32>
-// CHECK-DAG:       %[[BUFFER_1_2_6:.*]] = aie.buffer(%[[TILE_1_2]]) {sym_name = "of0_prod_buff_0_2"} : memref<16xi32>
-// CHECK-DAG:       %[[BUFFER_1_2_7:.*]] = aie.buffer(%[[TILE_1_2]]) {sym_name = "of0_prod_buff_0_3"} : memref<16xi32>
-// CHECK-DAG:       %[[LOCK_1_2_8:.*]] = aie.lock(%[[TILE_1_2]]) {init = 4 : i8, sym_name = "of0_prod_prod_lock_0"}
-// CHECK-DAG:       %[[LOCK_1_2_9:.*]] = aie.lock(%[[TILE_1_2]]) {init = 0 : i8, sym_name = "of0_prod_cons_lock_0"}
-// CHECK-DAG:       %[[BUFFER_1_3:.*]] = aie.buffer(%[[TILE_1_3]]) {sym_name = "of0_cons_buff_0_0"} : memref<16xi32>
-// CHECK-DAG:       %[[BUFFER_1_3_10:.*]] = aie.buffer(%[[TILE_1_3]]) {sym_name = "of0_cons_buff_0_1"} : memref<16xi32>
-// CHECK-DAG:       %[[BUFFER_1_3_11:.*]] = aie.buffer(%[[TILE_1_3]]) {sym_name = "of0_cons_buff_0_2"} : memref<16xi32>
-// CHECK-DAG:       %[[BUFFER_1_3_12:.*]] = aie.buffer(%[[TILE_1_3]]) {sym_name = "of0_cons_buff_0_3"} : memref<16xi32>
-// CHECK-DAG:       %[[LOCK_1_3:.*]] = aie.lock(%[[TILE_1_3]]) {init = 4 : i8, sym_name = "of0_cons_prod_lock_0"}
-// CHECK-DAG:       %[[LOCK_1_3_13:.*]] = aie.lock(%[[TILE_1_3]]) {init = 0 : i8, sym_name = "of0_cons_cons_lock_0"}
-// CHECK-DAG:       aie.flow(%[[TILE_1_2]], DMA : 0, %[[TILE_3_3]], DMA : 0) {symbol = @of1}
-// CHECK-DAG:       aie.flow(%[[TILE_1_2]], DMA : 1, %[[TILE_1_3]], DMA : 0) {symbol = @of0}
-// CHECK:           %[[MEM_1_2:.*]] = aie.mem(%[[TILE_1_2]]) {
-// CHECK:             %[[VAL_0:.*]] = aie.dma_start(MM2S, 1, ^bb1, ^bb5)
-// CHECK:           ^bb1:
-// CHECK:             aie.use_lock(%[[LOCK_1_2_9]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[BUFFER_1_2_4]] : memref<16xi32>) {len = 16 : i32}
-// CHECK:             aie.use_lock(%[[LOCK_1_2_8]], Release, 1)
-// CHECK:             aie.next_bd ^bb2
-// CHECK:           ^bb2:
-// CHECK:             aie.use_lock(%[[LOCK_1_2_9]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[BUFFER_1_2_5]] : memref<16xi32>) {len = 16 : i32}
-// CHECK:             aie.use_lock(%[[LOCK_1_2_8]], Release, 1)
-// CHECK:             aie.next_bd ^bb3
-// CHECK:           ^bb3:
-// CHECK:             aie.use_lock(%[[LOCK_1_2_9]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[BUFFER_1_2_6]] : memref<16xi32>) {len = 16 : i32}
-// CHECK:             aie.use_lock(%[[LOCK_1_2_8]], Release, 1)
-// CHECK:             aie.next_bd ^bb4
-// CHECK:           ^bb4:
-// CHECK:             aie.use_lock(%[[LOCK_1_2_9]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[BUFFER_1_2_7]] : memref<16xi32>) {len = 16 : i32}
-// CHECK:             aie.use_lock(%[[LOCK_1_2_8]], Release, 1)
-// CHECK:             aie.next_bd ^bb1
-// CHECK:           ^bb5:
-// CHECK:             %[[VAL_1:.*]] = aie.dma_start(MM2S, 0, ^bb6, ^bb8)
-// CHECK:           ^bb6:
-// CHECK:             aie.use_lock(%[[LOCK_1_2_1]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[BUFFER_1_2]] : memref<16xi32>) {len = 16 : i32}
-// CHECK:             aie.use_lock(%[[LOCK_1_2]], Release, 1)
-// CHECK:             aie.next_bd ^bb7
-// CHECK:           ^bb7:
-// CHECK:             aie.use_lock(%[[LOCK_1_2_1]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[BUFFER_1_2_0]] : memref<16xi32>) {len = 16 : i32}
-// CHECK:             aie.use_lock(%[[LOCK_1_2]], Release, 1)
-// CHECK:             aie.next_bd ^bb6
-// CHECK:           ^bb8:
-// CHECK:             aie.end
-// CHECK:           }
-// CHECK:           %[[MEM_1_3:.*]] = aie.mem(%[[TILE_1_3]]) {
-// CHECK:             %[[VAL_2:.*]] = aie.dma_start(S2MM, 0, ^bb1, ^bb5)
-// CHECK:           ^bb1:
-// CHECK:             aie.use_lock(%[[LOCK_1_3]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[BUFFER_1_3]] : memref<16xi32>) {len = 16 : i32}
-// CHECK:             aie.use_lock(%[[LOCK_1_3_13]], Release, 1)
-// CHECK:             aie.next_bd ^bb2
-// CHECK:           ^bb2:
-// CHECK:             aie.use_lock(%[[LOCK_1_3]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[BUFFER_1_3_10]] : memref<16xi32>) {len = 16 : i32}
-// CHECK:             aie.use_lock(%[[LOCK_1_3_13]], Release, 1)
-// CHECK:             aie.next_bd ^bb3
-// CHECK:           ^bb3:
-// CHECK:             aie.use_lock(%[[LOCK_1_3]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[BUFFER_1_3_11]] : memref<16xi32>) {len = 16 : i32}
-// CHECK:             aie.use_lock(%[[LOCK_1_3_13]], Release, 1)
-// CHECK:             aie.next_bd ^bb4
-// CHECK:           ^bb4:
-// CHECK:             aie.use_lock(%[[LOCK_1_3]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[BUFFER_1_3_12]] : memref<16xi32>) {len = 16 : i32}
-// CHECK:             aie.use_lock(%[[LOCK_1_3_13]], Release, 1)
-// CHECK:             aie.next_bd ^bb1
-// CHECK:           ^bb5:
-// CHECK:             aie.end
-// CHECK:           }
-// CHECK:           %[[MEM_3_3:.*]] = aie.mem(%[[TILE_3_3]]) {
-// CHECK:             %[[VAL_3:.*]] = aie.dma_start(S2MM, 0, ^bb1, ^bb3)
-// CHECK:           ^bb1:
-// CHECK:             aie.use_lock(%[[LOCK_3_3]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[BUFFER_3_3]] : memref<16xi32>) {len = 16 : i32}
-// CHECK:             aie.use_lock(%[[LOCK_3_3_3]], Release, 1)
-// CHECK:             aie.next_bd ^bb2
-// CHECK:           ^bb2:
-// CHECK:             aie.use_lock(%[[LOCK_3_3]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[BUFFER_3_3_2]] : memref<16xi32>) {len = 16 : i32}
-// CHECK:             aie.use_lock(%[[LOCK_3_3_3]], Release, 1)
-// CHECK:             aie.next_bd ^bb1
-// CHECK:           ^bb3:
-// CHECK:             aie.end
-// CHECK:           }
-// CHECK:         }
-module @elementGenerationAIE2 {
-  aie.device(xcve2302) {
-    %tile12 = aie.tile(1, 2)
-    %tile13 = aie.tile(1, 3)
-    %tile33 = aie.tile(3, 3)
-    aie.flow(%tile12, DMA : 0, %tile33, DMA : 0) {symbol = @of1}
-    aie.flow(%tile12, DMA : 1, %tile13, DMA : 0) {symbol = @of0}
-    // In the shared memory case, the number of elements does not change.
-    aie.objectfifo @of0 (%tile12, {%tile13}, 4 : i32) : !aie.objectfifo<memref<16xi32>>
-    // In the non-adjacent memory case, the number of elements depends on the max amount acquired by
-    // the processes running on each core (here nothing is specified so it cannot be derived).
-    aie.objectfifo @of1 (%tile12, {%tile33}, 2 : i32) : !aie.objectfifo<memref<16xi32>>
-  }
-}
diff --git a/compiler/plugins/target/AMD-AIE/aie/test/broadcast_test.mlir b/compiler/plugins/target/AMD-AIE/aie/test/broadcast_test.mlir
deleted file mode 100644
index cbe78efec..000000000
--- a/compiler/plugins/target/AMD-AIE/aie/test/broadcast_test.mlir
+++ /dev/null
@@ -1,374 +0,0 @@
-
-// RUN: iree-opt --amdaie-objectFifo-stateful-transform %s | FileCheck %s
-
-// NOTE: Assertions have been autogenerated by utils/generate-test-checks.py
-
-// The script is designed to make adding checks to
-// a test case fast, it is *not* designed to be authoritative
-// about what constitutes a good test! The CHECK should be
-// minimized and named to reflect the test intent.
-
-
-// CHECK-LABEL:   aie.device(npu1_4col) {
-// CHECK:           memref.global "public" @broadcast_of : memref<16xi32>
-// CHECK-DAG:       %[[TILE_1_2:.*]] = aie.tile(1, 2)
-// CHECK-DAG:       %[[TILE_1_3:.*]] = aie.tile(1, 3)
-// CHECK-DAG:       %[[TILE_1_4:.*]] = aie.tile(1, 4)
-// CHECK-DAG:       %[[TILE_3_2:.*]] = aie.tile(3, 2)
-// CHECK-DAG:       %[[TILE_3_3:.*]] = aie.tile(3, 3)
-// CHECK-DAG:       %[[BUFFER_1_3:.*]] = aie.buffer(%[[TILE_1_3]]) {sym_name = "broadcast_of_prod_buff_0_0"} : memref<16xi32>
-// CHECK-DAG:       %[[BUFFER_1_3_0:.*]] = aie.buffer(%[[TILE_1_3]]) {sym_name = "broadcast_of_prod_buff_0_1"} : memref<16xi32>
-// CHECK-DAG:       %[[LOCK_1_3:.*]] = aie.lock(%[[TILE_1_3]]) {init = 2 : i8, sym_name = "broadcast_of_prod_prod_lock_0"}
-// CHECK-DAG:       %[[LOCK_1_3_1:.*]] = aie.lock(%[[TILE_1_3]]) {init = 0 : i8, sym_name = "broadcast_of_prod_cons_lock_0"}
-// CHECK-DAG:       %[[BUFFER_1_2:.*]] = aie.buffer(%[[TILE_1_2]]) {sym_name = "broadcast_of_cons_buff_0_0"} : memref<16xi32>
-// CHECK-DAG:       %[[BUFFER_1_2_2:.*]] = aie.buffer(%[[TILE_1_2]]) {sym_name = "broadcast_of_cons_buff_0_1"} : memref<16xi32>
-// CHECK-DAG:       %[[LOCK_1_2:.*]] = aie.lock(%[[TILE_1_2]]) {init = 2 : i8, sym_name = "broadcast_of_cons_prod_lock_0"}
-// CHECK-DAG:       %[[LOCK_1_2_3:.*]] = aie.lock(%[[TILE_1_2]]) {init = 0 : i8, sym_name = "broadcast_of_cons_cons_lock_0"}
-// CHECK-DAG:       %[[BUFFER_1_4:.*]] = aie.buffer(%[[TILE_1_4]]) {sym_name = "broadcast_of_cons_buff_1_0"} : memref<16xi32>
-// CHECK-DAG:       %[[BUFFER_1_4_4:.*]] = aie.buffer(%[[TILE_1_4]]) {sym_name = "broadcast_of_cons_buff_1_1"} : memref<16xi32>
-// CHECK-DAG:       %[[LOCK_1_4:.*]] = aie.lock(%[[TILE_1_4]]) {init = 2 : i8, sym_name = "broadcast_of_cons_prod_lock_1"}
-// CHECK-DAG:       %[[LOCK_1_4_5:.*]] = aie.lock(%[[TILE_1_4]]) {init = 0 : i8, sym_name = "broadcast_of_cons_cons_lock_1"}
-// CHECK-DAG:       %[[BUFFER_3_2:.*]] = aie.buffer(%[[TILE_3_2]]) {sym_name = "broadcast_of_cons_buff_2_0"} : memref<16xi32>
-// CHECK-DAG:       %[[BUFFER_3_2_6:.*]] = aie.buffer(%[[TILE_3_2]]) {sym_name = "broadcast_of_cons_buff_2_1"} : memref<16xi32>
-// CHECK-DAG:       %[[LOCK_3_2:.*]] = aie.lock(%[[TILE_3_2]]) {init = 2 : i8, sym_name = "broadcast_of_cons_prod_lock_2"}
-// CHECK-DAG:       %[[LOCK_3_2_7:.*]] = aie.lock(%[[TILE_3_2]]) {init = 0 : i8, sym_name = "broadcast_of_cons_cons_lock_2"}
-// CHECK-DAG:       %[[BUFFER_3_3:.*]] = aie.buffer(%[[TILE_3_3]]) {sym_name = "broadcast_of_cons_buff_3_0"} : memref<16xi32>
-// CHECK-DAG:       %[[BUFFER_3_3_8:.*]] = aie.buffer(%[[TILE_3_3]]) {sym_name = "broadcast_of_cons_buff_3_1"} : memref<16xi32>
-// CHECK-DAG:       %[[LOCK_3_3:.*]] = aie.lock(%[[TILE_3_3]]) {init = 2 : i8, sym_name = "broadcast_of_cons_prod_lock_3"}
-// CHECK-DAG:       %[[LOCK_3_3_9:.*]] = aie.lock(%[[TILE_3_3]]) {init = 0 : i8, sym_name = "broadcast_of_cons_cons_lock_3"}
-// CHECK-DAG:       aie.flow(%[[TILE_1_3]], DMA : 0, %[[TILE_3_3]], DMA : 0) {symbol = @broadcast_of}
-// CHECK-DAG:       aie.flow(%[[TILE_1_3]], DMA : 0, %[[TILE_3_2]], DMA : 0) {symbol = @broadcast_of}
-// CHECK-DAG:       aie.flow(%[[TILE_1_3]], DMA : 0, %[[TILE_1_4]], DMA : 0) {symbol = @broadcast_of}
-// CHECK-DAG:       aie.flow(%[[TILE_1_3]], DMA : 0, %[[TILE_1_2]], DMA : 0) {symbol = @broadcast_of}
-// CHECK:           func.func @some_work(%[[ARG0:.*]]: memref<16xi32>) {
-// CHECK:             return
-// CHECK:           }
-// CHECK:           %[[CORE_1_3:.*]] = aie.core(%[[TILE_1_3]]) {
-// CHECK:             %[[C0:.*]] = arith.constant 0 : index
-// CHECK:             %[[C1:.*]] = arith.constant 1 : index
-// CHECK:             %[[C2:.*]] = arith.constant 2 : index
-// CHECK:             %[[C12:.*]] = arith.constant 12 : index
-// CHECK:             scf.for %[[ARG0:.*]] = %[[C0]] to %[[C12]] step %[[C2]] {
-// CHECK:               aie.use_lock(%[[LOCK_1_3]], AcquireGreaterEqual, 1)
-// CHECK:               func.call @some_work(%[[BUFFER_1_3]]) : (memref<16xi32>) -> ()
-// CHECK:               aie.use_lock(%[[LOCK_1_3_1]], Release, 1)
-// CHECK:               aie.use_lock(%[[LOCK_1_3]], AcquireGreaterEqual, 1)
-// CHECK:               func.call @some_work(%[[BUFFER_1_3_0]]) : (memref<16xi32>) -> ()
-// CHECK:               aie.use_lock(%[[LOCK_1_3_1]], Release, 1)
-// CHECK:             }
-// CHECK:             aie.end
-// CHECK:           }
-// CHECK:           %[[CORE_1_2:.*]] = aie.core(%[[TILE_1_2]]) {
-// CHECK:             %[[C0:.*]] = arith.constant 0 : index
-// CHECK:             %[[C1:.*]] = arith.constant 1 : index
-// CHECK:             %[[C2:.*]] = arith.constant 2 : index
-// CHECK:             %[[C12:.*]] = arith.constant 12 : index
-// CHECK:             scf.for %[[ARG0:.*]] = %[[C0]] to %[[C12]] step %[[C2]] {
-// CHECK:               aie.use_lock(%[[LOCK_1_2_3]], AcquireGreaterEqual, 1)
-// CHECK:               func.call @some_work(%[[BUFFER_1_2]]) : (memref<16xi32>) -> ()
-// CHECK:               aie.use_lock(%[[LOCK_1_2]], Release, 1)
-// CHECK:               aie.use_lock(%[[LOCK_1_2_3]], AcquireGreaterEqual, 1)
-// CHECK:               func.call @some_work(%[[BUFFER_1_2_2]]) : (memref<16xi32>) -> ()
-// CHECK:               aie.use_lock(%[[LOCK_1_2]], Release, 1)
-// CHECK:             }
-// CHECK:             aie.end
-// CHECK:           }
-// CHECK:           %[[CORE_1_4:.*]] = aie.core(%[[TILE_1_4]]) {
-// CHECK:             %[[C0:.*]] = arith.constant 0 : index
-// CHECK:             %[[C1:.*]] = arith.constant 1 : index
-// CHECK:             %[[C3:.*]] = arith.constant 3 : index
-// CHECK:             %[[C12:.*]] = arith.constant 12 : index
-// CHECK:             scf.for %[[ARG0:.*]] = %[[C0]] to %[[C12]] step %[[C3]] {
-// CHECK:               aie.use_lock(%[[LOCK_1_4_5]], AcquireGreaterEqual, 2)
-// CHECK:               func.call @some_work(%[[BUFFER_1_4]]) : (memref<16xi32>) -> ()
-// CHECK:               func.call @some_work(%[[BUFFER_1_4_4]]) : (memref<16xi32>) -> ()
-// CHECK:               aie.use_lock(%[[LOCK_1_4]], Release, 2)
-// CHECK:               aie.use_lock(%[[LOCK_1_4_5]], AcquireGreaterEqual, 2)
-// CHECK:               func.call @some_work(%[[BUFFER_1_4]]) : (memref<16xi32>) -> ()
-// CHECK:               func.call @some_work(%[[BUFFER_1_4_4]]) : (memref<16xi32>) -> ()
-// CHECK:               aie.use_lock(%[[LOCK_1_4]], Release, 2)
-// CHECK:               aie.use_lock(%[[LOCK_1_4_5]], AcquireGreaterEqual, 2)
-// CHECK:               func.call @some_work(%[[BUFFER_1_4]]) : (memref<16xi32>) -> ()
-// CHECK:               func.call @some_work(%[[BUFFER_1_4_4]]) : (memref<16xi32>) -> ()
-// CHECK:               aie.use_lock(%[[LOCK_1_4]], Release, 2)
-// CHECK:             }
-// CHECK:             aie.end
-// CHECK:           }
-// CHECK:           %[[CORE_3_2:.*]] = aie.core(%[[TILE_3_2]]) {
-// CHECK:             %[[C0:.*]] = arith.constant 0 : index
-// CHECK:             %[[C1:.*]] = arith.constant 1 : index
-// CHECK:             %[[C4:.*]] = arith.constant 4 : index
-// CHECK:             %[[C12:.*]] = arith.constant 12 : index
-// CHECK:             scf.for %[[ARG0:.*]] = %[[C0]] to %[[C12]] step %[[C4]] {
-// CHECK:               aie.use_lock(%[[LOCK_3_2_7]], AcquireGreaterEqual, 3)
-// CHECK:               func.call @some_work(%[[BUFFER_3_2]]) : (memref<16xi32>) -> ()
-// CHECK:               func.call @some_work(%[[BUFFER_3_2_6]]) : (memref<16xi32>) -> ()
-// CHECK:               func.call @some_work(%[[BUFFER_3_2]]) : (memref<16xi32>) -> ()
-// CHECK:               aie.use_lock(%[[LOCK_3_2]], Release, 1)
-// CHECK:               aie.use_lock(%[[LOCK_3_2_7]], AcquireGreaterEqual, 3)
-// CHECK:               func.call @some_work(%[[BUFFER_3_2_6]]) : (memref<16xi32>) -> ()
-// CHECK:               func.call @some_work(%[[BUFFER_3_2]]) : (memref<16xi32>) -> ()
-// CHECK:               func.call @some_work(%[[BUFFER_3_2_6]]) : (memref<16xi32>) -> ()
-// CHECK:               aie.use_lock(%[[LOCK_3_2]], Release, 1)
-// CHECK:               aie.use_lock(%[[LOCK_3_2_7]], AcquireGreaterEqual, 3)
-// CHECK:               func.call @some_work(%[[BUFFER_3_2]]) : (memref<16xi32>) -> ()
-// CHECK:               func.call @some_work(%[[BUFFER_3_2_6]]) : (memref<16xi32>) -> ()
-// CHECK:               func.call @some_work(%[[BUFFER_3_2]]) : (memref<16xi32>) -> ()
-// CHECK:               aie.use_lock(%[[LOCK_3_2]], Release, 1)
-// CHECK:               aie.use_lock(%[[LOCK_3_2_7]], AcquireGreaterEqual, 3)
-// CHECK:               func.call @some_work(%[[BUFFER_3_2_6]]) : (memref<16xi32>) -> ()
-// CHECK:               func.call @some_work(%[[BUFFER_3_2]]) : (memref<16xi32>) -> ()
-// CHECK:               func.call @some_work(%[[BUFFER_3_2_6]]) : (memref<16xi32>) -> ()
-// CHECK:               aie.use_lock(%[[LOCK_3_2]], Release, 1)
-// CHECK:             }
-// CHECK:             aie.end
-// CHECK:           }
-// CHECK:           %[[CORE_3_3:.*]] = aie.core(%[[TILE_3_3]]) {
-// CHECK:             %[[C0:.*]] = arith.constant 0 : index
-// CHECK:             %[[C1:.*]] = arith.constant 1 : index
-// CHECK:             %[[C3:.*]] = arith.constant 3 : index
-// CHECK:             %[[C12:.*]] = arith.constant 12 : index
-// CHECK:             scf.for %[[ARG0:.*]] = %[[C0]] to %[[C12]] step %[[C3]] {
-// CHECK:               aie.use_lock(%[[LOCK_3_3_9]], AcquireGreaterEqual, 2)
-// CHECK:               func.call @some_work(%[[BUFFER_3_3]]) : (memref<16xi32>) -> ()
-// CHECK:               func.call @some_work(%[[BUFFER_3_3_8]]) : (memref<16xi32>) -> ()
-// CHECK:               aie.use_lock(%[[LOCK_3_3]], Release, 1)
-// CHECK:               aie.use_lock(%[[LOCK_3_3_9]], AcquireGreaterEqual, 2)
-// CHECK:               func.call @some_work(%[[BUFFER_3_3]]) : (memref<16xi32>) -> ()
-// CHECK:               func.call @some_work(%[[BUFFER_3_3_8]]) : (memref<16xi32>) -> ()
-// CHECK:               aie.use_lock(%[[LOCK_3_3]], Release, 1)
-// CHECK:               aie.use_lock(%[[LOCK_3_3_9]], AcquireGreaterEqual, 2)
-// CHECK:               func.call @some_work(%[[BUFFER_3_3]]) : (memref<16xi32>) -> ()
-// CHECK:               func.call @some_work(%[[BUFFER_3_3_8]]) : (memref<16xi32>) -> ()
-// CHECK:               aie.use_lock(%[[LOCK_3_3]], Release, 1)
-// CHECK:             }
-// CHECK:             aie.end
-// CHECK:           }
-// CHECK:           %[[MEM_1_3:.*]] = aie.mem(%[[TILE_1_3]]) {
-// CHECK:             %[[VAL_0:.*]] = aie.dma_start(MM2S, 0, ^bb1, ^bb3)
-// CHECK:           ^bb1:
-// CHECK:             aie.use_lock(%[[LOCK_1_3_1]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[BUFFER_1_3]] : memref<16xi32>) {len = 16 : i32}
-// CHECK:             aie.use_lock(%[[LOCK_1_3]], Release, 1)
-// CHECK:             aie.next_bd ^bb2
-// CHECK:           ^bb2:
-// CHECK:             aie.use_lock(%[[LOCK_1_3_1]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[BUFFER_1_3_0]] : memref<16xi32>) {len = 16 : i32}
-// CHECK:             aie.use_lock(%[[LOCK_1_3]], Release, 1)
-// CHECK:             aie.next_bd ^bb1
-// CHECK:           ^bb3:
-// CHECK:             aie.end
-// CHECK:           }
-// CHECK:           %[[MEM_1_2:.*]] = aie.mem(%[[TILE_1_2]]) {
-// CHECK:             %[[VAL_1:.*]] = aie.dma_start(S2MM, 0, ^bb1, ^bb3)
-// CHECK:           ^bb1:
-// CHECK:             aie.use_lock(%[[LOCK_1_2]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[BUFFER_1_2]] : memref<16xi32>) {len = 16 : i32}
-// CHECK:             aie.use_lock(%[[LOCK_1_2_3]], Release, 1)
-// CHECK:             aie.next_bd ^bb2
-// CHECK:           ^bb2:
-// CHECK:             aie.use_lock(%[[LOCK_1_2]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[BUFFER_1_2_2]] : memref<16xi32>) {len = 16 : i32}
-// CHECK:             aie.use_lock(%[[LOCK_1_2_3]], Release, 1)
-// CHECK:             aie.next_bd ^bb1
-// CHECK:           ^bb3:
-// CHECK:             aie.end
-// CHECK:           }
-// CHECK:           %[[MEM_1_4:.*]] = aie.mem(%[[TILE_1_4]]) {
-// CHECK:             %[[VAL_2:.*]] = aie.dma_start(S2MM, 0, ^bb1, ^bb3)
-// CHECK:           ^bb1:
-// CHECK:             aie.use_lock(%[[LOCK_1_4]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[BUFFER_1_4]] : memref<16xi32>) {len = 16 : i32}
-// CHECK:             aie.use_lock(%[[LOCK_1_4_5]], Release, 1)
-// CHECK:             aie.next_bd ^bb2
-// CHECK:           ^bb2:
-// CHECK:             aie.use_lock(%[[LOCK_1_4]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[BUFFER_1_4_4]] : memref<16xi32>) {len = 16 : i32}
-// CHECK:             aie.use_lock(%[[LOCK_1_4_5]], Release, 1)
-// CHECK:             aie.next_bd ^bb1
-// CHECK:           ^bb3:
-// CHECK:             aie.end
-// CHECK:           }
-// CHECK:           %[[MEM_3_2:.*]] = aie.mem(%[[TILE_3_2]]) {
-// CHECK:             %[[VAL_3:.*]] = aie.dma_start(S2MM, 0, ^bb1, ^bb3)
-// CHECK:           ^bb1:
-// CHECK:             aie.use_lock(%[[LOCK_3_2]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[BUFFER_3_2]] : memref<16xi32>) {len = 16 : i32}
-// CHECK:             aie.use_lock(%[[LOCK_3_2_7]], Release, 1)
-// CHECK:             aie.next_bd ^bb2
-// CHECK:           ^bb2:
-// CHECK:             aie.use_lock(%[[LOCK_3_2]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[BUFFER_3_2_6]] : memref<16xi32>) {len = 16 : i32}
-// CHECK:             aie.use_lock(%[[LOCK_3_2_7]], Release, 1)
-// CHECK:             aie.next_bd ^bb1
-// CHECK:           ^bb3:
-// CHECK:             aie.end
-// CHECK:           }
-// CHECK:           %[[MEM_3_3:.*]] = aie.mem(%[[TILE_3_3]]) {
-// CHECK:             %[[VAL_4:.*]] = aie.dma_start(S2MM, 0, ^bb1, ^bb3)
-// CHECK:           ^bb1:
-// CHECK:             aie.use_lock(%[[LOCK_3_3]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[BUFFER_3_3]] : memref<16xi32>) {len = 16 : i32}
-// CHECK:             aie.use_lock(%[[LOCK_3_3_9]], Release, 1)
-// CHECK:             aie.next_bd ^bb2
-// CHECK:           ^bb2:
-// CHECK:             aie.use_lock(%[[LOCK_3_3]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[BUFFER_3_3_8]] : memref<16xi32>) {len = 16 : i32}
-// CHECK:             aie.use_lock(%[[LOCK_3_3_9]], Release, 1)
-// CHECK:             aie.next_bd ^bb1
-// CHECK:           ^bb3:
-// CHECK:             aie.end
-// CHECK:           }
-// CHECK:         }
-
-
-module @broadcast {
- aie.device(npu1_4col) {
-    %tile12 = aie.tile(1, 2)
-    %tile13 = aie.tile(1, 3)
-    %tile14 = aie.tile(1, 4)
-    %tile32 = aie.tile(3, 2)
-    %tile33 = aie.tile(3, 3)
-    aie.flow(%tile13, DMA : 0, %tile33, DMA : 0) {symbol = @broadcast_of}
-    aie.flow(%tile13, DMA : 0, %tile32, DMA : 0) {symbol = @broadcast_of}
-    aie.flow(%tile13, DMA : 0, %tile14, DMA : 0) {symbol = @broadcast_of}
-    aie.flow(%tile13, DMA : 0, %tile12, DMA : 0) {symbol = @broadcast_of}
-    aie.objectfifo @broadcast_of (%tile13, {%tile12, %tile14, %tile32, %tile33}, [2]) : !aie.objectfifo<memref<16xi32>>
-    func.func @some_work(%lineOut : memref<16xi32>) -> () {
-        return
-    }
-    %core13 = aie.core(%tile13) {
-        %c0 = arith.constant 0 : index
-        %c1 = arith.constant 1 : index
-        %c2 = arith.constant 2 : index
-        %height = arith.constant 12 : index
-        scf.for %indexInHeight = %c0 to %height step %c2 {
-            %subview = aie.objectfifo.acquire @broadcast_of (Produce, 1) : !aie.objectfifosubview<memref<16xi32>>
-            %elem0 = aie.objectfifo.subview.access %subview[0] : !aie.objectfifosubview<memref<16xi32>> -> memref<16xi32>
-            func.call @some_work(%elem0) : (memref<16xi32>) -> ()
-            aie.objectfifo.release @broadcast_of (Produce, 1)
-            %subview1 = aie.objectfifo.acquire @broadcast_of (Produce, 1) : !aie.objectfifosubview<memref<16xi32>>
-            %elem1 = aie.objectfifo.subview.access %subview1[0] : !aie.objectfifosubview<memref<16xi32>> -> memref<16xi32>
-            func.call @some_work(%elem1) : (memref<16xi32>) -> ()
-            aie.objectfifo.release @broadcast_of (Produce, 1)
-        }
-        aie.end
-    }
-    %core12 = aie.core(%tile12) {
-        %c0 = arith.constant 0 : index
-        %c1 = arith.constant 1 : index
-        %c2 = arith.constant 2 : index
-        %height = arith.constant 12 : index
-        scf.for %indexInHeight = %c0 to %height step %c2 {
-            %subview = aie.objectfifo.acquire @broadcast_of (Consume, 1) : !aie.objectfifosubview<memref<16xi32>>
-            %elem0 = aie.objectfifo.subview.access %subview[0] : !aie.objectfifosubview<memref<16xi32>> -> memref<16xi32>
-            func.call @some_work(%elem0) : (memref<16xi32>) -> ()
-            aie.objectfifo.release @broadcast_of (Consume, 1)
-            %subview1 = aie.objectfifo.acquire @broadcast_of (Consume, 1) : !aie.objectfifosubview<memref<16xi32>>
-            %elem1 = aie.objectfifo.subview.access %subview1[0] : !aie.objectfifosubview<memref<16xi32>> -> memref<16xi32>
-            func.call @some_work(%elem1) : (memref<16xi32>) -> ()
-            aie.objectfifo.release @broadcast_of (Consume, 1)
-        }
-        aie.end
-    }
-    %core14 = aie.core(%tile14) {
-        %c0 = arith.constant 0 : index
-        %c1 = arith.constant 1 : index
-        %c3 = arith.constant 3 : index
-        %height = arith.constant 12 : index
-        scf.for %indexInHeight = %c0 to %height step %c3 {
-            %subview = aie.objectfifo.acquire @broadcast_of (Consume, 2) : !aie.objectfifosubview<memref<16xi32>>
-            %elem0 = aie.objectfifo.subview.access %subview[0] : !aie.objectfifosubview<memref<16xi32>> -> memref<16xi32>
-            %elem1 = aie.objectfifo.subview.access %subview[1] : !aie.objectfifosubview<memref<16xi32>> -> memref<16xi32>
-            func.call @some_work(%elem0) : (memref<16xi32>) -> ()
-            func.call @some_work(%elem1) : (memref<16xi32>) -> ()
-            aie.objectfifo.release @broadcast_of (Consume, 2)
-            %subview1 = aie.objectfifo.acquire @broadcast_of (Consume, 2) : !aie.objectfifosubview<memref<16xi32>>
-            %elem2 = aie.objectfifo.subview.access %subview1[0] : !aie.objectfifosubview<memref<16xi32>> -> memref<16xi32>
-            %elem3 = aie.objectfifo.subview.access %subview1[1] : !aie.objectfifosubview<memref<16xi32>> -> memref<16xi32>
-            func.call @some_work(%elem2) : (memref<16xi32>) -> ()
-            func.call @some_work(%elem3) : (memref<16xi32>) -> ()
-            aie.objectfifo.release @broadcast_of (Consume, 2)
-            %subview2 = aie.objectfifo.acquire @broadcast_of (Consume, 2) : !aie.objectfifosubview<memref<16xi32>>
-            %elem4 = aie.objectfifo.subview.access %subview2[0] : !aie.objectfifosubview<memref<16xi32>> -> memref<16xi32>
-            %elem5 = aie.objectfifo.subview.access %subview2[1] : !aie.objectfifosubview<memref<16xi32>> -> memref<16xi32>
-            func.call @some_work(%elem4) : (memref<16xi32>) -> ()
-            func.call @some_work(%elem5) : (memref<16xi32>) -> ()
-            aie.objectfifo.release @broadcast_of (Consume, 2)
-        }
-        aie.end
-    }
-    %core32 = aie.core(%tile32) {
-        %c0 = arith.constant 0 : index
-        %c1 = arith.constant 1 : index
-        %c4 = arith.constant 4 : index
-        %height = arith.constant 12 : index
-        scf.for %indexInHeight = %c0 to %height step %c4 {
-            %subview = aie.objectfifo.acquire @broadcast_of (Consume, 3) : !aie.objectfifosubview<memref<16xi32>>
-            %elem0 = aie.objectfifo.subview.access %subview[0] : !aie.objectfifosubview<memref<16xi32>> -> memref<16xi32>
-            %elem1 = aie.objectfifo.subview.access %subview[1] : !aie.objectfifosubview<memref<16xi32>> -> memref<16xi32>
-            %elem2 = aie.objectfifo.subview.access %subview[2] : !aie.objectfifosubview<memref<16xi32>> -> memref<16xi32>
-            func.call @some_work(%elem0) : (memref<16xi32>) -> ()
-            func.call @some_work(%elem1) : (memref<16xi32>) -> ()
-            func.call @some_work(%elem2) : (memref<16xi32>) -> ()
-            aie.objectfifo.release @broadcast_of (Consume, 1)
-            %subview1 = aie.objectfifo.acquire @broadcast_of (Consume, 3) : !aie.objectfifosubview<memref<16xi32>>
-            %elem3 = aie.objectfifo.subview.access %subview1[0] : !aie.objectfifosubview<memref<16xi32>> -> memref<16xi32>
-            %elem4 = aie.objectfifo.subview.access %subview1[1] : !aie.objectfifosubview<memref<16xi32>> -> memref<16xi32>
-            %elem5 = aie.objectfifo.subview.access %subview1[2] : !aie.objectfifosubview<memref<16xi32>> -> memref<16xi32>
-            func.call @some_work(%elem3) : (memref<16xi32>) -> ()
-            func.call @some_work(%elem4) : (memref<16xi32>) -> ()
-            func.call @some_work(%elem5) : (memref<16xi32>) -> ()
-            aie.objectfifo.release @broadcast_of (Consume, 1)
-
-            %subview2 = aie.objectfifo.acquire @broadcast_of (Consume, 3) : !aie.objectfifosubview<memref<16xi32>>
-            %elem6 = aie.objectfifo.subview.access %subview2[0] : !aie.objectfifosubview<memref<16xi32>> -> memref<16xi32>
-            %elem7 = aie.objectfifo.subview.access %subview2[1] : !aie.objectfifosubview<memref<16xi32>> -> memref<16xi32>
-            %elem8 = aie.objectfifo.subview.access %subview2[2] : !aie.objectfifosubview<memref<16xi32>> -> memref<16xi32>
-            func.call @some_work(%elem6) : (memref<16xi32>) -> ()
-            func.call @some_work(%elem7) : (memref<16xi32>) -> ()
-            func.call @some_work(%elem8) : (memref<16xi32>) -> ()
-            aie.objectfifo.release @broadcast_of (Consume, 1)
-
-            %subview3 = aie.objectfifo.acquire @broadcast_of (Consume, 3) : !aie.objectfifosubview<memref<16xi32>>
-            %elem9 = aie.objectfifo.subview.access %subview3[0] : !aie.objectfifosubview<memref<16xi32>> -> memref<16xi32>
-            %elem10 = aie.objectfifo.subview.access %subview3[1] : !aie.objectfifosubview<memref<16xi32>> -> memref<16xi32>
-            %elem11 = aie.objectfifo.subview.access %subview3[2] : !aie.objectfifosubview<memref<16xi32>> -> memref<16xi32>
-            func.call @some_work(%elem9) : (memref<16xi32>) -> ()
-            func.call @some_work(%elem10) : (memref<16xi32>) -> ()
-            func.call @some_work(%elem11) : (memref<16xi32>) -> ()
-            aie.objectfifo.release @broadcast_of (Consume, 1)
-        }
-        aie.end
-    }
-    %core33 = aie.core(%tile33) {
-        %c0 = arith.constant 0 : index
-        %c1 = arith.constant 1 : index
-        %c3 = arith.constant 3 : index
-        %height = arith.constant 12 : index
-        scf.for %indexInHeight = %c0 to %height step %c3 {
-            %subview = aie.objectfifo.acquire @broadcast_of (Consume, 2) : !aie.objectfifosubview<memref<16xi32>>
-            %elem0 = aie.objectfifo.subview.access %subview[0] : !aie.objectfifosubview<memref<16xi32>> -> memref<16xi32>
-            %elem1 = aie.objectfifo.subview.access %subview[1] : !aie.objectfifosubview<memref<16xi32>> -> memref<16xi32>
-            func.call @some_work(%elem0) : (memref<16xi32>) -> ()
-            func.call @some_work(%elem1) : (memref<16xi32>) -> ()
-            aie.objectfifo.release @broadcast_of (Consume, 1)
-
-            %subview1 = aie.objectfifo.acquire @broadcast_of (Consume, 2) : !aie.objectfifosubview<memref<16xi32>>
-            %elem2 = aie.objectfifo.subview.access %subview1[0] : !aie.objectfifosubview<memref<16xi32>> -> memref<16xi32>
-            %elem3 = aie.objectfifo.subview.access %subview1[1] : !aie.objectfifosubview<memref<16xi32>> -> memref<16xi32>
-            func.call @some_work(%elem2) : (memref<16xi32>) -> ()
-            func.call @some_work(%elem3) : (memref<16xi32>) -> ()
-            aie.objectfifo.release @broadcast_of (Consume, 1)
-
-            %subview2 = aie.objectfifo.acquire @broadcast_of (Consume, 2) : !aie.objectfifosubview<memref<16xi32>>
-            %elem4 = aie.objectfifo.subview.access %subview2[0] : !aie.objectfifosubview<memref<16xi32>> -> memref<16xi32>
-            %elem5 = aie.objectfifo.subview.access %subview2[1] : !aie.objectfifosubview<memref<16xi32>> -> memref<16xi32>
-            func.call @some_work(%elem4) : (memref<16xi32>) -> ()
-            func.call @some_work(%elem5) : (memref<16xi32>) -> ()
-            aie.objectfifo.release @broadcast_of (Consume, 1)
-        }
-        aie.end
-    }
- }
-}
diff --git a/compiler/plugins/target/AMD-AIE/aie/test/link_test_AIE1.mlir b/compiler/plugins/target/AMD-AIE/aie/test/link_test_AIE1.mlir
deleted file mode 100644
index 38d37b926..000000000
--- a/compiler/plugins/target/AMD-AIE/aie/test/link_test_AIE1.mlir
+++ /dev/null
@@ -1,80 +0,0 @@
-
-// RUN: iree-opt --amdaie-objectFifo-stateful-transform %s | FileCheck %s
-
-// CHECK-LABEL:   aie.device(xcvc1902) {
-// CHECK:           memref.global "public" @of2 : memref<16xi32>
-// CHECK:           memref.global "public" @of1 : memref<16xi32>
-// CHECK-DAG:       %[[TILE_2_0:.*]] = aie.tile(2, 0)
-// CHECK-DAG:       %[[TILE_1_2:.*]] = aie.tile(1, 2)
-// CHECK-DAG:       %[[TILE_2_2:.*]] = aie.tile(2, 2)
-// CHECK-DAG:       %[[OF2_CONS_BUFF_0:.*]] = aie.buffer(%[[TILE_2_2]]) {sym_name = "of2_cons_buff_0_0"} : memref<16xi32>
-// CHECK-DAG:       %[[OF2_CONS_BUFF_1:.*]] = aie.buffer(%[[TILE_2_2]]) {sym_name = "of2_cons_buff_0_1"} : memref<16xi32>
-// CHECK-DAG:       %[[OF2_CONS_PROD_LOCK:.*]] = aie.lock(%[[TILE_2_2]]) {init = 2 : i8, sym_name = "of2_cons_prod_lock_0"}
-// CHECK-DAG:       %[[OF2_CONS_CONS_LOCK:.*]] = aie.lock(%[[TILE_2_2]]) {init = 0 : i8, sym_name = "of2_cons_cons_lock_0"}
-// CHECK-DAG:       %[[OF1_CONS_BUFF_0:.*]] = aie.buffer(%[[TILE_1_2]]) {sym_name = "of1_link_buff_0_0"} : memref<16xi32>
-// CHECK-DAG:       %[[OF1_CONS_BUFF_1:.*]] = aie.buffer(%[[TILE_1_2]]) {sym_name = "of1_link_buff_0_1"} : memref<16xi32>
-// CHECK-DAG:       %[[OF1_CONS_PROD_LOCK:.*]] = aie.lock(%[[TILE_1_2]]) {init = 2 : i8, sym_name = "of1_link_prod_lock_0"}
-// CHECK-DAG:       %[[OF1_CONS_CONS_LOCK:.*]] = aie.lock(%[[TILE_1_2]]) {init = 0 : i8, sym_name = "of1_link_cons_lock_0"}
-// CHECK-DAG:       %[[OF1_PROD_LOCK:.*]] = aie.lock(%[[TILE_2_0]]) {init = 0 : i8, sym_name = "of1_prod_prod_lock_0"}
-// CHECK-DAG:       %[[OF1_CONS_LOCK:.*]] = aie.lock(%[[TILE_2_0]]) {init = 0 : i8, sym_name = "of1_prod_cons_lock_0"}
-// CHECK-DAG:       aie.flow(%[[TILE_2_0]], DMA : 0, %[[TILE_1_2]], DMA : 0)
-// CHECK-DAG:       aie.flow(%[[TILE_1_2]], DMA : 0, %[[TILE_2_2]], DMA : 0)
-// CHECK-DAG:       %[[EXT_BUFF_IN:.*]] = aie.external_buffer {sym_name = "ext_buff_in"} : memref<16xi32>
-// CHECK-DAG:       aie.shim_dma_allocation @of1(MM2S, 0, 2)
-// CHECK:           %[[MEM_1_2:.*]] = aie.mem(%[[TILE_1_2]]) {
-// CHECK:             %[[VAL_0:.*]] = aie.dma_start(S2MM, 0, ^bb1, ^bb3)
-// CHECK:           ^bb1:
-// CHECK:             aie.use_lock(%[[OF1_CONS_PROD_LOCK]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[OF1_CONS_BUFF_0]] : memref<16xi32>) {len = 16 : i32}
-// CHECK:             aie.use_lock(%[[OF1_CONS_CONS_LOCK]], Release, 1)
-// CHECK:             aie.next_bd ^bb2
-// CHECK:           ^bb2:
-// CHECK:             aie.use_lock(%[[OF1_CONS_PROD_LOCK]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[OF1_CONS_BUFF_1]] : memref<16xi32>) {len = 16 : i32}
-// CHECK:             aie.use_lock(%[[OF1_CONS_CONS_LOCK]], Release, 1)
-// CHECK:             aie.next_bd ^bb1
-// CHECK:           ^bb3:
-// CHECK:             %[[VAL_1:.*]] = aie.dma_start(MM2S, 0, ^bb4, ^bb6)
-// CHECK:           ^bb4:
-// CHECK:             aie.use_lock(%[[OF1_CONS_CONS_LOCK]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[OF1_CONS_BUFF_0]] : memref<16xi32>) {len = 16 : i32}
-// CHECK:             aie.use_lock(%[[OF1_CONS_PROD_LOCK]], Release, 1)
-// CHECK:             aie.next_bd ^bb5
-// CHECK:           ^bb5:
-// CHECK:             aie.use_lock(%[[OF1_CONS_CONS_LOCK]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[OF1_CONS_BUFF_1]] : memref<16xi32>) {len = 16 : i32}
-// CHECK:             aie.use_lock(%[[OF1_CONS_PROD_LOCK]], Release, 1)
-// CHECK:             aie.next_bd ^bb4
-// CHECK:           ^bb6:
-// CHECK:             aie.end
-// CHECK:           }
-// CHECK:           %[[MEM_2_2:.*]] = aie.mem(%[[TILE_2_2]]) {
-// CHECK:             %[[VAL_2:.*]] = aie.dma_start(S2MM, 0, ^bb1, ^bb3)
-// CHECK:           ^bb1:
-// CHECK:             aie.use_lock(%[[OF2_CONS_PROD_LOCK]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[OF2_CONS_BUFF_0]] : memref<16xi32>) {len = 16 : i32}
-// CHECK:             aie.use_lock(%[[OF2_CONS_CONS_LOCK]], Release, 1)
-// CHECK:             aie.next_bd ^bb2
-// CHECK:           ^bb2:
-// CHECK:             aie.use_lock(%[[OF2_CONS_PROD_LOCK]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[OF2_CONS_BUFF_1]] : memref<16xi32>) {len = 16 : i32}
-// CHECK:             aie.use_lock(%[[OF2_CONS_CONS_LOCK]], Release, 1)
-// CHECK:             aie.next_bd ^bb1
-// CHECK:           ^bb3:
-// CHECK:             aie.end
-// CHECK:           }
-// CHECK:         }
-module @link_AIE1 {
-  aie.device(xcvc1902) {
-    %tile20 = aie.tile(2, 0)
-    %tile12 = aie.tile(1, 2)
-    %tile22 = aie.tile(2, 2)
-    aie.flow(%tile20, DMA : 0, %tile12, DMA : 0) {symbol = @of1}
-    aie.flow(%tile12, DMA : 0, %tile22, DMA : 0) {symbol = @of2}
-    aie.objectfifo @of1 (%tile20, {%tile12}, 2 : i32) : !aie.objectfifo<memref<16xi32>>
-    aie.objectfifo @of2 (%tile12, {%tile22}, 2 : i32) : !aie.objectfifo<memref<16xi32>>
-    aie.objectfifo.link [@of1] -> [@of2] ([] [])
-    %ext_buff_in = aie.external_buffer {sym_name = "ext_buff_in"} : memref<16xi32>
-    aie.objectfifo.register_external_buffers @of1 (%tile20, {%ext_buff_in}) : (memref<16xi32>)
-  }
-}
diff --git a/compiler/plugins/target/AMD-AIE/aie/test/link_test_DDR_to_L1.mlir b/compiler/plugins/target/AMD-AIE/aie/test/link_test_DDR_to_L1.mlir
deleted file mode 100644
index dbd14e8f2..000000000
--- a/compiler/plugins/target/AMD-AIE/aie/test/link_test_DDR_to_L1.mlir
+++ /dev/null
@@ -1,80 +0,0 @@
-
-// RUN: iree-opt --amdaie-objectFifo-stateful-transform %s | FileCheck %s
-
-// CHECK-LABEL:   aie.device(xcve2302) {
-// CHECK:           memref.global "public" @from_memTile : memref<16xi32>
-// CHECK:           memref.global "public" @to_memTile : memref<16xi32>
-// CHECK-DAG:       %[[TILE_2_0:.*]] = aie.tile(2, 0)
-// CHECK-DAG:       %[[TILE_2_1:.*]] = aie.tile(2, 1)
-// CHECK-DAG:       %[[TILE_2_2:.*]] = aie.tile(2, 2)
-// CHECK-DAG:       %[[FROM_MEMTILE_CONS_BUFF_0:.*]] = aie.buffer(%[[TILE_2_2]]) {sym_name = "from_memTile_cons_buff_0_0"} : memref<16xi32>
-// CHECK-DAG:       %[[FROM_MEMTILE_CONS_BUFF_1:.*]] = aie.buffer(%[[TILE_2_2]]) {sym_name = "from_memTile_cons_buff_0_1"} : memref<16xi32>
-// CHECK-DAG:       %[[FROM_MEMTILE_CONS_PROD_LOCK:.*]] = aie.lock(%[[TILE_2_2]]) {init = 2 : i8, sym_name = "from_memTile_cons_prod_lock_0"}
-// CHECK-DAG:       %[[FROM_MEMTILE_CONS_CONS_LOCK:.*]] = aie.lock(%[[TILE_2_2]]) {init = 0 : i8, sym_name = "from_memTile_cons_cons_lock_0"}
-// CHECK-DAG:       %[[TO_MEMTILE_CONS_BUFF_0:.*]] = aie.buffer(%[[TILE_2_1]]) {sym_name = "to_memTile_link_buff_0_0"} : memref<16xi32>
-// CHECK-DAG:       %[[TO_MEMTILE_CONS_BUFF_1:.*]] = aie.buffer(%[[TILE_2_1]]) {sym_name = "to_memTile_link_buff_0_1"} : memref<16xi32>
-// CHECK-DAG:       %[[TO_MEMTILE_CONS_PROD_LOCK:.*]] = aie.lock(%[[TILE_2_1]]) {init = 2 : i8, sym_name = "to_memTile_link_prod_lock_0"}
-// CHECK-DAG:       %[[TO_MEMTILE_CONS_CONS_LOCK:.*]] = aie.lock(%[[TILE_2_1]]) {init = 0 : i8, sym_name = "to_memTile_link_cons_lock_0"}
-// CHECK-DAG:       %[[TO_MEMTILE_PROD_LOCK:.*]] = aie.lock(%[[TILE_2_0]]) {init = 0 : i8, sym_name = "to_memTile_prod_prod_lock_0"}
-// CHECK-DAG:       %[[TO_MEMTILE_CONS_LOCK:.*]] = aie.lock(%[[TILE_2_0]]) {init = 0 : i8, sym_name = "to_memTile_prod_cons_lock_0"}
-// CHECK-DAG:       aie.flow(%[[TILE_2_0]], DMA : 0, %[[TILE_2_1]], DMA : 0)
-// CHECK-DAG:       aie.flow(%[[TILE_2_1]], DMA : 0, %[[TILE_2_2]], DMA : 0)
-// CHECK-DAG:       %[[EXT_BUFF_IN:.*]] = aie.external_buffer {sym_name = "ext_buff_in"} : memref<16xi32>
-// CHECK-DAG:       aie.shim_dma_allocation @to_memTile(MM2S, 0, 2)
-// CHECK:           %[[MEMTILE_DMA_2_1:.*]] = aie.memtile_dma(%[[TILE_2_1]]) {
-// CHECK:             %[[VAL_0:.*]] = aie.dma_start(S2MM, 0, ^bb1, ^bb3)
-// CHECK:           ^bb1:
-// CHECK:             aie.use_lock(%[[TO_MEMTILE_CONS_PROD_LOCK]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[TO_MEMTILE_CONS_BUFF_0]] : memref<16xi32>) {len = 16 : i32}
-// CHECK:             aie.use_lock(%[[TO_MEMTILE_CONS_CONS_LOCK]], Release, 1)
-// CHECK:             aie.next_bd ^bb2
-// CHECK:           ^bb2:
-// CHECK:             aie.use_lock(%[[TO_MEMTILE_CONS_PROD_LOCK]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[TO_MEMTILE_CONS_BUFF_1]] : memref<16xi32>) {len = 16 : i32}
-// CHECK:             aie.use_lock(%[[TO_MEMTILE_CONS_CONS_LOCK]], Release, 1)
-// CHECK:             aie.next_bd ^bb1
-// CHECK:           ^bb3:
-// CHECK:             %[[VAL_1:.*]] = aie.dma_start(MM2S, 0, ^bb4, ^bb6)
-// CHECK:           ^bb4:
-// CHECK:             aie.use_lock(%[[TO_MEMTILE_CONS_CONS_LOCK]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[TO_MEMTILE_CONS_BUFF_0]] : memref<16xi32>) {len = 16 : i32}
-// CHECK:             aie.use_lock(%[[TO_MEMTILE_CONS_PROD_LOCK]], Release, 1)
-// CHECK:             aie.next_bd ^bb5
-// CHECK:           ^bb5:
-// CHECK:             aie.use_lock(%[[TO_MEMTILE_CONS_CONS_LOCK]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[TO_MEMTILE_CONS_BUFF_1]] : memref<16xi32>) {len = 16 : i32}
-// CHECK:             aie.use_lock(%[[TO_MEMTILE_CONS_PROD_LOCK]], Release, 1)
-// CHECK:             aie.next_bd ^bb4
-// CHECK:           ^bb6:
-// CHECK:             aie.end
-// CHECK:           }
-// CHECK:           %[[MEM_2_2:.*]] = aie.mem(%[[TILE_2_2]]) {
-// CHECK:             %[[VAL_2:.*]] = aie.dma_start(S2MM, 0, ^bb1, ^bb3)
-// CHECK:           ^bb1:
-// CHECK:             aie.use_lock(%[[FROM_MEMTILE_CONS_PROD_LOCK]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[FROM_MEMTILE_CONS_BUFF_0]] : memref<16xi32>) {len = 16 : i32}
-// CHECK:             aie.use_lock(%[[FROM_MEMTILE_CONS_CONS_LOCK]], Release, 1)
-// CHECK:             aie.next_bd ^bb2
-// CHECK:           ^bb2:
-// CHECK:             aie.use_lock(%[[FROM_MEMTILE_CONS_PROD_LOCK]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[FROM_MEMTILE_CONS_BUFF_1]] : memref<16xi32>) {len = 16 : i32}
-// CHECK:             aie.use_lock(%[[FROM_MEMTILE_CONS_CONS_LOCK]], Release, 1)
-// CHECK:             aie.next_bd ^bb1
-// CHECK:           ^bb3:
-// CHECK:             aie.end
-// CHECK:           }
-// CHECK:         }
-module @link_DDR_L1 {
-    aie.device(xcve2302) {
-        %tile20 = aie.tile(2, 0)
-        %tile21 = aie.tile(2, 1)
-        %tile22 = aie.tile(2, 2)
-        aie.flow(%tile20, DMA : 0, %tile21, DMA : 0) {symbol = @to_memTile}
-        aie.flow(%tile21, DMA : 0, %tile22, DMA : 0) {symbol = @from_memTile}
-        aie.objectfifo @to_memTile (%tile20, {%tile21}, 2 : i32) : !aie.objectfifo<memref<16xi32>>
-        aie.objectfifo @from_memTile (%tile21, {%tile22}, 2 : i32) : !aie.objectfifo<memref<16xi32>>
-        aie.objectfifo.link [@to_memTile] -> [@from_memTile] ([] [])
-        %ext_buff_in = aie.external_buffer {sym_name = "ext_buff_in"}: memref<16xi32>
-        aie.objectfifo.register_external_buffers @to_memTile (%tile20, {%ext_buff_in}) : (memref<16xi32>)
-    }
-}
diff --git a/compiler/plugins/target/AMD-AIE/aie/test/link_test_L1_to_DDR.mlir b/compiler/plugins/target/AMD-AIE/aie/test/link_test_L1_to_DDR.mlir
deleted file mode 100644
index c022a2a62..000000000
--- a/compiler/plugins/target/AMD-AIE/aie/test/link_test_L1_to_DDR.mlir
+++ /dev/null
@@ -1,81 +0,0 @@
-
-// RUN: iree-opt --amdaie-objectFifo-stateful-transform %s | FileCheck %s
-
-// CHECK-LABEL:   aie.device(xcve2302) {
-// CHECK:           memref.global "public" @from_memTile : memref<48xi32>
-// CHECK:           memref.global "public" @to_memTile : memref<16xi32>
-// CHECK-DAG:       %[[TILE_2_0:.*]] = aie.tile(2, 0)
-// CHECK-DAG:       %[[TILE_2_1:.*]] = aie.tile(2, 1)
-// CHECK-DAG:       %[[TILE_2_2:.*]] = aie.tile(2, 2)
-// CHECK-DAG:       %[[FROM_MEMTILE_CONS_PROD_LOCK:.*]] = aie.lock(%[[TILE_2_0]]) {init = 0 : i8, sym_name = "from_memTile_cons_prod_lock_0"}
-// CHECK-DAG:       %[[FROM_MEMTILE_CONS_CONS_LOCK:.*]] = aie.lock(%[[TILE_2_0]]) {init = 0 : i8, sym_name = "from_memTile_cons_cons_lock_0"}
-// CHECK-DAG:       %[[FROM_MEMTILE_BUFF_0:.*]] = aie.buffer(%[[TILE_2_1]]) {sym_name = "from_memTile_link_buff_0_0"} : memref<48xi32>
-// CHECK-DAG:       %[[FROM_MEMTILE_BUFF_1:.*]] = aie.buffer(%[[TILE_2_1]]) {sym_name = "from_memTile_link_buff_0_1"} : memref<48xi32>
-// CHECK-DAG:       %[[FROM_MEMTILE_PROD_LOCK:.*]] = aie.lock(%[[TILE_2_1]]) {init = 2 : i8, sym_name = "from_memTile_link_prod_lock_0"}
-// CHECK-DAG:       %[[FROM_MEMTILE_CONS_LOCK:.*]] = aie.lock(%[[TILE_2_1]]) {init = 0 : i8, sym_name = "from_memTile_link_cons_lock_0"}
-// CHECK-DAG:       %[[TO_MEMTILE_BUFF_0:.*]] = aie.buffer(%[[TILE_2_2]]) {sym_name = "to_memTile_prod_buff_0_0"} : memref<16xi32>
-// CHECK-DAG:       %[[TO_MEMTILE_BUFF_1:.*]] = aie.buffer(%[[TILE_2_2]]) {sym_name = "to_memTile_prod_buff_0_1"} : memref<16xi32>
-// CHECK-DAG:       %[[TO_MEMTILE_PROD_LOCK:.*]] = aie.lock(%[[TILE_2_2]]) {init = 2 : i8, sym_name = "to_memTile_prod_prod_lock_0"}
-// CHECK-DAG:       %[[TO_MEMTILE_CONS_LOCK:.*]] = aie.lock(%[[TILE_2_2]]) {init = 0 : i8, sym_name = "to_memTile_prod_cons_lock_0"}
-// CHECK-DAG:       aie.flow(%[[TILE_2_2]], DMA : 0, %[[TILE_2_1]], DMA : 0)
-// CHECK-DAG:       aie.flow(%[[TILE_2_1]], DMA : 0, %[[TILE_2_0]], DMA : 0)
-// CHECK-DAG:       %[[EXT_BUFF_IN:.*]] = aie.external_buffer {sym_name = "ext_buff_in"} : memref<48xi32>
-// CHECK:           %[[MEM_2_2:.*]] = aie.mem(%[[TILE_2_2]]) {
-// CHECK:             %[[VAL_0:.*]] = aie.dma_start(MM2S, 0, ^bb1, ^bb3)
-// CHECK:           ^bb1:
-// CHECK:             aie.use_lock(%[[TO_MEMTILE_CONS_LOCK]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[TO_MEMTILE_BUFF_0]] : memref<16xi32>) {len = 16 : i32}
-// CHECK:             aie.use_lock(%[[TO_MEMTILE_PROD_LOCK]], Release, 1)
-// CHECK:             aie.next_bd ^bb2
-// CHECK:           ^bb2:
-// CHECK:             aie.use_lock(%[[TO_MEMTILE_CONS_LOCK]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[TO_MEMTILE_BUFF_1]] : memref<16xi32>) {len = 16 : i32}
-// CHECK:             aie.use_lock(%[[TO_MEMTILE_PROD_LOCK]], Release, 1)
-// CHECK:             aie.next_bd ^bb1
-// CHECK:           ^bb3:
-// CHECK:             aie.end
-// CHECK:           }
-// CHECK:           aie.shim_dma_allocation @from_memTile(S2MM, 0, 2)
-// CHECK:           %[[MEMTILE_DMA_2_1:.*]] = aie.memtile_dma(%[[TILE_2_1]]) {
-// CHECK:             %[[VAL_1:.*]] = aie.dma_start(S2MM, 0, ^bb1, ^bb3)
-// CHECK:           ^bb1:
-// CHECK:             aie.use_lock(%[[FROM_MEMTILE_PROD_LOCK]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[FROM_MEMTILE_BUFF_0]] : memref<48xi32>) {len = 16 : i32}
-// CHECK:             aie.use_lock(%[[FROM_MEMTILE_CONS_LOCK]], Release, 1)
-// CHECK:             aie.next_bd ^bb2
-// CHECK:           ^bb2:
-// CHECK:             aie.use_lock(%[[FROM_MEMTILE_PROD_LOCK]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[FROM_MEMTILE_BUFF_1]] : memref<48xi32>) {len = 16 : i32}
-// CHECK:             aie.use_lock(%[[FROM_MEMTILE_CONS_LOCK]], Release, 1)
-// CHECK:             aie.next_bd ^bb1
-// CHECK:           ^bb3:
-// CHECK:             %[[VAL_2:.*]] = aie.dma_start(MM2S, 0, ^bb4, ^bb6)
-// CHECK:           ^bb4:
-// CHECK:             aie.use_lock(%[[FROM_MEMTILE_CONS_LOCK]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[FROM_MEMTILE_BUFF_0]] : memref<48xi32>) {len = 48 : i32}
-// CHECK:             aie.use_lock(%[[FROM_MEMTILE_PROD_LOCK]], Release, 1)
-// CHECK:             aie.next_bd ^bb5
-// CHECK:           ^bb5:
-// CHECK:             aie.use_lock(%[[FROM_MEMTILE_CONS_LOCK]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[FROM_MEMTILE_BUFF_1]] : memref<48xi32>) {len = 48 : i32}
-// CHECK:             aie.use_lock(%[[FROM_MEMTILE_PROD_LOCK]], Release, 1)
-// CHECK:             aie.next_bd ^bb4
-// CHECK:           ^bb6:
-// CHECK:             aie.end
-// CHECK:           }
-// CHECK:         }
-
-module @link_L1_DDR {
-    aie.device(xcve2302) {
-        %tile20 = aie.tile(2, 0)
-        %tile21 = aie.tile(2, 1)
-        %tile22 = aie.tile(2, 2)
-        aie.flow(%tile22, DMA : 0, %tile21, DMA : 0) {symbol = @to_memTile}
-        aie.flow(%tile21, DMA : 0, %tile20, DMA : 0) {symbol = @from_memTile}
-        aie.objectfifo @to_memTile (%tile22, {%tile21}, 2 : i32) : !aie.objectfifo<memref<16xi32>>
-        aie.objectfifo @from_memTile (%tile21, {%tile20}, 2 : i32) : !aie.objectfifo<memref<48xi32>>
-        aie.objectfifo.link [@to_memTile] -> [@from_memTile] ([] [])
-        %ext_buff_in = aie.external_buffer {sym_name = "ext_buff_in"}: memref<48xi32>
-        aie.objectfifo.register_external_buffers @from_memTile (%tile20, {%ext_buff_in}) : (memref<48xi32>)
-    }
-}
diff --git a/compiler/plugins/target/AMD-AIE/aie/test/link_test_broadcast.mlir b/compiler/plugins/target/AMD-AIE/aie/test/link_test_broadcast.mlir
deleted file mode 100644
index 7379bdbe7..000000000
--- a/compiler/plugins/target/AMD-AIE/aie/test/link_test_broadcast.mlir
+++ /dev/null
@@ -1,136 +0,0 @@
-
-// RUN: iree-opt --amdaie-objectFifo-stateful-transform %s | FileCheck %s
-
-// CHECK-LABEL:   aie.device(xcve2302) {
-// CHECK:           memref.global "public" @skip_connection : memref<16xi32>
-// CHECK:           memref.global "public" @link2 : memref<16xi32>
-// CHECK:           memref.global "public" @link1 : memref<48xi32>
-// CHECK-DAG:       %[[TILE_2_0:.*]] = aie.tile(2, 0)
-// CHECK-DAG:       %[[TILE_2_1:.*]] = aie.tile(2, 1)
-// CHECK-DAG:       %[[TILE_2_2:.*]] = aie.tile(2, 2)
-// CHECK-DAG:       %[[TILE_3_3:.*]] = aie.tile(3, 3)
-// CHECK-DAG:       %[[BUFFER_2_2:.*]] = aie.buffer(%[[TILE_2_2]]) {sym_name = "skip_connection_prod_buff_0_0"} : memref<16xi32>
-// CHECK-DAG:       %[[BUFFER_2_2_0:.*]] = aie.buffer(%[[TILE_2_2]]) {sym_name = "skip_connection_prod_buff_0_1"} : memref<16xi32>
-// CHECK-DAG:       %[[LOCK_2_2:.*]] = aie.lock(%[[TILE_2_2]]) {init = 2 : i8, sym_name = "skip_connection_prod_prod_lock_0"}
-// CHECK-DAG:       %[[LOCK_2_2_1:.*]] = aie.lock(%[[TILE_2_2]]) {init = 0 : i8, sym_name = "skip_connection_prod_cons_lock_0"}
-// CHECK-DAG:       %[[BUFFER_3_3:.*]] = aie.buffer(%[[TILE_3_3]]) {sym_name = "skip_connection_cons_buff_0_0"} : memref<16xi32>
-// CHECK-DAG:       %[[BUFFER_3_3_2:.*]] = aie.buffer(%[[TILE_3_3]]) {sym_name = "skip_connection_cons_buff_0_1"} : memref<16xi32>
-// CHECK-DAG:       %[[LOCK_3_3:.*]] = aie.lock(%[[TILE_3_3]]) {init = 2 : i8, sym_name = "skip_connection_cons_prod_lock_0"}
-// CHECK-DAG:       %[[LOCK_3_3_3:.*]] = aie.lock(%[[TILE_3_3]]) {init = 0 : i8, sym_name = "skip_connection_cons_cons_lock_0"}
-// CHECK-DAG:       %[[BUFFER_2_1:.*]] = aie.buffer(%[[TILE_2_1]]) {sym_name = "link1_link_buff_0_0"} : memref<48xi32>
-// CHECK-DAG:       %[[BUFFER_2_1_4:.*]] = aie.buffer(%[[TILE_2_1]]) {sym_name = "link1_link_buff_0_1"} : memref<48xi32>
-// CHECK-DAG:       %[[LOCK_2_1:.*]] = aie.lock(%[[TILE_2_1]]) {init = 2 : i8, sym_name = "link1_link_prod_lock_0"}
-// CHECK-DAG:       %[[LOCK_2_1_5:.*]] = aie.lock(%[[TILE_2_1]]) {init = 0 : i8, sym_name = "link1_link_cons_lock_0"}
-// CHECK-DAG:       %[[LOCK_2_0:.*]] = aie.lock(%[[TILE_2_0]]) {init = 0 : i8, sym_name = "link1_prod_prod_lock_0"}
-// CHECK-DAG:       %[[LOCK_2_0_6:.*]] = aie.lock(%[[TILE_2_0]]) {init = 0 : i8, sym_name = "link1_prod_cons_lock_0"}
-// CHECK-DAG:       %[[BUFFER_2_2_7:.*]] = aie.buffer(%[[TILE_2_2]]) {sym_name = "link2_cons_buff_0_0"} : memref<16xi32>
-// CHECK-DAG:       %[[BUFFER_2_2_8:.*]] = aie.buffer(%[[TILE_2_2]]) {sym_name = "link2_cons_buff_0_1"} : memref<16xi32>
-// CHECK-DAG:       %[[LOCK_2_2_9:.*]] = aie.lock(%[[TILE_2_2]]) {init = 2 : i8, sym_name = "link2_cons_prod_lock_0"}
-// CHECK-DAG:       %[[LOCK_2_2_10:.*]] = aie.lock(%[[TILE_2_2]]) {init = 0 : i8, sym_name = "link2_cons_cons_lock_0"}
-// CHECK-DAG:       %[[BUFFER_3_3_11:.*]] = aie.buffer(%[[TILE_3_3]]) {sym_name = "link2_cons_buff_1_0"} : memref<16xi32>
-// CHECK-DAG:       %[[BUFFER_3_3_12:.*]] = aie.buffer(%[[TILE_3_3]]) {sym_name = "link2_cons_buff_1_1"} : memref<16xi32>
-// CHECK-DAG:       %[[LOCK_3_3_13:.*]] = aie.lock(%[[TILE_3_3]]) {init = 2 : i8, sym_name = "link2_cons_prod_lock_1"}
-// CHECK-DAG:       %[[LOCK_3_3_14:.*]] = aie.lock(%[[TILE_3_3]]) {init = 0 : i8, sym_name = "link2_cons_cons_lock_1"}
-// CHECK-DAG:       aie.flow(%[[TILE_2_0]], DMA : 0, %[[TILE_2_1]], DMA : 0) {symbol = @link1}
-// CHECK-DAG:       aie.flow(%[[TILE_2_1]], DMA : 0, %[[TILE_3_3]], DMA : 0) {symbol = @link2}
-// CHECK-DAG:       aie.flow(%[[TILE_2_1]], DMA : 0, %[[TILE_2_2]], DMA : 0) {symbol = @link2}
-// CHECK-DAG:       aie.flow(%[[TILE_2_2]], DMA : 0, %[[TILE_3_3]], DMA : 1) {symbol = @skip_connection}
-// CHECK-DAG:       aie.shim_dma_allocation @link1(MM2S, 0, 2)
-// CHECK:           %[[MEMTILE_DMA_2_1:.*]] = aie.memtile_dma(%[[TILE_2_1]]) {
-// CHECK:             %[[VAL_0:.*]] = aie.dma_start(S2MM, 0, ^bb1, ^bb3)
-// CHECK:           ^bb1:
-// CHECK:             aie.use_lock(%[[LOCK_2_1]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[BUFFER_2_1]] : memref<48xi32>) {len = 48 : i32}
-// CHECK:             aie.use_lock(%[[LOCK_2_1_5]], Release, 1)
-// CHECK:             aie.next_bd ^bb2
-// CHECK:           ^bb2:
-// CHECK:             aie.use_lock(%[[LOCK_2_1]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[BUFFER_2_1_4]] : memref<48xi32>) {len = 48 : i32}
-// CHECK:             aie.use_lock(%[[LOCK_2_1_5]], Release, 1)
-// CHECK:             aie.next_bd ^bb1
-// CHECK:           ^bb3:
-// CHECK:             %[[VAL_1:.*]] = aie.dma_start(MM2S, 0, ^bb4, ^bb6)
-// CHECK:           ^bb4:
-// CHECK:             aie.use_lock(%[[LOCK_2_1_5]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[BUFFER_2_1]] : memref<48xi32>) {len = 16 : i32}
-// CHECK:             aie.use_lock(%[[LOCK_2_1]], Release, 1)
-// CHECK:             aie.next_bd ^bb5
-// CHECK:           ^bb5:
-// CHECK:             aie.use_lock(%[[LOCK_2_1_5]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[BUFFER_2_1_4]] : memref<48xi32>) {len = 16 : i32}
-// CHECK:             aie.use_lock(%[[LOCK_2_1]], Release, 1)
-// CHECK:             aie.next_bd ^bb4
-// CHECK:           ^bb6:
-// CHECK:             aie.end
-// CHECK:           }
-// CHECK:           %[[MEM_2_2:.*]] = aie.mem(%[[TILE_2_2]]) {
-// CHECK:             %[[VAL_2:.*]] = aie.dma_start(S2MM, 0, ^bb1, ^bb3)
-// CHECK:           ^bb1:
-// CHECK:             aie.use_lock(%[[LOCK_2_2_9]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[BUFFER_2_2_7]] : memref<16xi32>) {len = 16 : i32}
-// CHECK:             aie.use_lock(%[[LOCK_2_2_10]], Release, 1)
-// CHECK:             aie.next_bd ^bb2
-// CHECK:           ^bb2:
-// CHECK:             aie.use_lock(%[[LOCK_2_2_9]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[BUFFER_2_2_8]] : memref<16xi32>) {len = 16 : i32}
-// CHECK:             aie.use_lock(%[[LOCK_2_2_10]], Release, 1)
-// CHECK:             aie.next_bd ^bb1
-// CHECK:           ^bb3:
-// CHECK:             %[[VAL_3:.*]] = aie.dma_start(MM2S, 0, ^bb4, ^bb6)
-// CHECK:           ^bb4:
-// CHECK:             aie.use_lock(%[[LOCK_2_2_1]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[BUFFER_2_2]] : memref<16xi32>) {len = 16 : i32}
-// CHECK:             aie.use_lock(%[[LOCK_2_2]], Release, 1)
-// CHECK:             aie.next_bd ^bb5
-// CHECK:           ^bb5:
-// CHECK:             aie.use_lock(%[[LOCK_2_2_1]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[BUFFER_2_2_0]] : memref<16xi32>) {len = 16 : i32}
-// CHECK:             aie.use_lock(%[[LOCK_2_2]], Release, 1)
-// CHECK:             aie.next_bd ^bb4
-// CHECK:           ^bb6:
-// CHECK:             aie.end
-// CHECK:           }
-// CHECK:           %[[MEM_3_3:.*]] = aie.mem(%[[TILE_3_3]]) {
-// CHECK:             %[[VAL_4:.*]] = aie.dma_start(S2MM, 0, ^bb1, ^bb3)
-// CHECK:           ^bb1:
-// CHECK:             aie.use_lock(%[[LOCK_3_3_13]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[BUFFER_3_3_11]] : memref<16xi32>) {len = 16 : i32}
-// CHECK:             aie.use_lock(%[[LOCK_3_3_14]], Release, 1)
-// CHECK:             aie.next_bd ^bb2
-// CHECK:           ^bb2:
-// CHECK:             aie.use_lock(%[[LOCK_3_3_13]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[BUFFER_3_3_12]] : memref<16xi32>) {len = 16 : i32}
-// CHECK:             aie.use_lock(%[[LOCK_3_3_14]], Release, 1)
-// CHECK:             aie.next_bd ^bb1
-// CHECK:           ^bb3:
-// CHECK:             %[[VAL_5:.*]] = aie.dma_start(S2MM, 1, ^bb4, ^bb6)
-// CHECK:           ^bb4:
-// CHECK:             aie.use_lock(%[[LOCK_3_3]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[BUFFER_3_3]] : memref<16xi32>) {len = 16 : i32}
-// CHECK:             aie.use_lock(%[[LOCK_3_3_3]], Release, 1)
-// CHECK:             aie.next_bd ^bb5
-// CHECK:           ^bb5:
-// CHECK:             aie.use_lock(%[[LOCK_3_3]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[BUFFER_3_3_2]] : memref<16xi32>) {len = 16 : i32}
-// CHECK:             aie.use_lock(%[[LOCK_3_3_3]], Release, 1)
-// CHECK:             aie.next_bd ^bb4
-// CHECK:           ^bb6:
-// CHECK:             aie.end
-// CHECK:           }
-// CHECK:         }
-module @link_broadcast {
-  aie.device(xcve2302) {
-    %tile20 = aie.tile(2, 0)
-    %tile21 = aie.tile(2, 1)
-    %tile22 = aie.tile(2, 2)
-    %tile33 = aie.tile(3, 3)
-    aie.flow(%tile20, DMA : 0, %tile21, DMA : 0) {symbol = @link1}
-    aie.flow(%tile21, DMA : 0, %tile33, DMA : 0) {symbol = @link2}
-    aie.flow(%tile21, DMA : 0, %tile22, DMA : 0) {symbol = @link2}
-    aie.flow(%tile22, DMA : 0, %tile33, DMA : 1) {symbol = @skip_connection}
-    aie.objectfifo @link1 (%tile20, {%tile21}, 2 : i32) : !aie.objectfifo<memref<48xi32>>
-    aie.objectfifo @link2 (%tile21, {%tile22, %tile33}, [2]) : !aie.objectfifo<memref<16xi32>>
-    aie.objectfifo @skip_connection (%tile22, {%tile33}, 2 : i32) : !aie.objectfifo<memref<16xi32>>
-    aie.objectfifo.link [@link1] -> [@link2] ([] [])
-  }
-}
diff --git a/compiler/plugins/target/AMD-AIE/aie/test/link_test_distribute.mlir b/compiler/plugins/target/AMD-AIE/aie/test/link_test_distribute.mlir
deleted file mode 100644
index 9eb8048e5..000000000
--- a/compiler/plugins/target/AMD-AIE/aie/test/link_test_distribute.mlir
+++ /dev/null
@@ -1,155 +0,0 @@
-
-// RUN: iree-opt --amdaie-objectFifo-stateful-transform %s | FileCheck %s
-
-// CHECK-LABEL:   aie.device(xcve2302) {
-// CHECK:           memref.global "public" @link4 : memref<12xi32>
-// CHECK:           memref.global "public" @link3 : memref<20xi32>
-// CHECK:           memref.global "public" @link2 : memref<4x4xi32>
-// CHECK:           memref.global "public" @link1 : memref<48xi32>
-// CHECK-DAG:       %[[TILE_2_0:.*]] = aie.tile(2, 0)
-// CHECK-DAG:       %[[TILE_2_1:.*]] = aie.tile(2, 1)
-// CHECK-DAG:       %[[TILE_2_2:.*]] = aie.tile(2, 2)
-// CHECK-DAG:       %[[TILE_2_3:.*]] = aie.tile(2, 3)
-// CHECK-DAG:       %[[TILE_3_3:.*]] = aie.tile(3, 3)
-// CHECK-DAG:       %[[LINK4_CONS_BUFF_0:.*]] = aie.buffer(%[[TILE_3_3]]) {sym_name = "link4_cons_buff_2_0"} : memref<12xi32>
-// CHECK-DAG:       %[[LINK4_CONS_BUFF_1:.*]] = aie.buffer(%[[TILE_3_3]]) {sym_name = "link4_cons_buff_2_1"} : memref<12xi32>
-// CHECK-DAG:       %[[LINK4_CONS_PROD_LOCK:.*]] = aie.lock(%[[TILE_3_3]]) {init = 2 : i8, sym_name = "link4_cons_prod_lock_2"}
-// CHECK-DAG:       %[[LINK4_CONS_CONS_LOCK:.*]] = aie.lock(%[[TILE_3_3]]) {init = 0 : i8, sym_name = "link4_cons_cons_lock_2"}
-// CHECK-DAG:       %[[LINK3_CONS_BUFF_0:.*]] = aie.buffer(%[[TILE_2_3]]) {sym_name = "link3_cons_buff_1_0"} : memref<20xi32>
-// CHECK-DAG:       %[[LINK3_CONS_BUFF_1:.*]] = aie.buffer(%[[TILE_2_3]]) {sym_name = "link3_cons_buff_1_1"} : memref<20xi32>
-// CHECK-DAG:       %[[LINK3_CONS_PROD_LOCK:.*]] = aie.lock(%[[TILE_2_3]]) {init = 2 : i8, sym_name = "link3_cons_prod_lock_1"}
-// CHECK-DAG:       %[[LINK3_CONS_CONS_LOCK:.*]] = aie.lock(%[[TILE_2_3]]) {init = 0 : i8, sym_name = "link3_cons_cons_lock_1"}
-// CHECK-DAG:       %[[LINK2_CONS_BUFF_0:.*]] = aie.buffer(%[[TILE_2_2]]) {sym_name = "link2_cons_buff_0_0"} : memref<4x4xi32>
-// CHECK-DAG:       %[[LINK2_CONS_BUFF_1:.*]] = aie.buffer(%[[TILE_2_2]]) {sym_name = "link2_cons_buff_0_1"} : memref<4x4xi32>
-// CHECK-DAG:       %[[LINK2_CONS_PROD_LOCK:.*]] = aie.lock(%[[TILE_2_2]]) {init = 2 : i8, sym_name = "link2_cons_prod_lock_0"}
-// CHECK-DAG:       %[[LINK2_CONS_CONS_LOCK:.*]] = aie.lock(%[[TILE_2_2]]) {init = 0 : i8, sym_name = "link2_cons_cons_lock_0"}
-// CHECK-DAG:       %[[LINK1_CONS_BUFF_0:.*]] = aie.buffer(%[[TILE_2_1]]) {sym_name = "link1_link_buff_0_0"} : memref<48xi32>
-// CHECK-DAG:       %[[LINK1_CONS_BUFF_1:.*]] = aie.buffer(%[[TILE_2_1]]) {sym_name = "link1_link_buff_0_1"} : memref<48xi32>
-// CHECK-DAG:       %[[LINK1_CONS_PROD_LOCK:.*]] = aie.lock(%[[TILE_2_1]]) {init = 6 : i8, sym_name = "link1_link_prod_lock_0"}
-// CHECK-DAG:       %[[LINK1_CONS_CONS_LOCK:.*]] = aie.lock(%[[TILE_2_1]]) {init = 0 : i8, sym_name = "link1_link_cons_lock_0"}
-// CHECK-DAG:       %[[LINK1_PROD_LOCK:.*]] = aie.lock(%[[TILE_2_0]]) {init = 0 : i8, sym_name = "link1_prod_prod_lock_0"}
-// CHECK-DAG:       %[[LINK1_CONS_LOCK:.*]] = aie.lock(%[[TILE_2_0]]) {init = 0 : i8, sym_name = "link1_prod_cons_lock_0"}
-// CHECK-DAG:       aie.flow(%[[TILE_2_0]], DMA : 0, %[[TILE_2_1]], DMA : 0)
-// CHECK-DAG:       aie.flow(%[[TILE_2_1]], DMA : 0, %[[TILE_2_2]], DMA : 0)
-// CHECK-DAG:       aie.flow(%[[TILE_2_1]], DMA : 1, %[[TILE_2_3]], DMA : 0)
-// CHECK-DAG:       aie.flow(%[[TILE_2_1]], DMA : 2, %[[TILE_3_3]], DMA : 0)
-// CHECK-DAG:       %[[EXT_BUFFER_IN:.*]] = aie.external_buffer {sym_name = "ext_buffer_in"} : memref<48xi32>
-// CHECK-DAG:       aie.shim_dma_allocation @link1(MM2S, 0, 2)
-// CHECK:           %[[MEMTILE_DMA_2_1:.*]] = aie.memtile_dma(%[[TILE_2_1]]) {
-// CHECK:             %[[VAL_0:.*]] = aie.dma_start(S2MM, 0, ^bb1, ^bb3)
-// CHECK:           ^bb1:
-// CHECK:             aie.use_lock(%[[LINK1_CONS_PROD_LOCK]], AcquireGreaterEqual, 3)
-// CHECK:             aie.dma_bd(%[[LINK1_CONS_BUFF_0]] : memref<48xi32>) {len = 48 : i32}
-// CHECK:             aie.use_lock(%[[LINK1_CONS_CONS_LOCK]], Release, 3)
-// CHECK:             aie.next_bd ^bb2
-// CHECK:           ^bb2:
-// CHECK:             aie.use_lock(%[[LINK1_CONS_PROD_LOCK]], AcquireGreaterEqual, 3)
-// CHECK:             aie.dma_bd(%[[LINK1_CONS_BUFF_1]] : memref<48xi32>) {len = 48 : i32}
-// CHECK:             aie.use_lock(%[[LINK1_CONS_CONS_LOCK]], Release, 3)
-// CHECK:             aie.next_bd ^bb1
-// CHECK:           ^bb3:
-// CHECK:             %[[VAL_1:.*]] = aie.dma_start(MM2S, 0, ^bb4, ^bb6)
-// CHECK:           ^bb4:
-// CHECK:             aie.use_lock(%[[LINK1_CONS_CONS_LOCK]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[LINK1_CONS_BUFF_0]] : memref<48xi32>) {len = 16 : i32}
-// CHECK:             aie.use_lock(%[[LINK1_CONS_PROD_LOCK]], Release, 1)
-// CHECK:             aie.next_bd ^bb5
-// CHECK:           ^bb5:
-// CHECK:             aie.use_lock(%[[LINK1_CONS_CONS_LOCK]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[LINK1_CONS_BUFF_1]] : memref<48xi32>) {len = 16 : i32}
-// CHECK:             aie.use_lock(%[[LINK1_CONS_PROD_LOCK]], Release, 1)
-// CHECK:             aie.next_bd ^bb4
-// CHECK:           ^bb6:
-// CHECK:             %[[VAL_2:.*]] = aie.dma_start(MM2S, 1, ^bb7, ^bb9)
-// CHECK:           ^bb7:
-// CHECK:             aie.use_lock(%[[LINK1_CONS_CONS_LOCK]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[LINK1_CONS_BUFF_0]] : memref<48xi32>) {len = 20 : i32, offset = 16 : i32}
-// CHECK:             aie.use_lock(%[[LINK1_CONS_PROD_LOCK]], Release, 1)
-// CHECK:             aie.next_bd ^bb8
-// CHECK:           ^bb8:
-// CHECK:             aie.use_lock(%[[LINK1_CONS_CONS_LOCK]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[LINK1_CONS_BUFF_1]] : memref<48xi32>) {len = 20 : i32, offset = 16 : i32}
-// CHECK:             aie.use_lock(%[[LINK1_CONS_PROD_LOCK]], Release, 1)
-// CHECK:             aie.next_bd ^bb7
-// CHECK:           ^bb9:
-// CHECK:             %[[VAL_3:.*]] = aie.dma_start(MM2S, 2, ^bb10, ^bb12)
-// CHECK:           ^bb10:
-// CHECK:             aie.use_lock(%[[LINK1_CONS_CONS_LOCK]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[LINK1_CONS_BUFF_0]] : memref<48xi32>) {len = 12 : i32, offset = 36 : i32}
-// CHECK:             aie.use_lock(%[[LINK1_CONS_PROD_LOCK]], Release, 1)
-// CHECK:             aie.next_bd ^bb11
-// CHECK:           ^bb11:
-// CHECK:             aie.use_lock(%[[LINK1_CONS_CONS_LOCK]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[LINK1_CONS_BUFF_1]] : memref<48xi32>) {len = 12 : i32, offset = 36 : i32}
-// CHECK:             aie.use_lock(%[[LINK1_CONS_PROD_LOCK]], Release, 1)
-// CHECK:             aie.next_bd ^bb10
-// CHECK:           ^bb12:
-// CHECK:             aie.end
-// CHECK:           }
-// CHECK:           %[[MEM_2_2:.*]] = aie.mem(%[[TILE_2_2]]) {
-// CHECK:             %[[VAL_4:.*]] = aie.dma_start(S2MM, 0, ^bb1, ^bb3)
-// CHECK:           ^bb1:
-// CHECK:             aie.use_lock(%[[LINK2_CONS_PROD_LOCK]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[LINK2_CONS_BUFF_0]] : memref<4x4xi32>) {len = 16 : i32}
-// CHECK:             aie.use_lock(%[[LINK2_CONS_CONS_LOCK]], Release, 1)
-// CHECK:             aie.next_bd ^bb2
-// CHECK:           ^bb2:
-// CHECK:             aie.use_lock(%[[LINK2_CONS_PROD_LOCK]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[LINK2_CONS_BUFF_1]] : memref<4x4xi32>) {len = 16 : i32}
-// CHECK:             aie.use_lock(%[[LINK2_CONS_CONS_LOCK]], Release, 1)
-// CHECK:             aie.next_bd ^bb1
-// CHECK:           ^bb3:
-// CHECK:             aie.end
-// CHECK:           }
-// CHECK:           %[[MEM_2_3:.*]] = aie.mem(%[[TILE_2_3]]) {
-// CHECK:             %[[VAL_5:.*]] = aie.dma_start(S2MM, 0, ^bb1, ^bb3)
-// CHECK:           ^bb1:
-// CHECK:             aie.use_lock(%[[LINK3_CONS_PROD_LOCK]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[LINK3_CONS_BUFF_0]] : memref<20xi32>) {len = 20 : i32}
-// CHECK:             aie.use_lock(%[[LINK3_CONS_CONS_LOCK]], Release, 1)
-// CHECK:             aie.next_bd ^bb2
-// CHECK:           ^bb2:
-// CHECK:             aie.use_lock(%[[LINK3_CONS_PROD_LOCK]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[LINK3_CONS_BUFF_1]] : memref<20xi32>) {len = 20 : i32}
-// CHECK:             aie.use_lock(%[[LINK3_CONS_CONS_LOCK]], Release, 1)
-// CHECK:             aie.next_bd ^bb1
-// CHECK:           ^bb3:
-// CHECK:             aie.end
-// CHECK:           }
-// CHECK:           %[[MEM_3_3:.*]] = aie.mem(%[[TILE_3_3]]) {
-// CHECK:             %[[VAL_6:.*]] = aie.dma_start(S2MM, 0, ^bb1, ^bb3)
-// CHECK:           ^bb1:
-// CHECK:             aie.use_lock(%[[LINK4_CONS_PROD_LOCK]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[LINK4_CONS_BUFF_0]] : memref<12xi32>) {len = 12 : i32}
-// CHECK:             aie.use_lock(%[[LINK4_CONS_CONS_LOCK]], Release, 1)
-// CHECK:             aie.next_bd ^bb2
-// CHECK:           ^bb2:
-// CHECK:             aie.use_lock(%[[LINK4_CONS_PROD_LOCK]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[LINK4_CONS_BUFF_1]] : memref<12xi32>) {len = 12 : i32}
-// CHECK:             aie.use_lock(%[[LINK4_CONS_CONS_LOCK]], Release, 1)
-// CHECK:             aie.next_bd ^bb1
-// CHECK:           ^bb3:
-// CHECK:             aie.end
-// CHECK:           }
-// CHECK:         }
-
-module @link_distribute {
-  aie.device(xcve2302) {
-    %tile20 = aie.tile(2, 0)
-    %tile21 = aie.tile(2, 1)
-    %tile22 = aie.tile(2, 2)
-    %tile23 = aie.tile(2, 3)
-    %tile33 = aie.tile(3, 3)
-    aie.flow(%tile20, DMA : 0, %tile21, DMA : 0) {symbol = @link1}
-    aie.flow(%tile21, DMA : 0, %tile22, DMA : 0) {symbol = @link2}
-    aie.flow(%tile21, DMA : 1, %tile23, DMA : 0) {symbol = @link3}
-    aie.flow(%tile21, DMA : 2, %tile33, DMA : 0) {symbol = @link4}
-    aie.objectfifo @link1 (%tile20, {%tile21}, 2 : i32) : !aie.objectfifo<memref<48xi32>>
-    aie.objectfifo @link2 (%tile21, {%tile22}, 2 : i32) : !aie.objectfifo<memref<4x4xi32>>
-    aie.objectfifo @link3 (%tile21, {%tile23}, 2 : i32) : !aie.objectfifo<memref<20xi32>>
-    aie.objectfifo @link4 (%tile21, {%tile33}, 2 : i32) : !aie.objectfifo<memref<12xi32>>
-    %ext_buffer_in  = aie.external_buffer {sym_name = "ext_buffer_in"}: memref<48xi32>
-    aie.objectfifo.register_external_buffers @link1 (%tile20, {%ext_buffer_in}) : (memref<48xi32>)
-    aie.objectfifo.link [@link1] -> [@link2, @link3, @link4] ([] [])
-  }
-}
diff --git a/compiler/plugins/target/AMD-AIE/aie/test/link_test_join.mlir b/compiler/plugins/target/AMD-AIE/aie/test/link_test_join.mlir
deleted file mode 100644
index 101d94a45..000000000
--- a/compiler/plugins/target/AMD-AIE/aie/test/link_test_join.mlir
+++ /dev/null
@@ -1,191 +0,0 @@
-
-// RUN: iree-opt --amdaie-objectFifo-stateful-transform %s | FileCheck %s
-
-// CHECK-LABEL:   aie.device(xcve2302) {
-// CHECK-DAG:       memref.global "public" @link5 : memref<512xi8>
-// CHECK-DAG:       memref.global "public" @link4 : memref<128xi8>
-// CHECK-DAG:       memref.global "public" @link3 : memref<128xi8>
-// CHECK-DAG:       memref.global "public" @link2 : memref<128xi8>
-// CHECK-DAG:       memref.global "public" @link1 : memref<128xi8>
-// CHECK-DAG:       %[[TILE_2_0:.*]] = aie.tile(2, 0)
-// CHECK-DAG:       %[[TILE_2_1:.*]] = aie.tile(2, 1)
-// CHECK-DAG:       %[[TILE_1_2:.*]] = aie.tile(1, 2)
-// CHECK-DAG:       %[[TILE_2_2:.*]] = aie.tile(2, 2)
-// CHECK-DAG:       %[[TILE_2_3:.*]] = aie.tile(2, 3)
-// CHECK-DAG:       %[[TILE_3_3:.*]] = aie.tile(3, 3)
-// CHECK-DAG:       %[[LINK5_CONS_PROD_LOCK:.*]] = aie.lock(%[[TILE_2_0]]) {init = 0 : i8, sym_name = "link5_cons_prod_lock_0"}
-// CHECK-DAG:       %[[LINK5_CONS_CONS_LOCK:.*]] = aie.lock(%[[TILE_2_0]]) {init = 0 : i8, sym_name = "link5_cons_cons_lock_0"}
-// CHECK-DAG:       %[[LINK5_BUFF_0:.*]] = aie.buffer(%[[TILE_2_1]]) {sym_name = "link5_link_buff_0_0"} : memref<512xi8>
-// CHECK-DAG:       %[[LINK5_BUFF_1:.*]] = aie.buffer(%[[TILE_2_1]]) {sym_name = "link5_link_buff_0_1"} : memref<512xi8>
-// CHECK-DAG:       %[[LINK5_PROD_LOCK:.*]] = aie.lock(%[[TILE_2_1]]) {init = 8 : i8, sym_name = "link5_link_prod_lock_0"}
-// CHECK-DAG:       %[[LINK5_CONS_LOCK:.*]] = aie.lock(%[[TILE_2_1]]) {init = 0 : i8, sym_name = "link5_link_cons_lock_0"}
-// CHECK-DAG:       %[[LINK4_BUFF_0:.*]] = aie.buffer(%[[TILE_3_3]]) {sym_name = "link4_prod_buff_3_0"} : memref<128xi8>
-// CHECK-DAG:       %[[LINK4_BUFF_1:.*]] = aie.buffer(%[[TILE_3_3]]) {sym_name = "link4_prod_buff_3_1"} : memref<128xi8>
-// CHECK-DAG:       %[[LINK4_PROD_LOCK:.*]] = aie.lock(%[[TILE_3_3]]) {init = 2 : i8, sym_name = "link4_prod_prod_lock_3"}
-// CHECK-DAG:       %[[LINK4_CONS_LOCK:.*]] = aie.lock(%[[TILE_3_3]]) {init = 0 : i8, sym_name = "link4_prod_cons_lock_3"}
-// CHECK-DAG:       %[[LINK3_BUFF_0:.*]] = aie.buffer(%[[TILE_2_3]]) {sym_name = "link3_prod_buff_2_0"} : memref<128xi8>
-// CHECK-DAG:       %[[LINK3_BUFF_1:.*]] = aie.buffer(%[[TILE_2_3]]) {sym_name = "link3_prod_buff_2_1"} : memref<128xi8>
-// CHECK-DAG:       %[[LINK3_PROD_LOCK:.*]] = aie.lock(%[[TILE_2_3]]) {init = 2 : i8, sym_name = "link3_prod_prod_lock_2"}
-// CHECK-DAG:       %[[LINK3_CONS_LOCK:.*]] = aie.lock(%[[TILE_2_3]]) {init = 0 : i8, sym_name = "link3_prod_cons_lock_2"}
-// CHECK-DAG:       %[[LINK2_BUFF_0:.*]] = aie.buffer(%[[TILE_2_2]]) {sym_name = "link2_prod_buff_1_0"} : memref<128xi8>
-// CHECK-DAG:       %[[LINK2_BUFF_1:.*]] = aie.buffer(%[[TILE_2_2]]) {sym_name = "link2_prod_buff_1_1"} : memref<128xi8>
-// CHECK-DAG:       %[[LINK2_PROD_LOCK:.*]] = aie.lock(%[[TILE_2_2]]) {init = 2 : i8, sym_name = "link2_prod_prod_lock_1"}
-// CHECK-DAG:       %[[LINK2_CONS_LOCK:.*]] = aie.lock(%[[TILE_2_2]]) {init = 0 : i8, sym_name = "link2_prod_cons_lock_1"}
-// CHECK-DAG:       %[[LINK1_BUFF_0:.*]] = aie.buffer(%[[TILE_1_2]]) {sym_name = "link1_prod_buff_0_0"} : memref<128xi8>
-// CHECK-DAG:       %[[LINK1_BUFF_1:.*]] = aie.buffer(%[[TILE_1_2]]) {sym_name = "link1_prod_buff_0_1"} : memref<128xi8>
-// CHECK-DAG:       %[[LINK1_PROD_LOCK:.*]] = aie.lock(%[[TILE_1_2]]) {init = 2 : i8, sym_name = "link1_prod_prod_lock_0"}
-// CHECK-DAG:       %[[LINK1_CONS_LOCK:.*]] = aie.lock(%[[TILE_1_2]]) {init = 0 : i8, sym_name = "link1_prod_cons_lock_0"}
-// CHECK-DAG:       aie.flow(%[[TILE_1_2]], DMA : 0, %[[TILE_2_1]], DMA : 0)
-// CHECK-DAG:       aie.flow(%[[TILE_2_2]], DMA : 0, %[[TILE_2_1]], DMA : 1)
-// CHECK-DAG:       aie.flow(%[[TILE_2_3]], DMA : 0, %[[TILE_2_1]], DMA : 2)
-// CHECK-DAG:       aie.flow(%[[TILE_3_3]], DMA : 0, %[[TILE_2_1]], DMA : 3)
-// CHECK-DAG:       aie.flow(%[[TILE_2_1]], DMA : 0, %[[TILE_2_0]], DMA : 0)
-// CHECK-DAG:       %[[EXT_BUFFER_IN:.*]] = aie.external_buffer {sym_name = "ext_buffer_in"} : memref<512xi8>
-// CHECK:           %[[MEM_1_2:.*]] = aie.mem(%[[TILE_1_2]]) {
-// CHECK:             %[[VAL_0:.*]] = aie.dma_start(MM2S, 0, ^bb1, ^bb3)
-// CHECK:           ^bb1:
-// CHECK:             aie.use_lock(%[[LINK1_CONS_LOCK]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[LINK1_BUFF_0]] : memref<128xi8>) {len = 128 : i32}
-// CHECK:             aie.use_lock(%[[LINK1_PROD_LOCK]], Release, 1)
-// CHECK:             aie.next_bd ^bb2
-// CHECK:           ^bb2:
-// CHECK:             aie.use_lock(%[[LINK1_CONS_LOCK]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[LINK1_BUFF_1]] : memref<128xi8>) {len = 128 : i32}
-// CHECK:             aie.use_lock(%[[LINK1_PROD_LOCK]], Release, 1)
-// CHECK:             aie.next_bd ^bb1
-// CHECK:           ^bb3:
-// CHECK:             aie.end
-// CHECK:           }
-// CHECK:           %[[MEMTILE_DMA_2_1:.*]] = aie.memtile_dma(%[[TILE_2_1]]) {
-// CHECK:             %[[VAL_1:.*]] = aie.dma_start(S2MM, 0, ^bb1, ^bb3)
-// CHECK:           ^bb1:
-// CHECK:             aie.use_lock(%[[LINK5_PROD_LOCK]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[LINK5_BUFF_0]] : memref<512xi8>) {len = 128 : i32}
-// CHECK:             aie.use_lock(%[[LINK5_CONS_LOCK]], Release, 1)
-// CHECK:             aie.next_bd ^bb2
-// CHECK:           ^bb2:
-// CHECK:             aie.use_lock(%[[LINK5_PROD_LOCK]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[LINK5_BUFF_1]] : memref<512xi8>) {len = 128 : i32}
-// CHECK:             aie.use_lock(%[[LINK5_CONS_LOCK]], Release, 1)
-// CHECK:             aie.next_bd ^bb1
-// CHECK:           ^bb3:
-// CHECK:             %[[VAL_2:.*]] = aie.dma_start(S2MM, 1, ^bb4, ^bb6)
-// CHECK:           ^bb4:
-// CHECK:             aie.use_lock(%[[LINK5_PROD_LOCK]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[LINK5_BUFF_0]] : memref<512xi8>) {len = 128 : i32, offset = 128 : i32}
-// CHECK:             aie.use_lock(%[[LINK5_CONS_LOCK]], Release, 1)
-// CHECK:             aie.next_bd ^bb5
-// CHECK:           ^bb5:
-// CHECK:             aie.use_lock(%[[LINK5_PROD_LOCK]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[LINK5_BUFF_1]] : memref<512xi8>) {len = 128 : i32, offset = 128 : i32}
-// CHECK:             aie.use_lock(%[[LINK5_CONS_LOCK]], Release, 1)
-// CHECK:             aie.next_bd ^bb4
-// CHECK:           ^bb6:
-// CHECK:             %[[VAL_3:.*]] = aie.dma_start(S2MM, 2, ^bb7, ^bb9)
-// CHECK:           ^bb7:
-// CHECK:             aie.use_lock(%[[LINK5_PROD_LOCK]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[LINK5_BUFF_0]] : memref<512xi8>) {len = 128 : i32, offset = 256 : i32}
-// CHECK:             aie.use_lock(%[[LINK5_CONS_LOCK]], Release, 1)
-// CHECK:             aie.next_bd ^bb8
-// CHECK:           ^bb8:
-// CHECK:             aie.use_lock(%[[LINK5_PROD_LOCK]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[LINK5_BUFF_1]] : memref<512xi8>) {len = 128 : i32, offset = 256 : i32}
-// CHECK:             aie.use_lock(%[[LINK5_CONS_LOCK]], Release, 1)
-// CHECK:             aie.next_bd ^bb7
-// CHECK:           ^bb9:
-// CHECK:             %[[VAL_4:.*]] = aie.dma_start(S2MM, 3, ^bb10, ^bb12)
-// CHECK:           ^bb10:
-// CHECK:             aie.use_lock(%[[LINK5_PROD_LOCK]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[LINK5_BUFF_0]] : memref<512xi8>) {len = 128 : i32, offset = 384 : i32}
-// CHECK:             aie.use_lock(%[[LINK5_CONS_LOCK]], Release, 1)
-// CHECK:             aie.next_bd ^bb11
-// CHECK:           ^bb11:
-// CHECK:             aie.use_lock(%[[LINK5_PROD_LOCK]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[LINK5_BUFF_1]] : memref<512xi8>) {len = 128 : i32, offset = 384 : i32}
-// CHECK:             aie.use_lock(%[[LINK5_CONS_LOCK]], Release, 1)
-// CHECK:             aie.next_bd ^bb10
-// CHECK:           ^bb12:
-// CHECK:             %[[VAL_5:.*]] = aie.dma_start(MM2S, 0, ^bb13, ^bb15)
-// CHECK:           ^bb13:
-// CHECK:             aie.use_lock(%[[LINK5_CONS_LOCK]], AcquireGreaterEqual, 4)
-// CHECK:             aie.dma_bd(%[[LINK5_BUFF_0]] : memref<512xi8>) {len = 512 : i32}
-// CHECK:             aie.use_lock(%[[LINK5_PROD_LOCK]], Release, 4)
-// CHECK:             aie.next_bd ^bb14
-// CHECK:           ^bb14:
-// CHECK:             aie.use_lock(%[[LINK5_CONS_LOCK]], AcquireGreaterEqual, 4)
-// CHECK:             aie.dma_bd(%[[LINK5_BUFF_1]] : memref<512xi8>) {len = 512 : i32}
-// CHECK:             aie.use_lock(%[[LINK5_PROD_LOCK]], Release, 4)
-// CHECK:             aie.next_bd ^bb13
-// CHECK:           ^bb15:
-// CHECK:             aie.end
-// CHECK:           }
-// CHECK:           %[[MEM_2_2:.*]] = aie.mem(%[[TILE_2_2]]) {
-// CHECK:             %[[VAL_6:.*]] = aie.dma_start(MM2S, 0, ^bb1, ^bb3)
-// CHECK:           ^bb1:
-// CHECK:             aie.use_lock(%[[LINK2_CONS_LOCK]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[LINK2_BUFF_0]] : memref<128xi8>) {len = 128 : i32}
-// CHECK:             aie.use_lock(%[[LINK2_PROD_LOCK]], Release, 1)
-// CHECK:             aie.next_bd ^bb2
-// CHECK:           ^bb2:
-// CHECK:             aie.use_lock(%[[LINK2_CONS_LOCK]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[LINK2_BUFF_1]] : memref<128xi8>) {len = 128 : i32}
-// CHECK:             aie.use_lock(%[[LINK2_PROD_LOCK]], Release, 1)
-// CHECK:             aie.next_bd ^bb1
-// CHECK:           ^bb3:
-// CHECK:             aie.end
-// CHECK:           }
-// CHECK:           %[[MEM_2_3:.*]] = aie.mem(%[[TILE_2_3]]) {
-// CHECK:             %[[VAL_7:.*]] = aie.dma_start(MM2S, 0, ^bb1, ^bb3)
-// CHECK:           ^bb1:
-// CHECK:             aie.use_lock(%[[LINK3_CONS_LOCK]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[LINK3_BUFF_0]] : memref<128xi8>) {len = 128 : i32}
-// CHECK:             aie.use_lock(%[[LINK3_PROD_LOCK]], Release, 1)
-// CHECK:             aie.next_bd ^bb2
-// CHECK:           ^bb2:
-// CHECK:             aie.use_lock(%[[LINK3_CONS_LOCK]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[LINK3_BUFF_1]] : memref<128xi8>) {len = 128 : i32}
-// CHECK:             aie.use_lock(%[[LINK3_PROD_LOCK]], Release, 1)
-// CHECK:             aie.next_bd ^bb1
-// CHECK:           ^bb3:
-// CHECK:             aie.end
-// CHECK:           }
-// CHECK:           aie.shim_dma_allocation @link5(S2MM, 0, 2)
-// CHECK:           %[[MEM_3_3:.*]] = aie.mem(%[[TILE_3_3]]) {
-// CHECK:             %[[VAL_8:.*]] = aie.dma_start(MM2S, 0, ^bb1, ^bb3)
-// CHECK:           ^bb1:
-// CHECK:             aie.use_lock(%[[LINK4_CONS_LOCK]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[LINK4_BUFF_0]] : memref<128xi8>) {len = 128 : i32}
-// CHECK:             aie.use_lock(%[[LINK4_PROD_LOCK]], Release, 1)
-// CHECK:             aie.next_bd ^bb2
-// CHECK:           ^bb2:
-// CHECK:             aie.use_lock(%[[LINK4_CONS_LOCK]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[LINK4_BUFF_1]] : memref<128xi8>) {len = 128 : i32}
-// CHECK:             aie.use_lock(%[[LINK4_PROD_LOCK]], Release, 1)
-// CHECK:             aie.next_bd ^bb1
-// CHECK:           ^bb3:
-// CHECK:             aie.end
-// CHECK:           }
-// CHECK:         }
-module @link_join {
-  aie.device(xcve2302) {
-    %tile20 = aie.tile(2, 0)
-    %tile21 = aie.tile(2, 1)
-    %tile12 = aie.tile(1, 2)
-    %tile22 = aie.tile(2, 2)
-    %tile23 = aie.tile(2, 3)
-    %tile33 = aie.tile(3, 3)
-    aie.flow(%tile12, DMA : 0, %tile21, DMA : 0) {symbol = @link1}
-    aie.flow(%tile22, DMA : 0, %tile21, DMA : 1) {symbol = @link2}
-    aie.flow(%tile23, DMA : 0, %tile21, DMA : 2) {symbol = @link3}
-    aie.flow(%tile33, DMA : 0, %tile21, DMA : 3) {symbol = @link4}
-    aie.flow(%tile21, DMA : 0, %tile20, DMA : 0) {symbol = @link5}
-    aie.objectfifo @link1 (%tile12, {%tile21}, 2 : i32) : !aie.objectfifo<memref<128xi8>>
-    aie.objectfifo @link2 (%tile22, {%tile21}, 2 : i32) : !aie.objectfifo<memref<128xi8>>
-    aie.objectfifo @link3 (%tile23, {%tile21}, 2 : i32) : !aie.objectfifo<memref<128xi8>>
-    aie.objectfifo @link4 (%tile33, {%tile21}, 2 : i32) : !aie.objectfifo<memref<128xi8>>
-    aie.objectfifo @link5 (%tile21, {%tile20}, 2 : i32) : !aie.objectfifo<memref<512xi8>>
-    %ext_buffer_in  = aie.external_buffer {sym_name = "ext_buffer_in"}: memref<512xi8>
-    aie.objectfifo.register_external_buffers @link5 (%tile20, {%ext_buffer_in}) : (memref<512xi8>)
-    aie.objectfifo.link [@link1, @link2, @link3, @link4] -> [@link5] ([] [])
-  }
-}
diff --git a/compiler/plugins/target/AMD-AIE/aie/test/matmul_test.mlir b/compiler/plugins/target/AMD-AIE/aie/test/matmul_test.mlir
deleted file mode 100644
index 49bc2bbad..000000000
--- a/compiler/plugins/target/AMD-AIE/aie/test/matmul_test.mlir
+++ /dev/null
@@ -1,188 +0,0 @@
-
-// RUN: iree-opt --amdaie-objectFifo-stateful-transform %s | FileCheck %s
-
-// CHECK-LABEL:   aie.device(xcve2302) {
-// CHECK:           memref.global "public" @outC : memref<16x16xi16>
-// CHECK:           memref.global "public" @inB : memref<8x16xi16>
-// CHECK:           memref.global "public" @inA : memref<16x8xi16>
-// CHECK-DAG:       %[[TILE_0_0:.*]] = aie.tile(2, 0)
-// CHECK-DAG:       %[[TILE_0_2:.*]] = aie.tile(2, 2)
-// CHECK-DAG:       %[[OUTC_CONS_PROD_LOCK:.*]] = aie.lock(%[[TILE_0_0]]) {init = 0 : i8, sym_name = "outC_cons_prod_lock_0"}
-// CHECK-DAG:       %[[OUTC_CONS_CONS_LOCK:.*]] = aie.lock(%[[TILE_0_0]]) {init = 0 : i8, sym_name = "outC_cons_cons_lock_0"}
-// CHECK-DAG:       %[[OUTC_BUFF_0:.*]] = aie.buffer(%[[TILE_0_2]]) {sym_name = "outC_prod_buff_0_0"} : memref<16x16xi16>
-// CHECK-DAG:       %[[OUTC_BUFF_1:.*]] = aie.buffer(%[[TILE_0_2]]) {sym_name = "outC_prod_buff_0_1"} : memref<16x16xi16>
-// CHECK-DAG:       %[[OUTC_PROD_LOCK:.*]] = aie.lock(%[[TILE_0_2]]) {init = 2 : i8, sym_name = "outC_prod_prod_lock_0"}
-// CHECK-DAG:       %[[OUTC_CONS_LOCK:.*]] = aie.lock(%[[TILE_0_2]]) {init = 0 : i8, sym_name = "outC_prod_cons_lock_0"}
-// CHECK-DAG:       %[[INB_CONS_BUFF_0:.*]] = aie.buffer(%[[TILE_0_2]]) {sym_name = "inB_cons_buff_0_0"} : memref<8x16xi16>
-// CHECK-DAG:       %[[INB_CONS_BUFF_1:.*]] = aie.buffer(%[[TILE_0_2]]) {sym_name = "inB_cons_buff_0_1"} : memref<8x16xi16>
-// CHECK-DAG:       %[[INB_CONS_PROD_LOCK:.*]] = aie.lock(%[[TILE_0_2]]) {init = 2 : i8, sym_name = "inB_cons_prod_lock_0"}
-// CHECK-DAG:       %[[INB_CONS_CONS_LOCK:.*]] = aie.lock(%[[TILE_0_2]]) {init = 0 : i8, sym_name = "inB_cons_cons_lock_0"}
-// CHECK-DAG:       %[[INB_PROD_LOCK:.*]] = aie.lock(%[[TILE_0_0]]) {init = 0 : i8, sym_name = "inB_prod_prod_lock_0"}
-// CHECK-DAG:       %[[INB_CONS_LOCK:.*]] = aie.lock(%[[TILE_0_0]]) {init = 0 : i8, sym_name = "inB_prod_cons_lock_0"}
-// CHECK-DAG:       %[[INA_CONS_BUFF_0:.*]] = aie.buffer(%[[TILE_0_2]]) {sym_name = "inA_cons_buff_0_0"} : memref<16x8xi16>
-// CHECK-DAG:       %[[INA_CONS_BUFF_1:.*]] = aie.buffer(%[[TILE_0_2]]) {sym_name = "inA_cons_buff_0_1"} : memref<16x8xi16>
-// CHECK-DAG:       %[[INA_CONS_PROD_LOCK:.*]] = aie.lock(%[[TILE_0_2]]) {init = 2 : i8, sym_name = "inA_cons_prod_lock_0"}
-// CHECK-DAG:       %[[INA_CONS_CONS_LOCK:.*]] = aie.lock(%[[TILE_0_2]]) {init = 0 : i8, sym_name = "inA_cons_cons_lock_0"}
-// CHECK-DAG:       %[[INA_PROD_LOCK:.*]] = aie.lock(%[[TILE_0_0]]) {init = 0 : i8, sym_name = "inA_prod_prod_lock_0"}
-// CHECK-DAG:       %[[INA_CONS_LOCK:.*]] = aie.lock(%[[TILE_0_0]]) {init = 0 : i8, sym_name = "inA_prod_cons_lock_0"}
-// CHECK-DAG:       aie.flow(%[[TILE_0_0]], DMA : 0, %[[TILE_0_2]], DMA : 0)
-// CHECK-DAG:       aie.flow(%[[TILE_0_0]], DMA : 1, %[[TILE_0_2]], DMA : 1)
-// CHECK-DAG:       aie.flow(%[[TILE_0_2]], DMA : 0, %[[TILE_0_0]], DMA : 0)
-// CHECK:           func.func @zero_scalar_i16(%[[ARG0:.*]]: memref<16x16xi16>) {
-// CHECK:             return
-// CHECK:           }
-// CHECK:           func.func @matmul_scalar_i16_i16(%[[ARG0:.*]]: memref<16x8xi16>, %[[ARG1:.*]]: memref<8x16xi16>, %[[ARG2:.*]]: memref<16x16xi16>) {
-// CHECK:             return
-// CHECK:           }
-// CHECK:           aie.shim_dma_allocation @inA(MM2S, 0, 2)
-// CHECK:           %[[CORE_0_2:.*]] = aie.core(%[[TILE_0_2]]) {
-// CHECK-DAG:         %[[C0:.*]] = arith.constant 0 : index
-// CHECK-DAG:         %[[C1:.*]] = arith.constant 1 : index
-// CHECK-DAG:         %[[C2:.*]] = arith.constant 2 : index
-// CHECK-DAG:         %[[C4:.*]] = arith.constant 4 : index
-// CHECK-DAG:         %[[C4294967295:.*]] = arith.constant 4294967295 : index
-// CHECK:             scf.for %[[ARG0:.*]] = %[[C0]] to %[[C4294967295]] step %[[C1]] {
-// CHECK:               scf.for %[[ARG1:.*]] = %[[C0]] to %[[C4]] step %[[C2]] {
-// CHECK:                 aie.use_lock(%[[OUTC_PROD_LOCK]], AcquireGreaterEqual, 1)
-// CHECK:                 func.call @zero_scalar_i16(%[[OUTC_BUFF_0]]) : (memref<16x16xi16>) -> ()
-// CHECK:                 scf.for %[[ARG2:.*]] = %[[C0]] to %[[C4]] step %[[C2]] {
-// CHECK:                   aie.use_lock(%[[INA_CONS_CONS_LOCK]], AcquireGreaterEqual, 1)
-// CHECK:                   aie.use_lock(%[[INB_CONS_CONS_LOCK]], AcquireGreaterEqual, 1)
-// CHECK:                   func.call @matmul_scalar_i16_i16(%[[INA_CONS_BUFF_0]], %[[INB_CONS_BUFF_0]], %[[OUTC_BUFF_0]]) : (memref<16x8xi16>, memref<8x16xi16>, memref<16x16xi16>) -> ()
-// CHECK:                   aie.use_lock(%[[INA_CONS_PROD_LOCK]], Release, 1)
-// CHECK:                   aie.use_lock(%[[INB_CONS_PROD_LOCK]], Release, 1)
-// CHECK:                   aie.use_lock(%[[INA_CONS_CONS_LOCK]], AcquireGreaterEqual, 1)
-// CHECK:                   aie.use_lock(%[[INB_CONS_CONS_LOCK]], AcquireGreaterEqual, 1)
-// CHECK:                   func.call @matmul_scalar_i16_i16(%[[INA_CONS_BUFF_1]], %[[INB_CONS_BUFF_1]], %[[OUTC_BUFF_0]]) : (memref<16x8xi16>, memref<8x16xi16>, memref<16x16xi16>) -> ()
-// CHECK:                   aie.use_lock(%[[INA_CONS_PROD_LOCK]], Release, 1)
-// CHECK:                   aie.use_lock(%[[INB_CONS_PROD_LOCK]], Release, 1)
-// CHECK:                 }
-// CHECK:                 aie.use_lock(%[[OUTC_CONS_LOCK]], Release, 1)
-// CHECK:                 aie.use_lock(%[[OUTC_PROD_LOCK]], AcquireGreaterEqual, 1)
-// CHECK:                 func.call @zero_scalar_i16(%[[OUTC_BUFF_1]]) : (memref<16x16xi16>) -> ()
-// CHECK:                 scf.for %[[ARG2:.*]] = %[[C0]] to %[[C4]] step %[[C2]] {
-// CHECK:                   aie.use_lock(%[[INA_CONS_CONS_LOCK]], AcquireGreaterEqual, 1)
-// CHECK:                   aie.use_lock(%[[INB_CONS_CONS_LOCK]], AcquireGreaterEqual, 1)
-// CHECK:                   func.call @matmul_scalar_i16_i16(%[[INA_CONS_BUFF_0]], %[[INB_CONS_BUFF_0]], %[[OUTC_BUFF_1]]) : (memref<16x8xi16>, memref<8x16xi16>, memref<16x16xi16>) -> ()
-// CHECK:                   aie.use_lock(%[[INA_CONS_PROD_LOCK]], Release, 1)
-// CHECK:                   aie.use_lock(%[[INB_CONS_PROD_LOCK]], Release, 1)
-// CHECK:                   aie.use_lock(%[[INA_CONS_CONS_LOCK]], AcquireGreaterEqual, 1)
-// CHECK:                   aie.use_lock(%[[INB_CONS_CONS_LOCK]], AcquireGreaterEqual, 1)
-// CHECK:                   func.call @matmul_scalar_i16_i16(%[[INA_CONS_BUFF_1]], %[[INB_CONS_BUFF_1]], %[[OUTC_BUFF_1]]) : (memref<16x8xi16>, memref<8x16xi16>, memref<16x16xi16>) -> ()
-// CHECK:                   aie.use_lock(%[[INA_CONS_PROD_LOCK]], Release, 1)
-// CHECK:                   aie.use_lock(%[[INB_CONS_PROD_LOCK]], Release, 1)
-// CHECK:                 }
-// CHECK:                 aie.use_lock(%[[OUTC_CONS_LOCK]], Release, 1)
-// CHECK:               }
-// CHECK:             }
-// CHECK:             aie.end
-// CHECK:           }
-// CHECK:           aie.shim_dma_allocation @inB(MM2S, 1, 2)
-// CHECK:           aie.shim_dma_allocation @outC(S2MM, 0, 2)
-// CHECK:           %[[MEM_0_2:.*]] = aie.mem(%[[TILE_0_2]]) {
-// CHECK:             %[[VAL_0:.*]] = aie.dma_start(S2MM, 0, ^bb1, ^bb3)
-// CHECK:           ^bb1:
-// CHECK:             aie.use_lock(%[[INA_CONS_PROD_LOCK]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[INA_CONS_BUFF_0]] : memref<16x8xi16>) {len = 128 : i32}
-// CHECK:             aie.use_lock(%[[INA_CONS_CONS_LOCK]], Release, 1)
-// CHECK:             aie.next_bd ^bb2
-// CHECK:           ^bb2:
-// CHECK:             aie.use_lock(%[[INA_CONS_PROD_LOCK]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[INA_CONS_BUFF_1]] : memref<16x8xi16>) {len = 128 : i32}
-// CHECK:             aie.use_lock(%[[INA_CONS_CONS_LOCK]], Release, 1)
-// CHECK:             aie.next_bd ^bb1
-// CHECK:           ^bb3:
-// CHECK:             %[[VAL_1:.*]] = aie.dma_start(S2MM, 1, ^bb4, ^bb6)
-// CHECK:           ^bb4:
-// CHECK:             aie.use_lock(%[[INB_CONS_PROD_LOCK]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[INB_CONS_BUFF_0]] : memref<8x16xi16>) {len = 128 : i32}
-// CHECK:             aie.use_lock(%[[INB_CONS_CONS_LOCK]], Release, 1)
-// CHECK:             aie.next_bd ^bb5
-// CHECK:           ^bb5:
-// CHECK:             aie.use_lock(%[[INB_CONS_PROD_LOCK]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[INB_CONS_BUFF_1]] : memref<8x16xi16>) {len = 128 : i32}
-// CHECK:             aie.use_lock(%[[INB_CONS_CONS_LOCK]], Release, 1)
-// CHECK:             aie.next_bd ^bb4
-// CHECK:           ^bb6:
-// CHECK:             %[[VAL_2:.*]] = aie.dma_start(MM2S, 0, ^bb7, ^bb9)
-// CHECK:           ^bb7:
-// CHECK:             aie.use_lock(%[[OUTC_CONS_LOCK]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[OUTC_BUFF_0]] : memref<16x16xi16>) {len = 256 : i32}
-// CHECK:             aie.use_lock(%[[OUTC_PROD_LOCK]], Release, 1)
-// CHECK:             aie.next_bd ^bb8
-// CHECK:           ^bb8:
-// CHECK:             aie.use_lock(%[[OUTC_CONS_LOCK]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[OUTC_BUFF_1]] : memref<16x16xi16>) {len = 256 : i32}
-// CHECK:             aie.use_lock(%[[OUTC_PROD_LOCK]], Release, 1)
-// CHECK:             aie.next_bd ^bb7
-// CHECK:           ^bb9:
-// CHECK:             aie.end
-// CHECK:           }
-// CHECK:         }
-
-module @matmul {
-  aie.device(xcve2302) {
-    %t00 = aie.tile(2, 0)
-    %t02 = aie.tile(2, 2)
-    aie.flow(%t00, DMA : 0, %t02, DMA : 0) {symbol = @inA}
-    aie.flow(%t00, DMA : 1, %t02, DMA : 1) {symbol = @inB}
-    aie.flow(%t02, DMA : 0, %t00, DMA : 0) {symbol = @outC}
-    aie.objectfifo @inA  (%t00, { %t02 }, 2 : i32) : !aie.objectfifo<memref<16x8xi16>>
-    aie.objectfifo @inB  (%t00, { %t02 }, 2 : i32) : !aie.objectfifo<memref<8x16xi16>>
-    aie.objectfifo @outC (%t02, { %t00 }, 2 : i32) : !aie.objectfifo<memref<16x16xi16>>
-    func.func @zero_scalar_i16(%elem0 : memref<16x16xi16>) -> () { return }
-    func.func @matmul_scalar_i16_i16(%elem0 : memref<16x8xi16>, %elem1 : memref<8x16xi16>, %elem2 : memref<16x16xi16>) -> () { return }
-    aie.core(%t02) {
-      %c0 = arith.constant 0 : index
-      %c1 = arith.constant 1 : index
-      %c2 = arith.constant 2 : index
-      %c4 = arith.constant 4 : index
-      %intmax = arith.constant 0xFFFFFFFF : index
-      scf.for %reps = %c0 to %intmax step %c1 {
-        scf.for %arg2 = %c0 to %c4 step %c2 {
-          %subview4 = aie.objectfifo.acquire @outC (Produce, 1) : !aie.objectfifosubview<memref<16x16xi16>>
-          %elem4 = aie.objectfifo.subview.access %subview4[0] : !aie.objectfifosubview<memref<16x16xi16>> -> memref<16x16xi16>
-          func.call @zero_scalar_i16(%elem4) : (memref<16x16xi16>) -> ()
-          scf.for %arg3 = %c0 to %c4 step %c2 {
-            %subview0 = aie.objectfifo.acquire @inA (Consume, 1) : !aie.objectfifosubview<memref<16x8xi16>>
-            %elem0 = aie.objectfifo.subview.access %subview0[0] : !aie.objectfifosubview<memref<16x8xi16>> -> memref<16x8xi16>
-            %subview1 = aie.objectfifo.acquire @inB (Consume, 1) : !aie.objectfifosubview<memref<8x16xi16>>
-            %elem1 = aie.objectfifo.subview.access %subview1[0] : !aie.objectfifosubview<memref<8x16xi16>> -> memref<8x16xi16>
-            func.call @matmul_scalar_i16_i16(%elem0, %elem1, %elem4) : (memref<16x8xi16>, memref<8x16xi16>, memref<16x16xi16>) -> ()
-            aie.objectfifo.release @inA (Consume, 1)
-            aie.objectfifo.release @inB (Consume, 1)
-            %subview2 = aie.objectfifo.acquire @inA (Consume, 1) : !aie.objectfifosubview<memref<16x8xi16>>
-            %elem2 = aie.objectfifo.subview.access %subview2[0] : !aie.objectfifosubview<memref<16x8xi16>> -> memref<16x8xi16>
-            %subview3 = aie.objectfifo.acquire @inB (Consume, 1) : !aie.objectfifosubview<memref<8x16xi16>>
-            %elem3 = aie.objectfifo.subview.access %subview3[0] : !aie.objectfifosubview<memref<8x16xi16>> -> memref<8x16xi16>
-            func.call @matmul_scalar_i16_i16(%elem2, %elem3, %elem4) : (memref<16x8xi16>, memref<8x16xi16>, memref<16x16xi16>) -> ()
-            aie.objectfifo.release @inA (Consume, 1)
-            aie.objectfifo.release @inB (Consume, 1)
-          }
-          aie.objectfifo.release @outC (Produce, 1)
-          %subview5 = aie.objectfifo.acquire @outC (Produce, 1) : !aie.objectfifosubview<memref<16x16xi16>>
-          %elem5 = aie.objectfifo.subview.access %subview5[0] : !aie.objectfifosubview<memref<16x16xi16>> -> memref<16x16xi16>
-          func.call @zero_scalar_i16(%elem5) : (memref<16x16xi16>) -> ()
-          scf.for %arg3 = %c0 to %c4 step %c2 {
-            %subview0 = aie.objectfifo.acquire @inA (Consume, 1) : !aie.objectfifosubview<memref<16x8xi16>>
-            %elem0 = aie.objectfifo.subview.access %subview0[0] : !aie.objectfifosubview<memref<16x8xi16>> -> memref<16x8xi16>
-            %subview1 = aie.objectfifo.acquire @inB (Consume, 1) : !aie.objectfifosubview<memref<8x16xi16>>
-            %elem1 = aie.objectfifo.subview.access %subview1[0] : !aie.objectfifosubview<memref<8x16xi16>> -> memref<8x16xi16>
-            func.call @matmul_scalar_i16_i16(%elem0, %elem1, %elem5) : (memref<16x8xi16>, memref<8x16xi16>, memref<16x16xi16>) -> ()
-            aie.objectfifo.release @inA (Consume, 1)
-            aie.objectfifo.release @inB (Consume, 1)
-            %subview2 = aie.objectfifo.acquire @inA (Consume, 1) : !aie.objectfifosubview<memref<16x8xi16>>
-            %elem2 = aie.objectfifo.subview.access %subview2[0] : !aie.objectfifosubview<memref<16x8xi16>> -> memref<16x8xi16>
-            %subview3 = aie.objectfifo.acquire @inB (Consume, 1) : !aie.objectfifosubview<memref<8x16xi16>>
-            %elem3 = aie.objectfifo.subview.access %subview3[0] : !aie.objectfifosubview<memref<8x16xi16>> -> memref<8x16xi16>
-            func.call @matmul_scalar_i16_i16(%elem2, %elem3, %elem5) : (memref<16x8xi16>, memref<8x16xi16>, memref<16x16xi16>) -> ()
-            aie.objectfifo.release @inA (Consume, 1)
-            aie.objectfifo.release @inB (Consume, 1)
-          }
-          aie.objectfifo.release @outC (Produce, 1)
-        }
-      }
-      aie.end
-    }
-  }
-}
diff --git a/compiler/plugins/target/AMD-AIE/aie/test/memTile_test.mlir b/compiler/plugins/target/AMD-AIE/aie/test/memTile_test.mlir
deleted file mode 100644
index c13c13319..000000000
--- a/compiler/plugins/target/AMD-AIE/aie/test/memTile_test.mlir
+++ /dev/null
@@ -1,55 +0,0 @@
-
-// RUN: iree-opt --amdaie-objectFifo-stateful-transform %s | FileCheck %s
-
-// CHECK-LABEL:   aie.device(xcve2302) {
-// CHECK:           memref.global "public" @of : memref<16xi32>
-// CHECK-DAG:       %[[TILE_2_1:.*]] = aie.tile(2, 1)
-// CHECK-DAG:       %[[TILE_2_2:.*]] = aie.tile(2, 2)
-// CHECK-DAG:       %[[OF_CONS_BUFF_0:.*]] = aie.buffer(%[[TILE_2_2]]) {sym_name = "of_cons_buff_0_0"} : memref<16xi32>
-// CHECK-DAG:       %[[OF_CONS_BUFF_1:.*]] = aie.buffer(%[[TILE_2_2]]) {sym_name = "of_cons_buff_0_1"} : memref<16xi32>
-// CHECK-DAG:       %[[OF_CONS_PROD_LOCK:.*]] = aie.lock(%[[TILE_2_2]]) {init = 2 : i8, sym_name = "of_cons_prod_lock_0"}
-// CHECK-DAG:       %[[OF_CONS_CONS_LOCK:.*]] = aie.lock(%[[TILE_2_2]]) {init = 0 : i8, sym_name = "of_cons_cons_lock_0"}
-// CHECK-DAG:       %[[OF_BUFF_0:.*]] = aie.buffer(%[[TILE_2_1]]) {sym_name = "of_prod_buff_0_0"} : memref<16xi32>
-// CHECK-DAG:       %[[OF_BUFF_1:.*]] = aie.buffer(%[[TILE_2_1]]) {sym_name = "of_prod_buff_0_1"} : memref<16xi32>
-// CHECK-DAG:       %[[OF_PROD_LOCK:.*]] = aie.lock(%[[TILE_2_1]]) {init = 2 : i8, sym_name = "of_prod_prod_lock_0"}
-// CHECK-DAG:       %[[OF_CONS_LOCK:.*]] = aie.lock(%[[TILE_2_1]]) {init = 0 : i8, sym_name = "of_prod_cons_lock_0"}
-// CHECK-DAG:       aie.flow(%[[TILE_2_1]], DMA : 0, %[[TILE_2_2]], DMA : 0)
-// CHECK:           %[[MEMTILE_DMA_2_1:.*]] = aie.memtile_dma(%[[TILE_2_1]]) {
-// CHECK:             %[[VAL_0:.*]] = aie.dma_start(MM2S, 0, ^bb1, ^bb3)
-// CHECK:           ^bb1:
-// CHECK:             aie.use_lock(%[[OF_CONS_LOCK]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[OF_BUFF_0]] : memref<16xi32>) {len = 16 : i32}
-// CHECK:             aie.use_lock(%[[OF_PROD_LOCK]], Release, 1)
-// CHECK:             aie.next_bd ^bb2
-// CHECK:           ^bb2:
-// CHECK:             aie.use_lock(%[[OF_CONS_LOCK]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[OF_BUFF_1]] : memref<16xi32>) {len = 16 : i32}
-// CHECK:             aie.use_lock(%[[OF_PROD_LOCK]], Release, 1)
-// CHECK:             aie.next_bd ^bb1
-// CHECK:           ^bb3:
-// CHECK:             aie.end
-// CHECK:           }
-// CHECK:           %[[MEM_2_2:.*]] = aie.mem(%[[TILE_2_2]]) {
-// CHECK:             %[[VAL_1:.*]] = aie.dma_start(S2MM, 0, ^bb1, ^bb3)
-// CHECK:           ^bb1:
-// CHECK:             aie.use_lock(%[[OF_CONS_PROD_LOCK]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[OF_CONS_BUFF_0]] : memref<16xi32>) {len = 16 : i32}
-// CHECK:             aie.use_lock(%[[OF_CONS_CONS_LOCK]], Release, 1)
-// CHECK:             aie.next_bd ^bb2
-// CHECK:           ^bb2:
-// CHECK:             aie.use_lock(%[[OF_CONS_PROD_LOCK]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[OF_CONS_BUFF_1]] : memref<16xi32>) {len = 16 : i32}
-// CHECK:             aie.use_lock(%[[OF_CONS_CONS_LOCK]], Release, 1)
-// CHECK:             aie.next_bd ^bb1
-// CHECK:           ^bb3:
-// CHECK:             aie.end
-// CHECK:           }
-// CHECK:         }
-module @memTile {
-  aie.device(xcve2302) {
-    %tile11 = aie.tile(2, 1)
-    %tile12 = aie.tile(2, 2)
-    aie.flow(%tile11, DMA : 0, %tile12, DMA : 0) {symbol = @of}
-    aie.objectfifo @of (%tile11, {%tile12}, 2 : i32) : !aie.objectfifo<memref<16xi32>>
-  }
-}
diff --git a/compiler/plugins/target/AMD-AIE/aie/test/nd_dma_base_AIE2.mlir b/compiler/plugins/target/AMD-AIE/aie/test/nd_dma_base_AIE2.mlir
deleted file mode 100644
index f8b66fb1e..000000000
--- a/compiler/plugins/target/AMD-AIE/aie/test/nd_dma_base_AIE2.mlir
+++ /dev/null
@@ -1,126 +0,0 @@
-
-// RUN: iree-opt --amdaie-objectFifo-stateful-transform %s | FileCheck %s
-
-// CHECK-LABEL:   aie.device(xcve2302) {
-// CHECK:           memref.global "public" @of1 : memref<256xi32>
-// CHECK:           memref.global "public" @of0 : memref<256xi32>
-// CHECK-DAG:       %[[TILE_1_2:.*]] = aie.tile(1, 2)
-// CHECK-DAG:       %[[TILE_1_3:.*]] = aie.tile(1, 3)
-// CHECK-DAG:       %[[TILE_3_3:.*]] = aie.tile(3, 3)
-// CHECK-DAG:       %[[OF1_CONS_BUFF_0:.*]] = aie.buffer(%[[TILE_3_3]]) {sym_name = "of1_cons_buff_0_0"} : memref<256xi32>
-// CHECK-DAG:       %[[OF1_CONS_BUFF_1:.*]] = aie.buffer(%[[TILE_3_3]]) {sym_name = "of1_cons_buff_0_1"} : memref<256xi32>
-// CHECK-DAG:       %[[OF1_CONS_PROD_LOCK:.*]] = aie.lock(%[[TILE_3_3]]) {init = 2 : i8, sym_name = "of1_cons_prod_lock_0"}
-// CHECK-DAG:       %[[OF1_CONS_CONS_LOCK:.*]] = aie.lock(%[[TILE_3_3]]) {init = 0 : i8, sym_name = "of1_cons_cons_lock_0"}
-// CHECK-DAG:       %[[OF1_BUFF_0:.*]] = aie.buffer(%[[TILE_1_2]]) {sym_name = "of1_prod_buff_0_0"} : memref<256xi32>
-// CHECK-DAG:       %[[OF1_BUFF_1:.*]] = aie.buffer(%[[TILE_1_2]]) {sym_name = "of1_prod_buff_0_1"} : memref<256xi32>
-// CHECK-DAG:       %[[OF1_PROD_LOCK:.*]] = aie.lock(%[[TILE_1_2]]) {init = 2 : i8, sym_name = "of1_prod_prod_lock_0"}
-// CHECK-DAG:       %[[OF1_CONS_LOCK:.*]] = aie.lock(%[[TILE_1_2]]) {init = 0 : i8, sym_name = "of1_prod_cons_lock_0"}
-// CHECK-DAG:       %[[OF0_CONS_BUFF_0:.*]] = aie.buffer(%[[TILE_1_3]]) {sym_name = "of0_cons_buff_0_0"} : memref<256xi32>
-// CHECK-DAG:       %[[OF0_CONS_BUFF_1:.*]] = aie.buffer(%[[TILE_1_3]]) {sym_name = "of0_cons_buff_0_1"} : memref<256xi32>
-// CHECK-DAG:       %[[OF0_CONS_BUFF_2:.*]] = aie.buffer(%[[TILE_1_3]]) {sym_name = "of0_cons_buff_0_2"} : memref<256xi32>
-// CHECK-DAG:       %[[OF0_CONS_BUFF_3:.*]] = aie.buffer(%[[TILE_1_3]]) {sym_name = "of0_cons_buff_0_3"} : memref<256xi32>
-// CHECK-DAG:       %[[OF0_CONS_PROD_LOCK:.*]] = aie.lock(%[[TILE_1_3]]) {init = 4 : i8, sym_name = "of0_cons_prod_lock_0"}
-// CHECK-DAG:       %[[OF0_CONS_CONS_LOCK:.*]] = aie.lock(%[[TILE_1_3]]) {init = 0 : i8, sym_name = "of0_cons_cons_lock_0"}
-// CHECK-DAG:       %[[OF0_BUFF_0:.*]] = aie.buffer(%[[TILE_1_2]]) {sym_name = "of0_prod_buff_0_0"} : memref<256xi32>
-// CHECK-DAG:       %[[OF0_BUFF_1:.*]] = aie.buffer(%[[TILE_1_2]]) {sym_name = "of0_prod_buff_0_1"} : memref<256xi32>
-// CHECK-DAG:       %[[OF0_BUFF_2:.*]] = aie.buffer(%[[TILE_1_2]]) {sym_name = "of0_prod_buff_0_2"} : memref<256xi32>
-// CHECK-DAG:       %[[OF0_BUFF_3:.*]] = aie.buffer(%[[TILE_1_2]]) {sym_name = "of0_prod_buff_0_3"} : memref<256xi32>
-// CHECK-DAG:       %[[OF0_PROD_LOCK:.*]] = aie.lock(%[[TILE_1_2]]) {init = 4 : i8, sym_name = "of0_prod_prod_lock_0"}
-// CHECK-DAG:       %[[OF0_CONS_LOCK:.*]] = aie.lock(%[[TILE_1_2]]) {init = 0 : i8, sym_name = "of0_prod_cons_lock_0"}
-// CHECK-DAG:       aie.flow(%[[TILE_1_2]], DMA : 0, %[[TILE_1_3]], DMA : 0)
-// CHECK-DAG:       aie.flow(%[[TILE_1_2]], DMA : 1, %[[TILE_3_3]], DMA : 0)
-// CHECK:           %[[MEM_1_2:.*]] = aie.mem(%[[TILE_1_2]]) {
-// CHECK:             %[[VAL_0:.*]] = aie.dma_start(MM2S, 0, ^bb1, ^bb5)
-// CHECK:           ^bb1:
-// CHECK:             aie.use_lock(%[[OF0_CONS_LOCK]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[OF0_BUFF_0]] : memref<256xi32>) {dimensions = #aie<bd_dim_layout_array[<size = 16, stride = 1>, <size = 16, stride = 16>, <size = 1, stride = 1>]>, len = 256 : i32}
-// CHECK:             aie.use_lock(%[[OF0_PROD_LOCK]], Release, 1)
-// CHECK:             aie.next_bd ^bb2
-// CHECK:           ^bb2:
-// CHECK:             aie.use_lock(%[[OF0_CONS_LOCK]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[OF0_BUFF_1]] : memref<256xi32>) {dimensions = #aie<bd_dim_layout_array[<size = 16, stride = 1>, <size = 16, stride = 16>, <size = 1, stride = 1>]>, len = 256 : i32}
-// CHECK:             aie.use_lock(%[[OF0_PROD_LOCK]], Release, 1)
-// CHECK:             aie.next_bd ^bb3
-// CHECK:           ^bb3:
-// CHECK:             aie.use_lock(%[[OF0_CONS_LOCK]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[OF0_BUFF_2]] : memref<256xi32>) {dimensions = #aie<bd_dim_layout_array[<size = 16, stride = 1>, <size = 16, stride = 16>, <size = 1, stride = 1>]>, len = 256 : i32}
-// CHECK:             aie.use_lock(%[[OF0_PROD_LOCK]], Release, 1)
-// CHECK:             aie.next_bd ^bb4
-// CHECK:           ^bb4:
-// CHECK:             aie.use_lock(%[[OF0_CONS_LOCK]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[OF0_BUFF_3]] : memref<256xi32>) {dimensions = #aie<bd_dim_layout_array[<size = 16, stride = 1>, <size = 16, stride = 16>, <size = 1, stride = 1>]>, len = 256 : i32}
-// CHECK:             aie.use_lock(%[[OF0_PROD_LOCK]], Release, 1)
-// CHECK:             aie.next_bd ^bb1
-// CHECK:           ^bb5:
-// CHECK:             %[[VAL_1:.*]] = aie.dma_start(MM2S, 1, ^bb6, ^bb8)
-// CHECK:           ^bb6:
-// CHECK:             aie.use_lock(%[[OF1_CONS_LOCK]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[OF1_BUFF_0]] : memref<256xi32>) {dimensions = #aie<bd_dim_layout_array[<size = 128, stride = 2>]>, len = 256 : i32}
-// CHECK:             aie.use_lock(%[[OF1_PROD_LOCK]], Release, 1)
-// CHECK:             aie.next_bd ^bb7
-// CHECK:           ^bb7:
-// CHECK:             aie.use_lock(%[[OF1_CONS_LOCK]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[OF1_BUFF_1]] : memref<256xi32>) {dimensions = #aie<bd_dim_layout_array[<size = 128, stride = 2>]>, len = 256 : i32}
-// CHECK:             aie.use_lock(%[[OF1_PROD_LOCK]], Release, 1)
-// CHECK:             aie.next_bd ^bb6
-// CHECK:           ^bb8:
-// CHECK:             aie.end
-// CHECK:           }
-// CHECK:           %[[MEM_1_3:.*]] = aie.mem(%[[TILE_1_3]]) {
-// CHECK:             %[[VAL_2:.*]] = aie.dma_start(S2MM, 0, ^bb1, ^bb5)
-// CHECK:           ^bb1:
-// CHECK:             aie.use_lock(%[[OF0_CONS_PROD_LOCK]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[OF0_CONS_BUFF_0]] : memref<256xi32>) {dimensions = #aie<bd_dim_layout_array[<size = 1, stride = 1>]>, len = 256 : i32}
-// CHECK:             aie.use_lock(%[[OF0_CONS_CONS_LOCK]], Release, 1)
-// CHECK:             aie.next_bd ^bb2
-// CHECK:           ^bb2:
-// CHECK:             aie.use_lock(%[[OF0_CONS_PROD_LOCK]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[OF0_CONS_BUFF_1]] : memref<256xi32>) {dimensions = #aie<bd_dim_layout_array[<size = 1, stride = 1>]>, len = 256 : i32}
-// CHECK:             aie.use_lock(%[[OF0_CONS_CONS_LOCK]], Release, 1)
-// CHECK:             aie.next_bd ^bb3
-// CHECK:           ^bb3:
-// CHECK:             aie.use_lock(%[[OF0_CONS_PROD_LOCK]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[OF0_CONS_BUFF_2]] : memref<256xi32>) {dimensions = #aie<bd_dim_layout_array[<size = 1, stride = 1>]>, len = 256 : i32}
-// CHECK:             aie.use_lock(%[[OF0_CONS_CONS_LOCK]], Release, 1)
-// CHECK:             aie.next_bd ^bb4
-// CHECK:           ^bb4:
-// CHECK:             aie.use_lock(%[[OF0_CONS_PROD_LOCK]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[OF0_CONS_BUFF_3]] : memref<256xi32>) {dimensions = #aie<bd_dim_layout_array[<size = 1, stride = 1>]>, len = 256 : i32}
-// CHECK:             aie.use_lock(%[[OF0_CONS_CONS_LOCK]], Release, 1)
-// CHECK:             aie.next_bd ^bb1
-// CHECK:           ^bb5:
-// CHECK:             aie.end
-// CHECK:           }
-// CHECK:           %[[MEM_3_3:.*]] = aie.mem(%[[TILE_3_3]]) {
-// CHECK:             %[[VAL_3:.*]] = aie.dma_start(S2MM, 0, ^bb1, ^bb3)
-// CHECK:           ^bb1:
-// CHECK:             aie.use_lock(%[[OF1_CONS_PROD_LOCK]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[OF1_CONS_BUFF_0]] : memref<256xi32>) {len = 256 : i32}
-// CHECK:             aie.use_lock(%[[OF1_CONS_CONS_LOCK]], Release, 1)
-// CHECK:             aie.next_bd ^bb2
-// CHECK:           ^bb2:
-// CHECK:             aie.use_lock(%[[OF1_CONS_PROD_LOCK]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[OF1_CONS_BUFF_1]] : memref<256xi32>) {len = 256 : i32}
-// CHECK:             aie.use_lock(%[[OF1_CONS_CONS_LOCK]], Release, 1)
-// CHECK:             aie.next_bd ^bb1
-// CHECK:           ^bb3:
-// CHECK:             aie.end
-// CHECK:           }
-// CHECK:         }
-module @ndDMAObjFifoAIE2 {
- aie.device(xcve2302) {
-    %tile12 = aie.tile(1, 2)
-    %tile13 = aie.tile(1, 3)
-    %tile33 = aie.tile(3, 3)
-    // Even if an objectFifo could be implemented in shared memory, as with
-    // this case between two adjacent tiles, we need to use DMAs if a data
-    // layout transformation with toStream and fromStream was specified.
-    aie.flow(%tile12, DMA : 0, %tile13, DMA : 0) {symbol = @of0}
-    aie.flow(%tile12, DMA : 1, %tile33, DMA : 0) {symbol = @of1}
-    aie.objectfifo @of0 (%tile12 toStream [<size = 16, stride = 1>, <size = 16, stride = 16>, <size = 1, stride = 1>], // transpose
-                         {%tile13 fromStream [<size = 1, stride = 1>]},
-                         4 : i32) : !aie.objectfifo<memref<256xi32>>
-    aie.objectfifo @of1 (%tile12 toStream [<size = 128, stride = 2>], {%tile33},
-                         2 : i32) : !aie.objectfifo<memref<256xi32>>
- }
-}
diff --git a/compiler/plugins/target/AMD-AIE/aie/test/nd_dma_distribute_AIE2.mlir b/compiler/plugins/target/AMD-AIE/aie/test/nd_dma_distribute_AIE2.mlir
deleted file mode 100644
index 87d830ca9..000000000
--- a/compiler/plugins/target/AMD-AIE/aie/test/nd_dma_distribute_AIE2.mlir
+++ /dev/null
@@ -1,123 +0,0 @@
-
-// RUN: iree-opt --amdaie-objectFifo-stateful-transform %s | FileCheck %s
-
-// CHECK-LABEL:   aie.device(xcve2302) {
-// CHECK:           memref.global "public" @of2 : memref<128xi32>
-// CHECK:           memref.global "public" @of1 : memref<128xi32>
-// CHECK:           memref.global "public" @of0 : memref<256xi32>
-// CHECK-DAG:       %[[TILE_2_0:.*]] = aie.tile(2, 0)
-// CHECK-DAG:       %[[TILE_2_1:.*]] = aie.tile(2, 1)
-// CHECK-DAG:       %[[TILE_3_2:.*]] = aie.tile(3, 2)
-// CHECK-DAG:       %[[TILE_3_3:.*]] = aie.tile(3, 3)
-// CHECK-DAG:       %[[OF2_CONS_BUFF_0:.*]] = aie.buffer(%[[TILE_3_3]]) {sym_name = "of2_cons_buff_1_0"} : memref<128xi32>
-// CHECK-DAG:       %[[OF2_CONS_BUFF_1:.*]] = aie.buffer(%[[TILE_3_3]]) {sym_name = "of2_cons_buff_1_1"} : memref<128xi32>
-// CHECK-DAG:       %[[OF2_CONS_PROD_LOCK:.*]] = aie.lock(%[[TILE_3_3]]) {init = 2 : i8, sym_name = "of2_cons_prod_lock_1"}
-// CHECK-DAG:       %[[OF2_CONS_CONS_LOCK:.*]] = aie.lock(%[[TILE_3_3]]) {init = 0 : i8, sym_name = "of2_cons_cons_lock_1"}
-// CHECK-DAG:       %[[OF1_CONS_BUFF_0:.*]] = aie.buffer(%[[TILE_3_2]]) {sym_name = "of1_cons_buff_0_0"} : memref<128xi32>
-// CHECK-DAG:       %[[OF1_CONS_BUFF_1:.*]] = aie.buffer(%[[TILE_3_2]]) {sym_name = "of1_cons_buff_0_1"} : memref<128xi32>
-// CHECK-DAG:       %[[OF1_CONS_PROD_LOCK:.*]] = aie.lock(%[[TILE_3_2]]) {init = 2 : i8, sym_name = "of1_cons_prod_lock_0"}
-// CHECK-DAG:       %[[OF1_CONS_CONS_LOCK:.*]] = aie.lock(%[[TILE_3_2]]) {init = 0 : i8, sym_name = "of1_cons_cons_lock_0"}
-// CHECK-DAG:       %[[OF0_CONS_BUFF_0:.*]] = aie.buffer(%[[TILE_2_1]]) {sym_name = "of0_link_buff_0_0"} : memref<256xi32>
-// CHECK-DAG:       %[[OF0_CONS_BUFF_1:.*]] = aie.buffer(%[[TILE_2_1]]) {sym_name = "of0_link_buff_0_1"} : memref<256xi32>
-// CHECK-DAG:       %[[OF0_CONS_PROD_LOCK:.*]] = aie.lock(%[[TILE_2_1]]) {init = 4 : i8, sym_name = "of0_link_prod_lock_0"}
-// CHECK-DAG:       %[[OF0_CONS_CONS_LOCK:.*]] = aie.lock(%[[TILE_2_1]]) {init = 0 : i8, sym_name = "of0_link_cons_lock_0"}
-// CHECK-DAG:       %[[OF0_PROD_LOCK:.*]] = aie.lock(%[[TILE_2_0]]) {init = 0 : i8, sym_name = "of0_prod_prod_lock_0"}
-// CHECK-DAG:       %[[OF0_CONS_LOCK:.*]] = aie.lock(%[[TILE_2_0]]) {init = 0 : i8, sym_name = "of0_prod_cons_lock_0"}
-// CHECK-DAG:       aie.flow(%[[TILE_2_0]], DMA : 0, %[[TILE_2_1]], DMA : 0)
-// CHECK-DAG:       aie.flow(%[[TILE_2_1]], DMA : 0, %[[TILE_3_2]], DMA : 0)
-// CHECK-DAG:       aie.flow(%[[TILE_2_1]], DMA : 1, %[[TILE_3_3]], DMA : 0)
-// CHECK-DAG        aie.shim_dma_allocation @of0(MM2S, 0, 2)
-// CHECK:           %[[MEMTILE_DMA_1_1:.*]] = aie.memtile_dma(%[[TILE_2_1]]) {
-// CHECK:             %[[VAL_0:.*]] = aie.dma_start(S2MM, 0, ^bb1, ^bb3)
-// CHECK:           ^bb1:
-// CHECK:             aie.use_lock(%[[OF0_CONS_PROD_LOCK]], AcquireGreaterEqual, 2)
-// CHECK:             aie.dma_bd(%[[OF0_CONS_BUFF_0]] : memref<256xi32>) {len = 256 : i32}
-// CHECK:             aie.use_lock(%[[OF0_CONS_CONS_LOCK]], Release, 2)
-// CHECK:             aie.next_bd ^bb2
-// CHECK:           ^bb2:
-// CHECK:             aie.use_lock(%[[OF0_CONS_PROD_LOCK]], AcquireGreaterEqual, 2)
-// CHECK:             aie.dma_bd(%[[OF0_CONS_BUFF_1]] : memref<256xi32>) {len = 256 : i32}
-// CHECK:             aie.use_lock(%[[OF0_CONS_CONS_LOCK]], Release, 2)
-// CHECK:             aie.next_bd ^bb1
-// CHECK:           ^bb3:
-// CHECK:             %[[VAL_1:.*]] = aie.dma_start(MM2S, 0, ^bb4, ^bb6)
-// CHECK:           ^bb4:
-// CHECK:             aie.use_lock(%[[OF0_CONS_CONS_LOCK]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[OF0_CONS_BUFF_0]] : memref<256xi32>) {dimensions = #aie<bd_dim_layout_array[<size = 4, stride = 64>, <size = 2, stride = 4>, <size = 8, stride = 8>, <size = 4, stride = 1>]>, len = 128 : i32}
-// CHECK:             aie.use_lock(%[[OF0_CONS_PROD_LOCK]], Release, 1)
-// CHECK:             aie.next_bd ^bb5
-// CHECK:           ^bb5:
-// CHECK:             aie.use_lock(%[[OF0_CONS_CONS_LOCK]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[OF0_CONS_BUFF_1]] : memref<256xi32>) {dimensions = #aie<bd_dim_layout_array[<size = 4, stride = 64>, <size = 2, stride = 4>, <size = 8, stride = 8>, <size = 4, stride = 1>]>, len = 128 : i32}
-// CHECK:             aie.use_lock(%[[OF0_CONS_PROD_LOCK]], Release, 1)
-// CHECK:             aie.next_bd ^bb4
-// CHECK:           ^bb6:
-// CHECK:             %[[VAL_2:.*]] = aie.dma_start(MM2S, 1, ^bb7, ^bb9)
-// CHECK:           ^bb7:
-// CHECK:             aie.use_lock(%[[OF0_CONS_CONS_LOCK]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[OF0_CONS_BUFF_0]] : memref<256xi32>) {dimensions = #aie<bd_dim_layout_array[<size = 4, stride = 64>, <size = 2, stride = 4>, <size = 8, stride = 8>, <size = 4, stride = 1>]>, len = 128 : i32, offset = 128 : i32}
-// CHECK:             aie.use_lock(%[[OF0_CONS_PROD_LOCK]], Release, 1)
-// CHECK:             aie.next_bd ^bb8
-// CHECK:           ^bb8:
-// CHECK:             aie.use_lock(%[[OF0_CONS_CONS_LOCK]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[OF0_CONS_BUFF_1]] : memref<256xi32>) {dimensions = #aie<bd_dim_layout_array[<size = 4, stride = 64>, <size = 2, stride = 4>, <size = 8, stride = 8>, <size = 4, stride = 1>]>, len = 128 : i32, offset = 128 : i32}
-// CHECK:             aie.use_lock(%[[OF0_CONS_PROD_LOCK]], Release, 1)
-// CHECK:             aie.next_bd ^bb7
-// CHECK:           ^bb9:
-// CHECK:             aie.end
-// CHECK:           }
-// CHECK:           %[[MEM_2_2:.*]] = aie.mem(%[[TILE_3_2]]) {
-// CHECK:             %[[VAL_3:.*]] = aie.dma_start(S2MM, 0, ^bb1, ^bb3)
-// CHECK:           ^bb1:
-// CHECK:             aie.use_lock(%[[OF1_CONS_PROD_LOCK]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[OF1_CONS_BUFF_0]] : memref<128xi32>) {len = 128 : i32}
-// CHECK:             aie.use_lock(%[[OF1_CONS_CONS_LOCK]], Release, 1)
-// CHECK:             aie.next_bd ^bb2
-// CHECK:           ^bb2:
-// CHECK:             aie.use_lock(%[[OF1_CONS_PROD_LOCK]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[OF1_CONS_BUFF_1]] : memref<128xi32>) {len = 128 : i32}
-// CHECK:             aie.use_lock(%[[OF1_CONS_CONS_LOCK]], Release, 1)
-// CHECK:             aie.next_bd ^bb1
-// CHECK:           ^bb3:
-// CHECK:             aie.end
-// CHECK:           }
-// CHECK:           %[[MEM_2_3:.*]] = aie.mem(%[[TILE_3_3]]) {
-// CHECK:             %[[VAL_4:.*]] = aie.dma_start(S2MM, 0, ^bb1, ^bb3)
-// CHECK:           ^bb1:
-// CHECK:             aie.use_lock(%[[OF2_CONS_PROD_LOCK]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[OF2_CONS_BUFF_0]] : memref<128xi32>) {len = 128 : i32}
-// CHECK:             aie.use_lock(%[[OF2_CONS_CONS_LOCK]], Release, 1)
-// CHECK:             aie.next_bd ^bb2
-// CHECK:           ^bb2:
-// CHECK:             aie.use_lock(%[[OF2_CONS_PROD_LOCK]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[OF2_CONS_BUFF_1]] : memref<128xi32>) {len = 128 : i32}
-// CHECK:             aie.use_lock(%[[OF2_CONS_CONS_LOCK]], Release, 1)
-// CHECK:             aie.next_bd ^bb1
-// CHECK:           ^bb3:
-// CHECK:             aie.end
-// CHECK:           }
-// CHECK:         }
-module @ndDMAObjFifoAIE2 {
- aie.device(xcve2302) {
-    %tile10 = aie.tile(2, 0)
-    %tile11 = aie.tile(2, 1)
-    %tile22 = aie.tile(3, 2)
-    %tile23 = aie.tile(3, 3)
-    aie.flow(%tile10, DMA : 0, %tile11, DMA : 0) {symbol = @of0}
-    aie.flow(%tile11, DMA : 0, %tile22, DMA : 0) {symbol = @of1}
-    aie.flow(%tile11, DMA : 1, %tile23, DMA : 0) {symbol = @of2}
-    aie.objectfifo @of0 (%tile10, {%tile11},
-                         2 : i32) : !aie.objectfifo<memref<256xi32>>
-    aie.objectfifo @of1 (%tile11 toStream [<size = 4, stride = 64>,
-                                           <size = 2, stride = 4>,
-                                           <size = 8, stride = 8>,
-                                           <size = 4, stride = 1>],
-                        {%tile22}, 2 : i32) : !aie.objectfifo<memref<128xi32>>
-    aie.objectfifo @of2 (%tile11 toStream [<size = 4, stride = 64>,
-                                           <size = 2, stride = 4>,
-                                           <size = 8, stride = 8>,
-                                           <size = 4, stride = 1>],
-                        {%tile23}, 2 : i32) : !aie.objectfifo<memref<128xi32>>
-   aie.objectfifo.link [ @of0 ] -> [ @of1, @of2 ] ([] [])
- }
-}
diff --git a/compiler/plugins/target/AMD-AIE/aie/test/nd_dma_multiple_consumers_AIE2.mlir b/compiler/plugins/target/AMD-AIE/aie/test/nd_dma_multiple_consumers_AIE2.mlir
deleted file mode 100644
index e3cbd0ca8..000000000
--- a/compiler/plugins/target/AMD-AIE/aie/test/nd_dma_multiple_consumers_AIE2.mlir
+++ /dev/null
@@ -1,201 +0,0 @@
-
-// RUN: iree-opt --amdaie-objectFifo-stateful-transform %s | FileCheck %s
-
-// CHECK-LABEL:   aie.device(xcve2302) {
-// CHECK:           memref.global "public" @of3 : memref<256xi32>
-// CHECK:           memref.global "public" @of1 : memref<256xi32>
-// CHECK:           memref.global "public" @of0 : memref<256xi32>
-// CHECK-DAG:       %[[TILE_1_2:.*]] = aie.tile(1, 2)
-// CHECK-DAG:       %[[TILE_1_3:.*]] = aie.tile(1, 3)
-// CHECK-DAG:       %[[TILE_3_3:.*]] = aie.tile(3, 3)
-// CHECK-DAG:       %[[TILE_2_2:.*]] = aie.tile(2, 2)
-// CHECK-DAG:       %[[TILE_2_3:.*]] = aie.tile(2, 3)
-// CHECK-DAG:       %[[OF3_CONS_BUFF_0:.*]] = aie.buffer(%[[TILE_2_3]]) {sym_name = "of3_cons_buff_0_0"} : memref<256xi32>
-// CHECK-DAG:       %[[OF3_CONS_BUFF_1:.*]] = aie.buffer(%[[TILE_2_3]]) {sym_name = "of3_cons_buff_0_1"} : memref<256xi32>
-// CHECK-DAG:       %[[OF3_CONS_PROD_LOCK:.*]] = aie.lock(%[[TILE_2_3]]) {init = 2 : i8, sym_name = "of3_cons_prod_lock_0"}
-// CHECK-DAG:       %[[OF3_CONS_CONS_LOCK:.*]] = aie.lock(%[[TILE_2_3]]) {init = 0 : i8, sym_name = "of3_cons_cons_lock_0"}
-// CHECK-DAG:       %[[OF3_BUFF_0:.*]] = aie.buffer(%[[TILE_2_2]]) {sym_name = "of3_prod_buff_0_0"} : memref<256xi32>
-// CHECK-DAG:       %[[OF3_BUFF_1:.*]] = aie.buffer(%[[TILE_2_2]]) {sym_name = "of3_prod_buff_0_1"} : memref<256xi32>
-// CHECK-DAG:       %[[OF3_PROD_LOCK:.*]] = aie.lock(%[[TILE_2_2]]) {init = 2 : i8, sym_name = "of3_prod_prod_lock_0"}
-// CHECK-DAG:       %[[OF3_CONS_LOCK:.*]] = aie.lock(%[[TILE_2_2]]) {init = 0 : i8, sym_name = "of3_prod_cons_lock_0"}
-// CHECK-DAG:       %[[OF1_CONS_BUFF_0:.*]] = aie.buffer(%[[TILE_3_3]]) {sym_name = "of1_cons_buff_0_0"} : memref<256xi32>
-// CHECK-DAG:       %[[OF1_CONS_BUFF_1:.*]] = aie.buffer(%[[TILE_3_3]]) {sym_name = "of1_cons_buff_0_1"} : memref<256xi32>
-// CHECK-DAG:       %[[OF1_CONS_PROD_LOCK:.*]] = aie.lock(%[[TILE_3_3]]) {init = 2 : i8, sym_name = "of1_cons_prod_lock_0"}
-// CHECK-DAG:       %[[OF1_CONS_CONS_LOCK:.*]] = aie.lock(%[[TILE_3_3]]) {init = 0 : i8, sym_name = "of1_cons_cons_lock_0"}
-// CHECK-DAG:       %[[OF1_BUFF_0:.*]] = aie.buffer(%[[TILE_1_2]]) {sym_name = "of1_prod_buff_0_0"} : memref<256xi32>
-// CHECK-DAG:       %[[OF1_BUFF_1:.*]] = aie.buffer(%[[TILE_1_2]]) {sym_name = "of1_prod_buff_0_1"} : memref<256xi32>
-// CHECK-DAG:       %[[OF1_PROD_LOCK:.*]] = aie.lock(%[[TILE_1_2]]) {init = 2 : i8, sym_name = "of1_prod_prod_lock_0"}
-// CHECK-DAG:       %[[OF1_CONS_LOCK:.*]] = aie.lock(%[[TILE_1_2]]) {init = 0 : i8, sym_name = "of1_prod_cons_lock_0"}
-// CHECK-DAG:       %[[OF0_0_CONS_BUFF_0:.*]] = aie.buffer(%[[TILE_1_3]]) {sym_name = "of0_cons_buff_0_0"} : memref<256xi32>
-// CHECK-DAG:       %[[OF0_0_CONS_BUFF_1:.*]] = aie.buffer(%[[TILE_1_3]]) {sym_name = "of0_cons_buff_0_1"} : memref<256xi32>
-// CHECK-DAG:       %[[OF0_0_CONS_BUFF_2:.*]] = aie.buffer(%[[TILE_1_3]]) {sym_name = "of0_cons_buff_0_2"} : memref<256xi32>
-// CHECK-DAG:       %[[OF0_0_CONS_BUFF_3:.*]] = aie.buffer(%[[TILE_1_3]]) {sym_name = "of0_cons_buff_0_3"} : memref<256xi32>
-// CHECK-DAG:       %[[OF0_0_CONS_PROD_LOCK:.*]] = aie.lock(%[[TILE_1_3]]) {init = 4 : i8, sym_name = "of0_cons_prod_lock_0"}
-// CHECK-DAG:       %[[OF0_0_CONS_CONS_LOCK:.*]] = aie.lock(%[[TILE_1_3]]) {init = 0 : i8, sym_name = "of0_cons_cons_lock_0"}
-// CHECK-DAG:       %[[OF0_1_CONS_BUFF_0:.*]] = aie.buffer(%[[TILE_3_3]]) {sym_name = "of0_cons_buff_1_0"} : memref<256xi32>
-// CHECK-DAG:       %[[OF0_1_CONS_BUFF_1:.*]] = aie.buffer(%[[TILE_3_3]]) {sym_name = "of0_cons_buff_1_1"} : memref<256xi32>
-// CHECK-DAG:       %[[OF0_1_CONS_BUFF_2:.*]] = aie.buffer(%[[TILE_3_3]]) {sym_name = "of0_cons_buff_1_2"} : memref<256xi32>
-// CHECK-DAG:       %[[OF0_1_CONS_BUFF_3:.*]] = aie.buffer(%[[TILE_3_3]]) {sym_name = "of0_cons_buff_1_3"} : memref<256xi32>
-// CHECK-DAG:       %[[OF0_1_CONS_PROD_LOCK:.*]] = aie.lock(%[[TILE_3_3]]) {init = 4 : i8, sym_name = "of0_cons_prod_lock_1"}
-// CHECK-DAG:       %[[OF0_1_CONS_CONS_LOCK:.*]] = aie.lock(%[[TILE_3_3]]) {init = 0 : i8, sym_name = "of0_cons_cons_lock_1"}
-// CHECK-DAG:       %[[OF0_BUFF_0:.*]] = aie.buffer(%[[TILE_1_2]]) {sym_name = "of0_prod_buff_0_0"} : memref<256xi32>
-// CHECK-DAG:       %[[OF0_BUFF_1:.*]] = aie.buffer(%[[TILE_1_2]]) {sym_name = "of0_prod_buff_0_1"} : memref<256xi32>
-// CHECK-DAG:       %[[OF0_BUFF_2:.*]] = aie.buffer(%[[TILE_1_2]]) {sym_name = "of0_prod_buff_0_2"} : memref<256xi32>
-// CHECK-DAG:       %[[OF0_BUFF_3:.*]] = aie.buffer(%[[TILE_1_2]]) {sym_name = "of0_prod_buff_0_3"} : memref<256xi32>
-// CHECK-DAG:       %[[OF0_PROD_LOCK:.*]] = aie.lock(%[[TILE_1_2]]) {init = 4 : i8, sym_name = "of0_prod_prod_lock_0"}
-// CHECK-DAG:       %[[OF0_CONS_LOCK:.*]] = aie.lock(%[[TILE_1_2]]) {init = 0 : i8, sym_name = "of0_prod_cons_lock_0"}
-// CHECK-DAG:       aie.flow(%[[TILE_1_2]], DMA : 0, %[[TILE_3_3]], DMA : 0)
-// CHECK-DAG:       aie.flow(%[[TILE_1_2]], DMA : 0, %[[TILE_1_3]], DMA : 0)
-// CHECK-DAG:       aie.flow(%[[TILE_1_2]], DMA : 1, %[[TILE_3_3]], DMA : 1)
-// CHECK-DAG:       aie.flow(%[[TILE_2_2]], DMA : 0, %[[TILE_2_3]], DMA : 0)
-// CHECK:           %[[MEM_1_2:.*]] = aie.mem(%[[TILE_1_2]]) {
-// CHECK:             %[[VAL_0:.*]] = aie.dma_start(MM2S, 0, ^bb1, ^bb5)
-// CHECK:           ^bb1:
-// CHECK:             aie.use_lock(%[[OF0_CONS_LOCK]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[OF0_BUFF_0]] : memref<256xi32>) {dimensions = #aie<bd_dim_layout_array[<size = 16, stride = 1>, <size = 16, stride = 16>, <size = 1, stride = 1>]>, len = 256 : i32}
-// CHECK:             aie.use_lock(%[[OF0_PROD_LOCK]], Release, 1)
-// CHECK:             aie.next_bd ^bb2
-// CHECK:           ^bb2:
-// CHECK:             aie.use_lock(%[[OF0_CONS_LOCK]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[OF0_BUFF_1]] : memref<256xi32>) {dimensions = #aie<bd_dim_layout_array[<size = 16, stride = 1>, <size = 16, stride = 16>, <size = 1, stride = 1>]>, len = 256 : i32}
-// CHECK:             aie.use_lock(%[[OF0_PROD_LOCK]], Release, 1)
-// CHECK:             aie.next_bd ^bb3
-// CHECK:           ^bb3:
-// CHECK:             aie.use_lock(%[[OF0_CONS_LOCK]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[OF0_BUFF_2]] : memref<256xi32>) {dimensions = #aie<bd_dim_layout_array[<size = 16, stride = 1>, <size = 16, stride = 16>, <size = 1, stride = 1>]>, len = 256 : i32}
-// CHECK:             aie.use_lock(%[[OF0_PROD_LOCK]], Release, 1)
-// CHECK:             aie.next_bd ^bb4
-// CHECK:           ^bb4:
-// CHECK:             aie.use_lock(%[[OF0_CONS_LOCK]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[OF0_BUFF_3]] : memref<256xi32>) {dimensions = #aie<bd_dim_layout_array[<size = 16, stride = 1>, <size = 16, stride = 16>, <size = 1, stride = 1>]>, len = 256 : i32}
-// CHECK:             aie.use_lock(%[[OF0_PROD_LOCK]], Release, 1)
-// CHECK:             aie.next_bd ^bb1
-// CHECK:           ^bb5:
-// CHECK:             %[[VAL_1:.*]] = aie.dma_start(MM2S, 1, ^bb6, ^bb8)
-// CHECK:           ^bb6:
-// CHECK:             aie.use_lock(%[[OF1_CONS_LOCK]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[OF1_BUFF_0]] : memref<256xi32>) {dimensions = #aie<bd_dim_layout_array[<size = 128, stride = 2>]>, len = 256 : i32}
-// CHECK:             aie.use_lock(%[[OF1_PROD_LOCK]], Release, 1)
-// CHECK:             aie.next_bd ^bb7
-// CHECK:           ^bb7:
-// CHECK:             aie.use_lock(%[[OF1_CONS_LOCK]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[OF1_BUFF_1]] : memref<256xi32>) {dimensions = #aie<bd_dim_layout_array[<size = 128, stride = 2>]>, len = 256 : i32}
-// CHECK:             aie.use_lock(%[[OF1_PROD_LOCK]], Release, 1)
-// CHECK:             aie.next_bd ^bb6
-// CHECK:           ^bb8:
-// CHECK:             aie.end
-// CHECK:           }
-// CHECK:           %[[MEM_1_3:.*]] = aie.mem(%[[TILE_1_3]]) {
-// CHECK:             %[[VAL_2:.*]] = aie.dma_start(S2MM, 0, ^bb1, ^bb5)
-// CHECK:           ^bb1:
-// CHECK:             aie.use_lock(%[[OF0_0_CONS_PROD_LOCK]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[OF0_0_CONS_BUFF_0]] : memref<256xi32>) {dimensions = #aie<bd_dim_layout_array[<size = 1, stride = 1>]>, len = 256 : i32}
-// CHECK:             aie.use_lock(%[[OF0_0_CONS_CONS_LOCK]], Release, 1)
-// CHECK:             aie.next_bd ^bb2
-// CHECK:           ^bb2:
-// CHECK:             aie.use_lock(%[[OF0_0_CONS_PROD_LOCK]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[OF0_0_CONS_BUFF_1]] : memref<256xi32>) {dimensions = #aie<bd_dim_layout_array[<size = 1, stride = 1>]>, len = 256 : i32}
-// CHECK:             aie.use_lock(%[[OF0_0_CONS_CONS_LOCK]], Release, 1)
-// CHECK:             aie.next_bd ^bb3
-// CHECK:           ^bb3:
-// CHECK:             aie.use_lock(%[[OF0_0_CONS_PROD_LOCK]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[OF0_0_CONS_BUFF_2]] : memref<256xi32>) {dimensions = #aie<bd_dim_layout_array[<size = 1, stride = 1>]>, len = 256 : i32}
-// CHECK:             aie.use_lock(%[[OF0_0_CONS_CONS_LOCK]], Release, 1)
-// CHECK:             aie.next_bd ^bb4
-// CHECK:           ^bb4:
-// CHECK:             aie.use_lock(%[[OF0_0_CONS_PROD_LOCK]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[OF0_0_CONS_BUFF_3]] : memref<256xi32>) {dimensions = #aie<bd_dim_layout_array[<size = 1, stride = 1>]>, len = 256 : i32}
-// CHECK:             aie.use_lock(%[[OF0_0_CONS_CONS_LOCK]], Release, 1)
-// CHECK:             aie.next_bd ^bb1
-// CHECK:           ^bb5:
-// CHECK:             aie.end
-// CHECK:           }
-// CHECK:           %[[MEM_3_3:.*]] = aie.mem(%[[TILE_3_3]]) {
-// CHECK:             %[[VAL_3:.*]] = aie.dma_start(S2MM, 0, ^bb1, ^bb5)
-// CHECK:           ^bb1:
-// CHECK:             aie.use_lock(%[[OF0_1_CONS_PROD_LOCK]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[OF0_1_CONS_BUFF_0]] : memref<256xi32>) {dimensions = #aie<bd_dim_layout_array[<size = 3, stride = 4>]>, len = 256 : i32}
-// CHECK:             aie.use_lock(%[[OF0_1_CONS_CONS_LOCK]], Release, 1)
-// CHECK:             aie.next_bd ^bb2
-// CHECK:           ^bb2:
-// CHECK:             aie.use_lock(%[[OF0_1_CONS_PROD_LOCK]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[OF0_1_CONS_BUFF_1]] : memref<256xi32>) {dimensions = #aie<bd_dim_layout_array[<size = 3, stride = 4>]>, len = 256 : i32}
-// CHECK:             aie.use_lock(%[[OF0_1_CONS_CONS_LOCK]], Release, 1)
-// CHECK:             aie.next_bd ^bb3
-// CHECK:           ^bb3:
-// CHECK:             aie.use_lock(%[[OF0_1_CONS_PROD_LOCK]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[OF0_1_CONS_BUFF_2]] : memref<256xi32>) {dimensions = #aie<bd_dim_layout_array[<size = 3, stride = 4>]>, len = 256 : i32}
-// CHECK:             aie.use_lock(%[[OF0_1_CONS_CONS_LOCK]], Release, 1)
-// CHECK:             aie.next_bd ^bb4
-// CHECK:           ^bb4:
-// CHECK:             aie.use_lock(%[[OF0_1_CONS_PROD_LOCK]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[OF0_1_CONS_BUFF_3]] : memref<256xi32>) {dimensions = #aie<bd_dim_layout_array[<size = 3, stride = 4>]>, len = 256 : i32}
-// CHECK:             aie.use_lock(%[[OF0_1_CONS_CONS_LOCK]], Release, 1)
-// CHECK:             aie.next_bd ^bb1
-// CHECK:           ^bb5:
-// CHECK:             %[[VAL_4:.*]] = aie.dma_start(S2MM, 1, ^bb6, ^bb8)
-// CHECK:           ^bb6:
-// CHECK:             aie.use_lock(%[[OF1_CONS_PROD_LOCK]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[OF1_CONS_BUFF_0]] : memref<256xi32>) {len = 256 : i32}
-// CHECK:             aie.use_lock(%[[OF1_CONS_CONS_LOCK]], Release, 1)
-// CHECK:             aie.next_bd ^bb7
-// CHECK:           ^bb7:
-// CHECK:             aie.use_lock(%[[OF1_CONS_PROD_LOCK]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[OF1_CONS_BUFF_1]] : memref<256xi32>) {len = 256 : i32}
-// CHECK:             aie.use_lock(%[[OF1_CONS_CONS_LOCK]], Release, 1)
-// CHECK:             aie.next_bd ^bb6
-// CHECK:           ^bb8:
-// CHECK:             aie.end
-// CHECK:           }
-// CHECK:           %[[MEM_2_2:.*]] = aie.mem(%[[TILE_2_2]]) {
-// CHECK:             %[[VAL_5:.*]] = aie.dma_start(MM2S, 0, ^bb1, ^bb3)
-// CHECK:           ^bb1:
-// CHECK:             aie.use_lock(%[[OF3_CONS_LOCK]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[OF3_BUFF_0]] : memref<256xi32>) {len = 256 : i32}
-// CHECK:             aie.use_lock(%[[OF3_PROD_LOCK]], Release, 1)
-// CHECK:             aie.next_bd ^bb2
-// CHECK:           ^bb2:
-// CHECK:             aie.use_lock(%[[OF3_CONS_LOCK]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[OF3_BUFF_1]] : memref<256xi32>) {len = 256 : i32}
-// CHECK:             aie.use_lock(%[[OF3_PROD_LOCK]], Release, 1)
-// CHECK:             aie.next_bd ^bb1
-// CHECK:           ^bb3:
-// CHECK:             aie.end
-// CHECK:           }
-// CHECK:           %[[MEM_2_3:.*]] = aie.mem(%[[TILE_2_3]]) {
-// CHECK:             %[[VAL_6:.*]] = aie.dma_start(S2MM, 0, ^bb1, ^bb3)
-// CHECK:           ^bb1:
-// CHECK:             aie.use_lock(%[[OF3_CONS_PROD_LOCK]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[OF3_CONS_BUFF_0]] : memref<256xi32>) {dimensions = #aie<bd_dim_layout_array[<size = 9, stride = 9>]>, len = 256 : i32}
-// CHECK:             aie.use_lock(%[[OF3_CONS_CONS_LOCK]], Release, 1)
-// CHECK:             aie.next_bd ^bb2
-// CHECK:           ^bb2:
-// CHECK:             aie.use_lock(%[[OF3_CONS_PROD_LOCK]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[OF3_CONS_BUFF_1]] : memref<256xi32>) {dimensions = #aie<bd_dim_layout_array[<size = 9, stride = 9>]>, len = 256 : i32}
-// CHECK:             aie.use_lock(%[[OF3_CONS_CONS_LOCK]], Release, 1)
-// CHECK:             aie.next_bd ^bb1
-// CHECK:           ^bb3:
-// CHECK:             aie.end
-// CHECK:           }
-// CHECK:         }
-module @ndDMAObjFifoAIE2 {
-  aie.device(xcve2302) {
-    %tile12 = aie.tile(1, 2)
-    %tile13 = aie.tile(1, 3)
-    %tile33 = aie.tile(3, 3)
-    %tile22 = aie.tile(2, 2)
-    %tile23 = aie.tile(2, 3)
-    aie.flow(%tile12, DMA : 0, %tile33, DMA : 0) {symbol = @of0}
-    aie.flow(%tile12, DMA : 0, %tile13, DMA : 0) {symbol = @of0}
-    aie.flow(%tile12, DMA : 1, %tile33, DMA : 1) {symbol = @of1}
-    aie.flow(%tile22, DMA : 0, %tile23, DMA : 0) {symbol = @of3}
-    aie.objectfifo @of0 (%tile12 toStream [<size = 16, stride = 1>, <size = 16, stride = 16>, <size = 1, stride = 1>], // transpose
-                          {%tile13 fromStream [<size = 1, stride = 1>],
-                          %tile33 fromStream [<size = 3, stride = 4>]},
-                          4 : i32) : !aie.objectfifo<memref<256xi32>>
-    aie.objectfifo @of1 (%tile12 toStream [<size = 128, stride = 2>], {%tile33},
-                          2 : i32) : !aie.objectfifo<memref<256xi32>>
-    aie.objectfifo @of3 (%tile22, {%tile23 fromStream [<size = 9, stride = 9>]},
-                          2 : i32) : !aie.objectfifo<memref<256xi32>>
-  }
-}
diff --git a/compiler/plugins/target/AMD-AIE/aie/test/nested_loop_test.mlir b/compiler/plugins/target/AMD-AIE/aie/test/nested_loop_test.mlir
deleted file mode 100644
index 06a3be4da..000000000
--- a/compiler/plugins/target/AMD-AIE/aie/test/nested_loop_test.mlir
+++ /dev/null
@@ -1,365 +0,0 @@
-
-// RUN: iree-opt --amdaie-objectFifo-stateful-transform %s | FileCheck %s
-
-// CHECK-LABEL:   aie.device(npu1_4col) {
-// CHECK:           memref.global "public" @in8 : memref<32x32xi32>
-// CHECK:           memref.global "public" @in7 : memref<64x32xi32>
-// CHECK:           memref.global "public" @in2 : memref<32x64xi32>
-// CHECK-DAG:       %[[TILE_0_1:.*]] = aie.tile(0, 1)
-// CHECK-DAG:       %[[TILE_1_2:.*]] = aie.tile(1, 2)
-// CHECK-DAG:       %[[TILE_0_2:.*]] = aie.tile(0, 2)
-// CHECK-DAG:       %[[BUFFER_1_2:.*]] = aie.buffer(%[[TILE_1_2]]) {sym_name = "in8_prod_buff_0_0"} : memref<32x32xi32>
-// CHECK-DAG:       %[[BUFFER_1_2_0:.*]] = aie.buffer(%[[TILE_1_2]]) {sym_name = "in8_prod_buff_0_1"} : memref<32x32xi32>
-// CHECK-DAG:       %[[BUFFER_1_2_1:.*]] = aie.buffer(%[[TILE_1_2]]) {sym_name = "in8_prod_buff_0_2"} : memref<32x32xi32>
-// CHECK-DAG:       %[[BUFFER_1_2_2:.*]] = aie.buffer(%[[TILE_1_2]]) {sym_name = "in8_prod_buff_0_3"} : memref<32x32xi32>
-// CHECK-DAG:       %[[LOCK_1_2:.*]] = aie.lock(%[[TILE_1_2]]) {init = 4 : i8, sym_name = "in8_prod_prod_lock_0"}
-// CHECK-DAG:       %[[LOCK_1_2_3:.*]] = aie.lock(%[[TILE_1_2]]) {init = 0 : i8, sym_name = "in8_prod_cons_lock_0"}
-// CHECK-DAG:       %[[BUFFER_0_1:.*]] = aie.buffer(%[[TILE_0_1]]) {sym_name = "in8_cons_buff_0_0"} : memref<32x32xi32>
-// CHECK-DAG:       %[[BUFFER_0_1_4:.*]] = aie.buffer(%[[TILE_0_1]]) {sym_name = "in8_cons_buff_0_1"} : memref<32x32xi32>
-// CHECK-DAG:       %[[BUFFER_0_1_5:.*]] = aie.buffer(%[[TILE_0_1]]) {sym_name = "in8_cons_buff_0_2"} : memref<32x32xi32>
-// CHECK-DAG:       %[[BUFFER_0_1_6:.*]] = aie.buffer(%[[TILE_0_1]]) {sym_name = "in8_cons_buff_0_3"} : memref<32x32xi32>
-// CHECK-DAG:       %[[LOCK_0_1:.*]] = aie.lock(%[[TILE_0_1]]) {init = 4 : i8, sym_name = "in8_cons_prod_lock_0"}
-// CHECK-DAG:       %[[LOCK_0_1_7:.*]] = aie.lock(%[[TILE_0_1]]) {init = 0 : i8, sym_name = "in8_cons_cons_lock_0"}
-// CHECK-DAG:       %[[BUFFER_0_1_8:.*]] = aie.buffer(%[[TILE_0_1]]) {sym_name = "in7_prod_buff_0_0"} : memref<64x32xi32>
-// CHECK-DAG:       %[[BUFFER_0_1_9:.*]] = aie.buffer(%[[TILE_0_1]]) {sym_name = "in7_prod_buff_0_1"} : memref<64x32xi32>
-// CHECK-DAG:       %[[BUFFER_0_1_10:.*]] = aie.buffer(%[[TILE_0_1]]) {sym_name = "in7_prod_buff_0_2"} : memref<64x32xi32>
-// CHECK-DAG:       %[[BUFFER_0_1_11:.*]] = aie.buffer(%[[TILE_0_1]]) {sym_name = "in7_prod_buff_0_3"} : memref<64x32xi32>
-// CHECK-DAG:       %[[LOCK_0_1_12:.*]] = aie.lock(%[[TILE_0_1]]) {init = 4 : i8, sym_name = "in7_prod_prod_lock_0"}
-// CHECK-DAG:       %[[LOCK_0_1_13:.*]] = aie.lock(%[[TILE_0_1]]) {init = 0 : i8, sym_name = "in7_prod_cons_lock_0"}
-// CHECK-DAG:       %[[BUFFER_1_2_14:.*]] = aie.buffer(%[[TILE_1_2]]) {sym_name = "in7_cons_buff_0_0"} : memref<64x32xi32>
-// CHECK-DAG:       %[[BUFFER_1_2_15:.*]] = aie.buffer(%[[TILE_1_2]]) {sym_name = "in7_cons_buff_0_1"} : memref<64x32xi32>
-// CHECK-DAG:       %[[BUFFER_1_2_16:.*]] = aie.buffer(%[[TILE_1_2]]) {sym_name = "in7_cons_buff_0_2"} : memref<64x32xi32>
-// CHECK-DAG:       %[[BUFFER_1_2_17:.*]] = aie.buffer(%[[TILE_1_2]]) {sym_name = "in7_cons_buff_0_3"} : memref<64x32xi32>
-// CHECK-DAG:       %[[LOCK_1_2_18:.*]] = aie.lock(%[[TILE_1_2]]) {init = 4 : i8, sym_name = "in7_cons_prod_lock_0"}
-// CHECK-DAG:       %[[LOCK_1_2_19:.*]] = aie.lock(%[[TILE_1_2]]) {init = 0 : i8, sym_name = "in7_cons_cons_lock_0"}
-// CHECK-DAG:       %[[BUFFER_0_1_20:.*]] = aie.buffer(%[[TILE_0_1]]) {sym_name = "in2_prod_buff_0_0"} : memref<32x64xi32>
-// CHECK-DAG:       %[[BUFFER_0_1_21:.*]] = aie.buffer(%[[TILE_0_1]]) {sym_name = "in2_prod_buff_0_1"} : memref<32x64xi32>
-// CHECK-DAG:       %[[BUFFER_0_1_22:.*]] = aie.buffer(%[[TILE_0_1]]) {sym_name = "in2_prod_buff_0_2"} : memref<32x64xi32>
-// CHECK-DAG:       %[[BUFFER_0_1_23:.*]] = aie.buffer(%[[TILE_0_1]]) {sym_name = "in2_prod_buff_0_3"} : memref<32x64xi32>
-// CHECK-DAG:       %[[LOCK_0_1_24:.*]] = aie.lock(%[[TILE_0_1]]) {init = 4 : i8, sym_name = "in2_prod_prod_lock_0"}
-// CHECK-DAG:       %[[LOCK_0_1_25:.*]] = aie.lock(%[[TILE_0_1]]) {init = 0 : i8, sym_name = "in2_prod_cons_lock_0"}
-// CHECK-DAG:       %[[BUFFER_0_2:.*]] = aie.buffer(%[[TILE_0_2]]) {sym_name = "in2_cons_buff_0_0"} : memref<32x64xi32>
-// CHECK-DAG:       %[[BUFFER_0_2_26:.*]] = aie.buffer(%[[TILE_0_2]]) {sym_name = "in2_cons_buff_0_1"} : memref<32x64xi32>
-// CHECK-DAG:       %[[BUFFER_0_2_27:.*]] = aie.buffer(%[[TILE_0_2]]) {sym_name = "in2_cons_buff_0_2"} : memref<32x64xi32>
-// CHECK-DAG:       %[[BUFFER_0_2_28:.*]] = aie.buffer(%[[TILE_0_2]]) {sym_name = "in2_cons_buff_0_3"} : memref<32x64xi32>
-// CHECK-DAG:       %[[LOCK_0_2:.*]] = aie.lock(%[[TILE_0_2]]) {init = 4 : i8, sym_name = "in2_cons_prod_lock_0"}
-// CHECK-DAG:       %[[LOCK_0_2_29:.*]] = aie.lock(%[[TILE_0_2]]) {init = 0 : i8, sym_name = "in2_cons_cons_lock_0"}
-// CHECK-DAG:       %[[BUFFER_1_2_30:.*]] = aie.buffer(%[[TILE_1_2]]) {sym_name = "in2_cons_buff_1_0"} : memref<32x64xi32>
-// CHECK-DAG:       %[[BUFFER_1_2_31:.*]] = aie.buffer(%[[TILE_1_2]]) {sym_name = "in2_cons_buff_1_1"} : memref<32x64xi32>
-// CHECK-DAG:       %[[BUFFER_1_2_32:.*]] = aie.buffer(%[[TILE_1_2]]) {sym_name = "in2_cons_buff_1_2"} : memref<32x64xi32>
-// CHECK-DAG:       %[[BUFFER_1_2_33:.*]] = aie.buffer(%[[TILE_1_2]]) {sym_name = "in2_cons_buff_1_3"} : memref<32x64xi32>
-// CHECK-DAG:       %[[LOCK_1_2_34:.*]] = aie.lock(%[[TILE_1_2]]) {init = 4 : i8, sym_name = "in2_cons_prod_lock_1"}
-// CHECK-DAG:       %[[LOCK_1_2_35:.*]] = aie.lock(%[[TILE_1_2]]) {init = 0 : i8, sym_name = "in2_cons_cons_lock_1"}
-// CHECK-DAG:       aie.flow(%[[TILE_0_1]], DMA : 0, %[[TILE_1_2]], DMA : 0) {symbol = @in2}
-// CHECK-DAG:       aie.flow(%[[TILE_0_1]], DMA : 0, %[[TILE_0_2]], DMA : 0) {symbol = @in2}
-// CHECK-DAG:       aie.flow(%[[TILE_0_1]], DMA : 1, %[[TILE_1_2]], DMA : 1) {symbol = @in7}
-// CHECK-DAG:       aie.flow(%[[TILE_1_2]], DMA : 0, %[[TILE_0_1]], DMA : 0) {symbol = @in8}
-// CHECK:           %[[CORE_1_2:.*]] = aie.core(%[[TILE_1_2]]) {
-// CHECK:             %[[C8:.*]] = arith.constant 8 : index
-// CHECK:             %[[C1:.*]] = arith.constant 1 : index
-// CHECK:             %[[C4:.*]] = arith.constant 4 : index
-// CHECK:             %[[C0:.*]] = arith.constant 0 : index
-// CHECK:             %[[C64:.*]] = arith.constant 64 : index
-// CHECK:             %[[C128:.*]] = arith.constant 128 : index
-// CHECK:             %[[C960:.*]] = arith.constant 960 : index
-// CHECK:             aie.use_lock(%[[LOCK_1_2]], AcquireGreaterEqual, 1)
-// CHECK:             %[[REINTERPRET_CAST:.*]] = memref.reinterpret_cast %[[BUFFER_1_2]] to offset: [0], sizes: [4, 8, 4, 8], strides: [256, 32, 8, 1] : memref<32x32xi32> to memref<4x8x4x8xi32>
-// CHECK:             aie.use_lock(%[[LOCK_1_2_34]], Release, 1)
-// CHECK:             aie.use_lock(%[[LOCK_1_2_18]], Release, 1)
-// CHECK:             scf.for %[[ARG0:.*]] = %[[C64]] to %[[C960]] step %[[C128]] {
-// CHECK:               aie.use_lock(%[[LOCK_1_2_35]], AcquireGreaterEqual, 1)
-// CHECK:               %[[REINTERPRET_CAST_36:.*]] = memref.reinterpret_cast %[[BUFFER_1_2_30]] to offset: [0], sizes: [8, 8, 4, 8], strides: [256, 32, 8, 1] : memref<32x64xi32> to memref<8x8x4x8xi32>
-// CHECK:               aie.use_lock(%[[LOCK_1_2_19]], AcquireGreaterEqual, 1)
-// CHECK:               %[[REINTERPRET_CAST_37:.*]] = memref.reinterpret_cast %[[BUFFER_1_2_14]] to offset: [0], sizes: [4, 8, 8, 8], strides: [512, 64, 8, 1] : memref<64x32xi32> to memref<4x8x8x8xi32>
-// CHECK:               scf.for %[[ARG1:.*]] = %[[C0]] to %[[C8]] step %[[C1]] {
-// CHECK:                 scf.for %[[ARG2:.*]] = %[[C0]] to %[[C4]] step %[[C1]] {
-// CHECK:                   scf.for %[[ARG3:.*]] = %[[C0]] to %[[C8]] step %[[C1]] {
-// CHECK:                     scf.for %[[ARG4:.*]] = %[[C0]] to %[[C4]] step %[[C1]] {
-// CHECK:                       scf.for %[[ARG5:.*]] = %[[C0]] to %[[C8]] step %[[C1]] {
-// CHECK:                         scf.for %[[ARG6:.*]] = %[[C0]] to %[[C8]] step %[[C1]] {
-// CHECK:                           %[[VAL_0:.*]] = memref.load %[[REINTERPRET_CAST_36]]{{\[}}%[[ARG3]], %[[ARG1]], %[[ARG4]], %[[ARG6]]] : memref<8x8x4x8xi32>
-// CHECK:                           %[[VAL_1:.*]] = memref.load %[[REINTERPRET_CAST_37]]{{\[}}%[[ARG2]], %[[ARG3]], %[[ARG6]], %[[ARG5]]] : memref<4x8x8x8xi32>
-// CHECK:                           %[[VAL_2:.*]] = memref.load %[[REINTERPRET_CAST]]{{\[}}%[[ARG2]], %[[ARG1]], %[[ARG4]], %[[ARG5]]] : memref<4x8x4x8xi32>
-// CHECK:                           %[[VAL_3:.*]] = arith.muli %[[VAL_0]], %[[VAL_1]] : i32
-// CHECK:                           %[[VAL_4:.*]] = arith.addi %[[VAL_2]], %[[VAL_3]] : i32
-// CHECK:                           memref.store %[[VAL_4]], %[[REINTERPRET_CAST]]{{\[}}%[[ARG2]], %[[ARG1]], %[[ARG4]], %[[ARG5]]] : memref<4x8x4x8xi32>
-// CHECK:                         }
-// CHECK:                       }
-// CHECK:                     }
-// CHECK:                   }
-// CHECK:                 }
-// CHECK:               }
-// CHECK:               aie.use_lock(%[[LOCK_1_2_34]], Release, 1)
-// CHECK:               aie.use_lock(%[[LOCK_1_2_18]], Release, 1)
-// CHECK:               aie.use_lock(%[[LOCK_1_2_35]], AcquireGreaterEqual, 1)
-// CHECK:               %[[REINTERPRET_CAST_38:.*]] = memref.reinterpret_cast %[[BUFFER_1_2_31]] to offset: [0], sizes: [8, 8, 4, 8], strides: [256, 32, 8, 1] : memref<32x64xi32> to memref<8x8x4x8xi32>
-// CHECK:               aie.use_lock(%[[LOCK_1_2_19]], AcquireGreaterEqual, 1)
-// CHECK:               %[[REINTERPRET_CAST_39:.*]] = memref.reinterpret_cast %[[BUFFER_1_2_15]] to offset: [0], sizes: [4, 8, 8, 8], strides: [512, 64, 8, 1] : memref<64x32xi32> to memref<4x8x8x8xi32>
-// CHECK:               scf.for %[[ARG1:.*]] = %[[C0]] to %[[C8]] step %[[C1]] {
-// CHECK:                 scf.for %[[ARG2:.*]] = %[[C0]] to %[[C4]] step %[[C1]] {
-// CHECK:                   scf.for %[[ARG3:.*]] = %[[C0]] to %[[C8]] step %[[C1]] {
-// CHECK:                     scf.for %[[ARG4:.*]] = %[[C0]] to %[[C4]] step %[[C1]] {
-// CHECK:                       scf.for %[[ARG5:.*]] = %[[C0]] to %[[C8]] step %[[C1]] {
-// CHECK:                         scf.for %[[ARG6:.*]] = %[[C0]] to %[[C8]] step %[[C1]] {
-// CHECK:                           %[[VAL_5:.*]] = memref.load %[[REINTERPRET_CAST_38]]{{\[}}%[[ARG3]], %[[ARG1]], %[[ARG4]], %[[ARG6]]] : memref<8x8x4x8xi32>
-// CHECK:                           %[[VAL_6:.*]] = memref.load %[[REINTERPRET_CAST_39]]{{\[}}%[[ARG2]], %[[ARG3]], %[[ARG6]], %[[ARG5]]] : memref<4x8x8x8xi32>
-// CHECK:                           %[[VAL_7:.*]] = memref.load %[[REINTERPRET_CAST]]{{\[}}%[[ARG2]], %[[ARG1]], %[[ARG4]], %[[ARG5]]] : memref<4x8x4x8xi32>
-// CHECK:                           %[[VAL_8:.*]] = arith.muli %[[VAL_5]], %[[VAL_6]] : i32
-// CHECK:                           %[[VAL_9:.*]] = arith.addi %[[VAL_7]], %[[VAL_8]] : i32
-// CHECK:                           memref.store %[[VAL_9]], %[[REINTERPRET_CAST]]{{\[}}%[[ARG2]], %[[ARG1]], %[[ARG4]], %[[ARG5]]] : memref<4x8x4x8xi32>
-// CHECK:                         }
-// CHECK:                       }
-// CHECK:                     }
-// CHECK:                   }
-// CHECK:                 }
-// CHECK:               }
-// CHECK:               aie.use_lock(%[[LOCK_1_2_34]], Release, 1)
-// CHECK:               aie.use_lock(%[[LOCK_1_2_18]], Release, 1)
-// CHECK:             }
-// CHECK:             aie.end
-// CHECK:           }
-// CHECK:           %[[MEMTILE_DMA_0_1:.*]] = aie.memtile_dma(%[[TILE_0_1]]) {
-// CHECK:             %[[VAL_10:.*]] = aie.dma_start(MM2S, 0, ^bb1, ^bb5)
-// CHECK:           ^bb1:
-// CHECK:             aie.use_lock(%[[LOCK_0_1_25]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[BUFFER_0_1_20]] : memref<32x64xi32>) {len = 2048 : i32}
-// CHECK:             aie.use_lock(%[[LOCK_0_1_24]], Release, 1)
-// CHECK:             aie.next_bd ^bb2
-// CHECK:           ^bb2:
-// CHECK:             aie.use_lock(%[[LOCK_0_1_25]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[BUFFER_0_1_21]] : memref<32x64xi32>) {len = 2048 : i32}
-// CHECK:             aie.use_lock(%[[LOCK_0_1_24]], Release, 1)
-// CHECK:             aie.next_bd ^bb3
-// CHECK:           ^bb3:
-// CHECK:             aie.use_lock(%[[LOCK_0_1_25]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[BUFFER_0_1_22]] : memref<32x64xi32>) {len = 2048 : i32}
-// CHECK:             aie.use_lock(%[[LOCK_0_1_24]], Release, 1)
-// CHECK:             aie.next_bd ^bb4
-// CHECK:           ^bb4:
-// CHECK:             aie.use_lock(%[[LOCK_0_1_25]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[BUFFER_0_1_23]] : memref<32x64xi32>) {len = 2048 : i32}
-// CHECK:             aie.use_lock(%[[LOCK_0_1_24]], Release, 1)
-// CHECK:             aie.next_bd ^bb1
-// CHECK:           ^bb5:
-// CHECK:             %[[VAL_11:.*]] = aie.dma_start(MM2S, 1, ^bb6, ^bb10)
-// CHECK:           ^bb6:
-// CHECK:             aie.use_lock(%[[LOCK_0_1_13]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[BUFFER_0_1_8]] : memref<64x32xi32>) {len = 2048 : i32}
-// CHECK:             aie.use_lock(%[[LOCK_0_1_12]], Release, 1)
-// CHECK:             aie.next_bd ^bb7
-// CHECK:           ^bb7:
-// CHECK:             aie.use_lock(%[[LOCK_0_1_13]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[BUFFER_0_1_9]] : memref<64x32xi32>) {len = 2048 : i32}
-// CHECK:             aie.use_lock(%[[LOCK_0_1_12]], Release, 1)
-// CHECK:             aie.next_bd ^bb8
-// CHECK:           ^bb8:
-// CHECK:             aie.use_lock(%[[LOCK_0_1_13]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[BUFFER_0_1_10]] : memref<64x32xi32>) {len = 2048 : i32}
-// CHECK:             aie.use_lock(%[[LOCK_0_1_12]], Release, 1)
-// CHECK:             aie.next_bd ^bb9
-// CHECK:           ^bb9:
-// CHECK:             aie.use_lock(%[[LOCK_0_1_13]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[BUFFER_0_1_11]] : memref<64x32xi32>) {len = 2048 : i32}
-// CHECK:             aie.use_lock(%[[LOCK_0_1_12]], Release, 1)
-// CHECK:             aie.next_bd ^bb6
-// CHECK:           ^bb10:
-// CHECK:             %[[VAL_12:.*]] = aie.dma_start(S2MM, 0, ^bb11, ^bb15)
-// CHECK:           ^bb11:
-// CHECK:             aie.use_lock(%[[LOCK_0_1]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[BUFFER_0_1]] : memref<32x32xi32>) {len = 1024 : i32}
-// CHECK:             aie.use_lock(%[[LOCK_0_1_7]], Release, 1)
-// CHECK:             aie.next_bd ^bb12
-// CHECK:           ^bb12:
-// CHECK:             aie.use_lock(%[[LOCK_0_1]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[BUFFER_0_1_4]] : memref<32x32xi32>) {len = 1024 : i32}
-// CHECK:             aie.use_lock(%[[LOCK_0_1_7]], Release, 1)
-// CHECK:             aie.next_bd ^bb13
-// CHECK:           ^bb13:
-// CHECK:             aie.use_lock(%[[LOCK_0_1]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[BUFFER_0_1_5]] : memref<32x32xi32>) {len = 1024 : i32}
-// CHECK:             aie.use_lock(%[[LOCK_0_1_7]], Release, 1)
-// CHECK:             aie.next_bd ^bb14
-// CHECK:           ^bb14:
-// CHECK:             aie.use_lock(%[[LOCK_0_1]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[BUFFER_0_1_6]] : memref<32x32xi32>) {len = 1024 : i32}
-// CHECK:             aie.use_lock(%[[LOCK_0_1_7]], Release, 1)
-// CHECK:             aie.next_bd ^bb11
-// CHECK:           ^bb15:
-// CHECK:             aie.end
-// CHECK:           }
-// CHECK:           %[[MEM_0_2:.*]] = aie.mem(%[[TILE_0_2]]) {
-// CHECK:             %[[VAL_13:.*]] = aie.dma_start(S2MM, 0, ^bb1, ^bb5)
-// CHECK:           ^bb1:
-// CHECK:             aie.use_lock(%[[LOCK_0_2]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[BUFFER_0_2]] : memref<32x64xi32>) {len = 2048 : i32}
-// CHECK:             aie.use_lock(%[[LOCK_0_2_29]], Release, 1)
-// CHECK:             aie.next_bd ^bb2
-// CHECK:           ^bb2:
-// CHECK:             aie.use_lock(%[[LOCK_0_2]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[BUFFER_0_2_26]] : memref<32x64xi32>) {len = 2048 : i32}
-// CHECK:             aie.use_lock(%[[LOCK_0_2_29]], Release, 1)
-// CHECK:             aie.next_bd ^bb3
-// CHECK:           ^bb3:
-// CHECK:             aie.use_lock(%[[LOCK_0_2]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[BUFFER_0_2_27]] : memref<32x64xi32>) {len = 2048 : i32}
-// CHECK:             aie.use_lock(%[[LOCK_0_2_29]], Release, 1)
-// CHECK:             aie.next_bd ^bb4
-// CHECK:           ^bb4:
-// CHECK:             aie.use_lock(%[[LOCK_0_2]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[BUFFER_0_2_28]] : memref<32x64xi32>) {len = 2048 : i32}
-// CHECK:             aie.use_lock(%[[LOCK_0_2_29]], Release, 1)
-// CHECK:             aie.next_bd ^bb1
-// CHECK:           ^bb5:
-// CHECK:             aie.end
-// CHECK:           }
-// CHECK:           %[[MEM_1_2:.*]] = aie.mem(%[[TILE_1_2]]) {
-// CHECK:             %[[VAL_14:.*]] = aie.dma_start(S2MM, 0, ^bb1, ^bb5)
-// CHECK:           ^bb1:
-// CHECK:             aie.use_lock(%[[LOCK_1_2_34]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[BUFFER_1_2_30]] : memref<32x64xi32>) {len = 2048 : i32}
-// CHECK:             aie.use_lock(%[[LOCK_1_2_35]], Release, 1)
-// CHECK:             aie.next_bd ^bb2
-// CHECK:           ^bb2:
-// CHECK:             aie.use_lock(%[[LOCK_1_2_34]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[BUFFER_1_2_31]] : memref<32x64xi32>) {len = 2048 : i32}
-// CHECK:             aie.use_lock(%[[LOCK_1_2_35]], Release, 1)
-// CHECK:             aie.next_bd ^bb3
-// CHECK:           ^bb3:
-// CHECK:             aie.use_lock(%[[LOCK_1_2_34]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[BUFFER_1_2_32]] : memref<32x64xi32>) {len = 2048 : i32}
-// CHECK:             aie.use_lock(%[[LOCK_1_2_35]], Release, 1)
-// CHECK:             aie.next_bd ^bb4
-// CHECK:           ^bb4:
-// CHECK:             aie.use_lock(%[[LOCK_1_2_34]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[BUFFER_1_2_33]] : memref<32x64xi32>) {len = 2048 : i32}
-// CHECK:             aie.use_lock(%[[LOCK_1_2_35]], Release, 1)
-// CHECK:             aie.next_bd ^bb1
-// CHECK:           ^bb5:
-// CHECK:             %[[VAL_15:.*]] = aie.dma_start(S2MM, 1, ^bb6, ^bb10)
-// CHECK:           ^bb6:
-// CHECK:             aie.use_lock(%[[LOCK_1_2_18]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[BUFFER_1_2_14]] : memref<64x32xi32>) {len = 2048 : i32}
-// CHECK:             aie.use_lock(%[[LOCK_1_2_19]], Release, 1)
-// CHECK:             aie.next_bd ^bb7
-// CHECK:           ^bb7:
-// CHECK:             aie.use_lock(%[[LOCK_1_2_18]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[BUFFER_1_2_15]] : memref<64x32xi32>) {len = 2048 : i32}
-// CHECK:             aie.use_lock(%[[LOCK_1_2_19]], Release, 1)
-// CHECK:             aie.next_bd ^bb8
-// CHECK:           ^bb8:
-// CHECK:             aie.use_lock(%[[LOCK_1_2_18]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[BUFFER_1_2_16]] : memref<64x32xi32>) {len = 2048 : i32}
-// CHECK:             aie.use_lock(%[[LOCK_1_2_19]], Release, 1)
-// CHECK:             aie.next_bd ^bb9
-// CHECK:           ^bb9:
-// CHECK:             aie.use_lock(%[[LOCK_1_2_18]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[BUFFER_1_2_17]] : memref<64x32xi32>) {len = 2048 : i32}
-// CHECK:             aie.use_lock(%[[LOCK_1_2_19]], Release, 1)
-// CHECK:             aie.next_bd ^bb6
-// CHECK:           ^bb10:
-// CHECK:             %[[VAL_16:.*]] = aie.dma_start(MM2S, 0, ^bb11, ^bb15)
-// CHECK:           ^bb11:
-// CHECK:             aie.use_lock(%[[LOCK_1_2_3]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[BUFFER_1_2]] : memref<32x32xi32>) {len = 1024 : i32}
-// CHECK:             aie.use_lock(%[[LOCK_1_2]], Release, 1)
-// CHECK:             aie.next_bd ^bb12
-// CHECK:           ^bb12:
-// CHECK:             aie.use_lock(%[[LOCK_1_2_3]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[BUFFER_1_2_0]] : memref<32x32xi32>) {len = 1024 : i32}
-// CHECK:             aie.use_lock(%[[LOCK_1_2]], Release, 1)
-// CHECK:             aie.next_bd ^bb13
-// CHECK:           ^bb13:
-// CHECK:             aie.use_lock(%[[LOCK_1_2_3]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[BUFFER_1_2_1]] : memref<32x32xi32>) {len = 1024 : i32}
-// CHECK:             aie.use_lock(%[[LOCK_1_2]], Release, 1)
-// CHECK:             aie.next_bd ^bb14
-// CHECK:           ^bb14:
-// CHECK:             aie.use_lock(%[[LOCK_1_2_3]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[BUFFER_1_2_2]] : memref<32x32xi32>) {len = 1024 : i32}
-// CHECK:             aie.use_lock(%[[LOCK_1_2]], Release, 1)
-// CHECK:             aie.next_bd ^bb11
-// CHECK:           ^bb15:
-// CHECK:             aie.end
-// CHECK:           }
-// CHECK:         }
-aie.device(npu1_4col) {
-  %tile_0_1 = aie.tile(0, 1)
-  %tile_1_2 = aie.tile(1, 2)
-  %tile_0_2 = aie.tile(0, 2)
-  aie.flow(%tile_0_1, DMA : 0, %tile_1_2, DMA : 0) {symbol = @in2}
-  aie.flow(%tile_0_1, DMA : 0, %tile_0_2, DMA : 0) {symbol = @in2}
-  aie.flow(%tile_0_1, DMA : 1, %tile_1_2, DMA : 1) {symbol = @in7}
-  aie.flow(%tile_1_2, DMA : 0, %tile_0_1, DMA : 0) {symbol = @in8}
-  aie.objectfifo @in2(%tile_0_1, {%tile_0_2, %tile_1_2}, 4 : i32) : !aie.objectfifo<memref<32x64xi32>>
-  aie.objectfifo @in7(%tile_0_1, {%tile_1_2}, 4 : i32) : !aie.objectfifo<memref<64x32xi32>>
-  aie.objectfifo @in8(%tile_1_2, {%tile_0_1}, 4 : i32) : !aie.objectfifo<memref<32x32xi32>>
-  %core_1_2 = aie.core(%tile_1_2) {
-    %c8 = arith.constant 8 : index
-    %c1 = arith.constant 1 : index
-    %c4 = arith.constant 4 : index
-    %c0 = arith.constant 0 : index
-    %c64 = arith.constant 64 : index
-    %c128 = arith.constant 128 : index
-    %c960 = arith.constant 960 : index
-    %0 = aie.objectfifo.acquire @in8(Produce, 1) : !aie.objectfifosubview<memref<32x32xi32>>
-    %1 = aie.objectfifo.subview.access %0[0] : !aie.objectfifosubview<memref<32x32xi32>> -> memref<32x32xi32>
-    %reinterpret_cast = memref.reinterpret_cast %1 to offset: [0], sizes: [4, 8, 4, 8], strides: [256, 32, 8, 1] : memref<32x32xi32> to memref<4x8x4x8xi32>
-    aie.objectfifo.release @in2(Consume, 1)
-    aie.objectfifo.release @in7(Consume, 1)
-    scf.for %arg0 = %c64 to %c960 step %c128 {
-      %10 = aie.objectfifo.acquire @in2(Consume, 1) : !aie.objectfifosubview<memref<32x64xi32>>
-      %11 = aie.objectfifo.subview.access %10[0] : !aie.objectfifosubview<memref<32x64xi32>> -> memref<32x64xi32>
-      %reinterpret_cast_4 = memref.reinterpret_cast %11 to offset: [0], sizes: [8, 8, 4, 8], strides: [256, 32, 8, 1] : memref<32x64xi32> to memref<8x8x4x8xi32>
-      %12 = aie.objectfifo.acquire @in7(Consume, 1) : !aie.objectfifosubview<memref<64x32xi32>>
-      %13 = aie.objectfifo.subview.access %12[0] : !aie.objectfifosubview<memref<64x32xi32>> -> memref<64x32xi32>
-      %reinterpret_cast_5 = memref.reinterpret_cast %13 to offset: [0], sizes: [4, 8, 8, 8], strides: [512, 64, 8, 1] : memref<64x32xi32> to memref<4x8x8x8xi32>
-      scf.for %arg1 = %c0 to %c8 step %c1 {
-        scf.for %arg2 = %c0 to %c4 step %c1 {
-          scf.for %arg3 = %c0 to %c8 step %c1 {
-            scf.for %arg4 = %c0 to %c4 step %c1 {
-              scf.for %arg5 = %c0 to %c8 step %c1 {
-                scf.for %arg6 = %c0 to %c8 step %c1 {
-                  %14 = memref.load %reinterpret_cast_4[%arg3, %arg1, %arg4, %arg6] : memref<8x8x4x8xi32>
-                  %15 = memref.load %reinterpret_cast_5[%arg2, %arg3, %arg6, %arg5] : memref<4x8x8x8xi32>
-                  %16 = memref.load %reinterpret_cast[%arg2, %arg1, %arg4, %arg5] : memref<4x8x4x8xi32>
-                  %17 = arith.muli %14, %15 : i32
-                  %18 = arith.addi %16, %17 : i32
-                  memref.store %18, %reinterpret_cast[%arg2, %arg1, %arg4, %arg5] : memref<4x8x4x8xi32>
-                }
-              }
-            }
-          }
-        }
-      }
-      aie.objectfifo.release @in2(Consume, 1)
-      aie.objectfifo.release @in7(Consume, 1)
-      %19 = aie.objectfifo.acquire @in2(Consume, 1) : !aie.objectfifosubview<memref<32x64xi32>>
-      %20 = aie.objectfifo.subview.access %19[0] : !aie.objectfifosubview<memref<32x64xi32>> -> memref<32x64xi32>
-      %reinterpret_cast_6 = memref.reinterpret_cast %20 to offset: [0], sizes: [8, 8, 4, 8], strides: [256, 32, 8, 1] : memref<32x64xi32> to memref<8x8x4x8xi32>
-      %21 = aie.objectfifo.acquire @in7(Consume, 1) : !aie.objectfifosubview<memref<64x32xi32>>
-      %22 = aie.objectfifo.subview.access %21[0] : !aie.objectfifosubview<memref<64x32xi32>> -> memref<64x32xi32>
-      %reinterpret_cast_7 = memref.reinterpret_cast %22 to offset: [0], sizes: [4, 8, 8, 8], strides: [512, 64, 8, 1] : memref<64x32xi32> to memref<4x8x8x8xi32>
-      scf.for %arg1 = %c0 to %c8 step %c1 {
-        scf.for %arg2 = %c0 to %c4 step %c1 {
-          scf.for %arg3 = %c0 to %c8 step %c1 {
-            scf.for %arg4 = %c0 to %c4 step %c1 {
-              scf.for %arg5 = %c0 to %c8 step %c1 {
-                scf.for %arg6 = %c0 to %c8 step %c1 {
-                  %23 = memref.load %reinterpret_cast_6[%arg3, %arg1, %arg4, %arg6] : memref<8x8x4x8xi32>
-                  %24 = memref.load %reinterpret_cast_7[%arg2, %arg3, %arg6, %arg5] : memref<4x8x8x8xi32>
-                  %25 = memref.load %reinterpret_cast[%arg2, %arg1, %arg4, %arg5] : memref<4x8x4x8xi32>
-                  %26 = arith.muli %23, %24 : i32
-                  %27 = arith.addi %25, %26 : i32
-                  memref.store %27, %reinterpret_cast[%arg2, %arg1, %arg4, %arg5] : memref<4x8x4x8xi32>
-                }
-              }
-            }
-          }
-        }
-      }
-      aie.objectfifo.release @in2(Consume, 1)
-      aie.objectfifo.release @in7(Consume, 1)
-    }
-    aie.end
-  }
-}
diff --git a/compiler/plugins/target/AMD-AIE/aie/test/non_adjacency_test_1.mlir b/compiler/plugins/target/AMD-AIE/aie/test/non_adjacency_test_1.mlir
deleted file mode 100644
index 5e5349a8b..000000000
--- a/compiler/plugins/target/AMD-AIE/aie/test/non_adjacency_test_1.mlir
+++ /dev/null
@@ -1,125 +0,0 @@
-
-// RUN: iree-opt --amdaie-objectFifo-stateful-transform %s | FileCheck %s
-
-// CHECK-LABEL:   aie.device(npu1_4col) {
-// CHECK:           memref.global "public" @objfifo : memref<16xi32>
-// CHECK-DAG:       %[[TILE_1_2:.*]] = aie.tile(1, 2)
-// CHECK-DAG:       %[[TILE_3_3:.*]] = aie.tile(3, 3)
-// CHECK-DAG:       %[[OBJFIFO_CONS_BUFF_0:.*]] = aie.buffer(%[[TILE_3_3]]) {sym_name = "objfifo_cons_buff_0_0"} : memref<16xi32>
-// CHECK-DAG:       %[[OBJFIFO_CONS_BUFF_1:.*]] = aie.buffer(%[[TILE_3_3]]) {sym_name = "objfifo_cons_buff_0_1"} : memref<16xi32>
-// CHECK-DAG:       %[[OBJFIFO_CONS_PROD_LOCK:.*]] = aie.lock(%[[TILE_3_3]]) {init = 2 : i8, sym_name = "objfifo_cons_prod_lock_0"}
-// CHECK-DAG:       %[[OBJFIFO_CONS_CONS_LOCK:.*]] = aie.lock(%[[TILE_3_3]]) {init = 0 : i8, sym_name = "objfifo_cons_cons_lock_0"}
-// CHECK-DAG:       %[[OBJFIFO_BUFF_0:.*]] = aie.buffer(%[[TILE_1_2]]) {sym_name = "objfifo_prod_buff_0_0"} : memref<16xi32>
-// CHECK-DAG:       %[[OBJFIFO_BUFF_1:.*]] = aie.buffer(%[[TILE_1_2]]) {sym_name = "objfifo_prod_buff_0_1"} : memref<16xi32>
-// CHECK-DAG:       %[[OBJFIFO_PROD_LOCK:.*]] = aie.lock(%[[TILE_1_2]]) {init = 2 : i8, sym_name = "objfifo_prod_prod_lock_0"}
-// CHECK-DAG:       %[[OBJFIFO_CONS_LOCK:.*]] = aie.lock(%[[TILE_1_2]]) {init = 0 : i8, sym_name = "objfifo_prod_cons_lock_0"}
-// CHECK-DAG:       aie.flow(%[[TILE_1_2]], DMA : 0, %[[TILE_3_3]], DMA : 0)
-// CHECK:           func.func @some_work(%[[ARG0:.*]]: memref<16xi32>) {
-// CHECK:             return
-// CHECK:           }
-// CHECK:           %[[CORE_1_2:.*]] = aie.core(%[[TILE_1_2]]) {
-// CHECK-DAG:         %[[C0:.*]] = arith.constant 0 : index
-// CHECK-DAG:         %[[C1:.*]] = arith.constant 1 : index
-// CHECK-DAG:         %[[C2:.*]] = arith.constant 2 : index
-// CHECK-DAG:         %[[C12:.*]] = arith.constant 12 : index
-// CHECK:             scf.for %[[ARG0:.*]] = %[[C0]] to %[[C12]] step %[[C2]] {
-// CHECK:               aie.use_lock(%[[OBJFIFO_PROD_LOCK]], AcquireGreaterEqual, 1)
-// CHECK:               func.call @some_work(%[[OBJFIFO_BUFF_0]]) : (memref<16xi32>) -> ()
-// CHECK:               aie.use_lock(%[[OBJFIFO_CONS_LOCK]], Release, 1)
-// CHECK:               aie.use_lock(%[[OBJFIFO_PROD_LOCK]], AcquireGreaterEqual, 1)
-// CHECK:               func.call @some_work(%[[OBJFIFO_BUFF_1]]) : (memref<16xi32>) -> ()
-// CHECK:               aie.use_lock(%[[OBJFIFO_CONS_LOCK]], Release, 1)
-// CHECK:             }
-// CHECK:             aie.end
-// CHECK:           }
-// CHECK:           %[[CORE_3_3:.*]] = aie.core(%[[TILE_3_3]]) {
-// CHECK-DAG:         %[[C0:.*]] = arith.constant 0 : index
-// CHECK-DAG:         %[[C1:.*]] = arith.constant 1 : index
-// CHECK-DAG:         %[[C2:.*]] = arith.constant 2 : index
-// CHECK-DAG:         %[[C12:.*]] = arith.constant 12 : index
-// CHECK:             scf.for %[[ARG0:.*]] = %[[C0]] to %[[C12]] step %[[C2]] {
-// CHECK:               aie.use_lock(%[[OBJFIFO_CONS_CONS_LOCK]], AcquireGreaterEqual, 1)
-// CHECK:               func.call @some_work(%[[OBJFIFO_CONS_BUFF_0]]) : (memref<16xi32>) -> ()
-// CHECK:               aie.use_lock(%[[OBJFIFO_CONS_PROD_LOCK]], Release, 1)
-// CHECK:               aie.use_lock(%[[OBJFIFO_CONS_CONS_LOCK]], AcquireGreaterEqual, 1)
-// CHECK:               func.call @some_work(%[[OBJFIFO_CONS_BUFF_1]]) : (memref<16xi32>) -> ()
-// CHECK:               aie.use_lock(%[[OBJFIFO_CONS_PROD_LOCK]], Release, 1)
-// CHECK:             }
-// CHECK:             aie.end
-// CHECK:           }
-// CHECK:           %[[MEM_1_2:.*]] = aie.mem(%[[TILE_1_2]]) {
-// CHECK:             %[[VAL_0:.*]] = aie.dma_start(MM2S, 0, ^bb1, ^bb3)
-// CHECK:           ^bb1:
-// CHECK:             aie.use_lock(%[[OBJFIFO_CONS_LOCK]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[OBJFIFO_BUFF_0]] : memref<16xi32>) {len = 16 : i32}
-// CHECK:             aie.use_lock(%[[OBJFIFO_PROD_LOCK]], Release, 1)
-// CHECK:             aie.next_bd ^bb2
-// CHECK:           ^bb2:
-// CHECK:             aie.use_lock(%[[OBJFIFO_CONS_LOCK]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[OBJFIFO_BUFF_1]] : memref<16xi32>) {len = 16 : i32}
-// CHECK:             aie.use_lock(%[[OBJFIFO_PROD_LOCK]], Release, 1)
-// CHECK:             aie.next_bd ^bb1
-// CHECK:           ^bb3:
-// CHECK:             aie.end
-// CHECK:           }
-// CHECK:           %[[MEM_3_3:.*]] = aie.mem(%[[TILE_3_3]]) {
-// CHECK:             %[[VAL_1:.*]] = aie.dma_start(S2MM, 0, ^bb1, ^bb3)
-// CHECK:           ^bb1:
-// CHECK:             aie.use_lock(%[[OBJFIFO_CONS_PROD_LOCK]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[OBJFIFO_CONS_BUFF_0]] : memref<16xi32>) {len = 16 : i32}
-// CHECK:             aie.use_lock(%[[OBJFIFO_CONS_CONS_LOCK]], Release, 1)
-// CHECK:             aie.next_bd ^bb2
-// CHECK:           ^bb2:
-// CHECK:             aie.use_lock(%[[OBJFIFO_CONS_PROD_LOCK]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[OBJFIFO_CONS_BUFF_1]] : memref<16xi32>) {len = 16 : i32}
-// CHECK:             aie.use_lock(%[[OBJFIFO_CONS_CONS_LOCK]], Release, 1)
-// CHECK:             aie.next_bd ^bb1
-// CHECK:           ^bb3:
-// CHECK:             aie.end
-// CHECK:           }
-// CHECK:         }
-module @non_adjacency {
-    aie.device(npu1_4col) {
-        %tile12 = aie.tile(1, 2)
-        %tile33 = aie.tile(3, 3)
-        aie.flow(%tile12, DMA : 0, %tile33, DMA : 0) {symbol = @objfifo}
-        aie.objectfifo @objfifo (%tile12, {%tile33}, 2 : i32) : !aie.objectfifo<memref<16xi32>>
-        func.func @some_work(%lineOut : memref<16xi32>) -> () {
-            return
-        }
-        %core12 = aie.core(%tile12) {
-            %c0 = arith.constant 0 : index
-            %c1 = arith.constant 1 : index
-            %c2 = arith.constant 2 : index
-            %height = arith.constant 12 : index
-            scf.for %indexInHeight = %c0 to %height step %c2 {
-                %subview = aie.objectfifo.acquire @objfifo (Produce, 1) : !aie.objectfifosubview<memref<16xi32>>
-                %elem0 = aie.objectfifo.subview.access %subview[0] : !aie.objectfifosubview<memref<16xi32>> -> memref<16xi32>
-                func.call @some_work(%elem0) : (memref<16xi32>) -> ()
-                aie.objectfifo.release @objfifo (Produce, 1)
-                %subview1 = aie.objectfifo.acquire @objfifo (Produce, 1) : !aie.objectfifosubview<memref<16xi32>>
-                %elem1 = aie.objectfifo.subview.access %subview1[0] : !aie.objectfifosubview<memref<16xi32>> -> memref<16xi32>
-                func.call @some_work(%elem1) : (memref<16xi32>) -> ()
-                aie.objectfifo.release @objfifo (Produce, 1)
-            }
-            aie.end
-        }
-        %core33 = aie.core(%tile33) {
-            %c0 = arith.constant 0 : index
-            %c1 = arith.constant 1 : index
-            %c2 = arith.constant 2 : index
-            %height = arith.constant 12 : index
-            scf.for %indexInHeight = %c0 to %height step %c2 {
-                %subview = aie.objectfifo.acquire @objfifo (Consume, 1) : !aie.objectfifosubview<memref<16xi32>>
-                %elem0 = aie.objectfifo.subview.access %subview[0] : !aie.objectfifosubview<memref<16xi32>> -> memref<16xi32>
-                func.call @some_work(%elem0) : (memref<16xi32>) -> ()
-                aie.objectfifo.release @objfifo (Consume, 1)
-                %subview1 = aie.objectfifo.acquire @objfifo (Consume, 1) : !aie.objectfifosubview<memref<16xi32>>
-                %elem1 = aie.objectfifo.subview.access %subview1[0] : !aie.objectfifosubview<memref<16xi32>> -> memref<16xi32>
-                func.call @some_work(%elem1) : (memref<16xi32>) -> ()
-                aie.objectfifo.release @objfifo (Consume, 1)
-            }
-            aie.end
-        }
-    }
-}
diff --git a/compiler/plugins/target/AMD-AIE/aie/test/non_adjacency_test_2.mlir b/compiler/plugins/target/AMD-AIE/aie/test/non_adjacency_test_2.mlir
deleted file mode 100644
index 480e9cfea..000000000
--- a/compiler/plugins/target/AMD-AIE/aie/test/non_adjacency_test_2.mlir
+++ /dev/null
@@ -1,139 +0,0 @@
-
-// RUN: iree-opt --amdaie-objectFifo-stateful-transform %s | FileCheck %s
-
-// CHECK-LABEL:   aie.device(npu1_4col) {
-// CHECK:           memref.global "public" @objfifo : memref<16xi32>
-// CHECK-DAG:       %[[TILE_1_2:.*]] = aie.tile(1, 2)
-// CHECK-DAG:       %[[TILE_3_3:.*]] = aie.tile(3, 3)
-// CHECK-DAG:       %[[BUFFER_1_2:.*]] = aie.buffer(%[[TILE_1_2]]) {sym_name = "objfifo_prod_buff_0_0"} : memref<16xi32>
-// CHECK-DAG:       %[[BUFFER_1_2_0:.*]] = aie.buffer(%[[TILE_1_2]]) {sym_name = "objfifo_prod_buff_0_1"} : memref<16xi32>
-// CHECK-DAG:       %[[LOCK_1_2:.*]] = aie.lock(%[[TILE_1_2]]) {init = 2 : i8, sym_name = "objfifo_prod_prod_lock_0"}
-// CHECK-DAG:       %[[LOCK_1_2_1:.*]] = aie.lock(%[[TILE_1_2]]) {init = 0 : i8, sym_name = "objfifo_prod_cons_lock_0"}
-// CHECK-DAG:       %[[BUFFER_3_3:.*]] = aie.buffer(%[[TILE_3_3]]) {sym_name = "objfifo_cons_buff_0_0"} : memref<16xi32>
-// CHECK-DAG:       %[[BUFFER_3_3_2:.*]] = aie.buffer(%[[TILE_3_3]]) {sym_name = "objfifo_cons_buff_0_1"} : memref<16xi32>
-// CHECK-DAG:       %[[LOCK_3_3:.*]] = aie.lock(%[[TILE_3_3]]) {init = 2 : i8, sym_name = "objfifo_cons_prod_lock_0"}
-// CHECK-DAG:       %[[LOCK_3_3_3:.*]] = aie.lock(%[[TILE_3_3]]) {init = 0 : i8, sym_name = "objfifo_cons_cons_lock_0"}
-// CHECK-DAG:       aie.flow(%[[TILE_1_2]], DMA : 0, %[[TILE_3_3]], DMA : 0) {symbol = @objfifo}
-// CHECK:           func.func @some_work(%[[ARG0:.*]]: memref<16xi32>) {
-// CHECK:             return
-// CHECK:           }
-// CHECK:           %[[CORE_1_2:.*]] = aie.core(%[[TILE_1_2]]) {
-// CHECK:             %[[C0:.*]] = arith.constant 0 : index
-// CHECK:             %[[C2:.*]] = arith.constant 2 : index
-// CHECK:             %[[C12:.*]] = arith.constant 12 : index
-// CHECK:             scf.for %[[ARG0:.*]] = %[[C0]] to %[[C12]] step %[[C2]] {
-// CHECK:               aie.use_lock(%[[LOCK_1_2]], AcquireGreaterEqual, 1)
-// CHECK:               func.call @some_work(%[[BUFFER_1_2]]) : (memref<16xi32>) -> ()
-// CHECK:               aie.use_lock(%[[LOCK_1_2_1]], Release, 1)
-// CHECK:               aie.use_lock(%[[LOCK_1_2]], AcquireGreaterEqual, 1)
-// CHECK:               func.call @some_work(%[[BUFFER_1_2_0]]) : (memref<16xi32>) -> ()
-// CHECK:               aie.use_lock(%[[LOCK_1_2_1]], Release, 1)
-// CHECK:             }
-// CHECK:             aie.end
-// CHECK:           }
-// CHECK:           %[[CORE_3_3:.*]] = aie.core(%[[TILE_3_3]]) {
-// CHECK:             %[[C0:.*]] = arith.constant 0 : index
-// CHECK:             %[[C4:.*]] = arith.constant 4 : index
-// CHECK:             %[[C12:.*]] = arith.constant 12 : index
-// CHECK:             scf.for %[[ARG0:.*]] = %[[C0]] to %[[C12]] step %[[C4]] {
-// CHECK:               aie.use_lock(%[[LOCK_3_3_3]], AcquireGreaterEqual, 2)
-// CHECK:               func.call @some_work(%[[BUFFER_3_3]]) : (memref<16xi32>) -> ()
-// CHECK:               aie.use_lock(%[[LOCK_3_3]], Release, 2)
-// CHECK:               aie.use_lock(%[[LOCK_3_3_3]], AcquireGreaterEqual, 2)
-// CHECK:               func.call @some_work(%[[BUFFER_3_3]]) : (memref<16xi32>) -> ()
-// CHECK:               aie.use_lock(%[[LOCK_3_3]], Release, 2)
-// CHECK:               aie.use_lock(%[[LOCK_3_3_3]], AcquireGreaterEqual, 2)
-// CHECK:               func.call @some_work(%[[BUFFER_3_3]]) : (memref<16xi32>) -> ()
-// CHECK:               aie.use_lock(%[[LOCK_3_3]], Release, 2)
-// CHECK:               aie.use_lock(%[[LOCK_3_3_3]], AcquireGreaterEqual, 2)
-// CHECK:               func.call @some_work(%[[BUFFER_3_3]]) : (memref<16xi32>) -> ()
-// CHECK:               aie.use_lock(%[[LOCK_3_3]], Release, 2)
-// CHECK:             }
-// CHECK:             aie.end
-// CHECK:           }
-// CHECK:           %[[MEM_1_2:.*]] = aie.mem(%[[TILE_1_2]]) {
-// CHECK:             %[[VAL_0:.*]] = aie.dma_start(MM2S, 0, ^bb1, ^bb3)
-// CHECK:           ^bb1:
-// CHECK:             aie.use_lock(%[[LOCK_1_2_1]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[BUFFER_1_2]] : memref<16xi32>) {len = 16 : i32}
-// CHECK:             aie.use_lock(%[[LOCK_1_2]], Release, 1)
-// CHECK:             aie.next_bd ^bb2
-// CHECK:           ^bb2:
-// CHECK:             aie.use_lock(%[[LOCK_1_2_1]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[BUFFER_1_2_0]] : memref<16xi32>) {len = 16 : i32}
-// CHECK:             aie.use_lock(%[[LOCK_1_2]], Release, 1)
-// CHECK:             aie.next_bd ^bb1
-// CHECK:           ^bb3:
-// CHECK:             aie.end
-// CHECK:           }
-// CHECK:           %[[MEM_3_3:.*]] = aie.mem(%[[TILE_3_3]]) {
-// CHECK:             %[[VAL_1:.*]] = aie.dma_start(S2MM, 0, ^bb1, ^bb3)
-// CHECK:           ^bb1:
-// CHECK:             aie.use_lock(%[[LOCK_3_3]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[BUFFER_3_3]] : memref<16xi32>) {len = 16 : i32}
-// CHECK:             aie.use_lock(%[[LOCK_3_3_3]], Release, 1)
-// CHECK:             aie.next_bd ^bb2
-// CHECK:           ^bb2:
-// CHECK:             aie.use_lock(%[[LOCK_3_3]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[BUFFER_3_3_2]] : memref<16xi32>) {len = 16 : i32}
-// CHECK:             aie.use_lock(%[[LOCK_3_3_3]], Release, 1)
-// CHECK:             aie.next_bd ^bb1
-// CHECK:           ^bb3:
-// CHECK:             aie.end
-// CHECK:           }
-// CHECK:         }
-module @non_adjacency {
-  aie.device(npu1_4col) {
-    %tile12 = aie.tile(1, 2)
-    %tile33 = aie.tile(3, 3)
-    aie.flow(%tile12, DMA : 0, %tile33, DMA : 0) {symbol = @objfifo}
-    aie.objectfifo @objfifo (%tile12, {%tile33}, 2 : i32) : !aie.objectfifo<memref<16xi32>>
-    func.func @some_work(%lineOut : memref<16xi32>) -> () {
-      return
-    }
-    %core12 = aie.core(%tile12) {
-      %c0 = arith.constant 0 : index
-      %c2 = arith.constant 2 : index
-      %height = arith.constant 12 : index
-      scf.for %indexInHeight = %c0 to %height step %c2 {
-        %subview = aie.objectfifo.acquire @objfifo (Produce, 1) : !aie.objectfifosubview<memref<16xi32>>
-        %elem0 = aie.objectfifo.subview.access %subview[0] : !aie.objectfifosubview<memref<16xi32>> -> memref<16xi32>
-        func.call @some_work(%elem0) : (memref<16xi32>) -> ()
-        aie.objectfifo.release @objfifo (Produce, 1)
-        %subview1 = aie.objectfifo.acquire @objfifo (Produce, 1) : !aie.objectfifosubview<memref<16xi32>>
-        %elem1 = aie.objectfifo.subview.access %subview1[0] : !aie.objectfifosubview<memref<16xi32>> -> memref<16xi32>
-        func.call @some_work(%elem1) : (memref<16xi32>) -> ()
-        aie.objectfifo.release @objfifo (Produce, 1)
-      }
-      aie.end
-    }
-    %core33 = aie.core(%tile33) {
-      %c0 = arith.constant 0 : index
-      %c4 = arith.constant 4 : index
-      %height = arith.constant 12 : index
-      scf.for %indexInHeight = %c0 to %height step %c4 {
-        %subview = aie.objectfifo.acquire @objfifo (Consume, 2) : !aie.objectfifosubview<memref<16xi32>>
-        %elem0 = aie.objectfifo.subview.access %subview[0] : !aie.objectfifosubview<memref<16xi32>> -> memref<16xi32>
-        %elem1 = aie.objectfifo.subview.access %subview[1] : !aie.objectfifosubview<memref<16xi32>> -> memref<16xi32>
-        func.call @some_work(%elem0) : (memref<16xi32>) -> ()
-        aie.objectfifo.release @objfifo (Consume, 2)
-        %subview1 = aie.objectfifo.acquire @objfifo (Consume, 2) : !aie.objectfifosubview<memref<16xi32>>
-        %elem3 = aie.objectfifo.subview.access %subview1[0] : !aie.objectfifosubview<memref<16xi32>> -> memref<16xi32>
-        %elem4 = aie.objectfifo.subview.access %subview1[1] : !aie.objectfifosubview<memref<16xi32>> -> memref<16xi32>
-        func.call @some_work(%elem3) : (memref<16xi32>) -> ()
-        aie.objectfifo.release @objfifo (Consume, 2)
-        %subview2 = aie.objectfifo.acquire @objfifo (Consume, 2) : !aie.objectfifosubview<memref<16xi32>>
-        %elem6 = aie.objectfifo.subview.access %subview2[0] : !aie.objectfifosubview<memref<16xi32>> -> memref<16xi32>
-        %elem7 = aie.objectfifo.subview.access %subview2[1] : !aie.objectfifosubview<memref<16xi32>> -> memref<16xi32>
-        func.call @some_work(%elem6) : (memref<16xi32>) -> ()
-        aie.objectfifo.release @objfifo (Consume, 2)
-        %subview3 = aie.objectfifo.acquire @objfifo (Consume, 2) : !aie.objectfifosubview<memref<16xi32>>
-        %elem9 = aie.objectfifo.subview.access %subview3[0] : !aie.objectfifosubview<memref<16xi32>> -> memref<16xi32>
-        %elem10 = aie.objectfifo.subview.access %subview3[1] : !aie.objectfifosubview<memref<16xi32>> -> memref<16xi32>
-        func.call @some_work(%elem9) : (memref<16xi32>) -> ()
-        aie.objectfifo.release @objfifo (Consume, 2)
-      }
-      aie.end
-    }
-  }
-}
diff --git a/compiler/plugins/target/AMD-AIE/aie/test/non_adjacency_test_AIE2.mlir b/compiler/plugins/target/AMD-AIE/aie/test/non_adjacency_test_AIE2.mlir
deleted file mode 100644
index 7ae508421..000000000
--- a/compiler/plugins/target/AMD-AIE/aie/test/non_adjacency_test_AIE2.mlir
+++ /dev/null
@@ -1,122 +0,0 @@
-
-// RUN: iree-opt --amdaie-objectFifo-stateful-transform %s | FileCheck %s
-
-// CHECK-LABEL:   aie.device(xcve2302) {
-// CHECK:           memref.global "public" @of : memref<16xi32>
-// CHECK-DAG:       %[[TILE_1_2:.*]] = aie.tile(1, 2)
-// CHECK-DAG:       %[[TILE_3_3:.*]] = aie.tile(3, 3)
-// CHECK-DAG:       %[[OF_CONS_BUFF_0:.*]] = aie.buffer(%[[TILE_3_3]]) {sym_name = "of_cons_buff_0_0"} : memref<16xi32>
-// CHECK-DAG:       %[[OF_CONS_BUFF_1:.*]] = aie.buffer(%[[TILE_3_3]]) {sym_name = "of_cons_buff_0_1"} : memref<16xi32>
-// CHECK-DAG:       %[[OF_CONS_PROD_LOCK:.*]] = aie.lock(%[[TILE_3_3]]) {init = 2 : i8, sym_name = "of_cons_prod_lock_0"}
-// CHECK-DAG:       %[[OF_CONS_CONS_LOCK:.*]] = aie.lock(%[[TILE_3_3]]) {init = 0 : i8, sym_name = "of_cons_cons_lock_0"}
-// CHECK-DAG:       %[[OF_BUFF_0:.*]] = aie.buffer(%[[TILE_1_2]]) {sym_name = "of_prod_buff_0_0"} : memref<16xi32>
-// CHECK-DAG:       %[[OF_BUFF_1:.*]] = aie.buffer(%[[TILE_1_2]]) {sym_name = "of_prod_buff_0_1"} : memref<16xi32>
-// CHECK-DAG:       %[[OF_PROD_LOCK:.*]] = aie.lock(%[[TILE_1_2]]) {init = 2 : i8, sym_name = "of_prod_prod_lock_0"}
-// CHECK-DAG:       %[[OF_CONS_LOCK:.*]] = aie.lock(%[[TILE_1_2]]) {init = 0 : i8, sym_name = "of_prod_cons_lock_0"}
-// CHECK-DAG:       aie.flow(%[[TILE_1_2]], DMA : 0, %[[TILE_3_3]], DMA : 0)
-// CHECK:           func.func @some_work(%[[ARG0:.*]]: memref<16xi32>) {
-// CHECK:             return
-// CHECK:           }
-// CHECK:           %[[CORE_1_2:.*]] = aie.core(%[[TILE_1_2]]) {
-// CHECK-DAG:         %[[C0:.*]] = arith.constant 0 : index
-// CHECK-DAG:         %[[C2:.*]] = arith.constant 2 : index
-// CHECK-DAG:         %[[C12:.*]] = arith.constant 12 : index
-// CHECK:             scf.for %[[ARG0:.*]] = %[[C0]] to %[[C12]] step %[[C2]] {
-// CHECK:               aie.use_lock(%[[OF_PROD_LOCK]], AcquireGreaterEqual, 1)
-// CHECK:               func.call @some_work(%[[OF_BUFF_0]]) : (memref<16xi32>) -> ()
-// CHECK:               aie.use_lock(%[[OF_CONS_LOCK]], Release, 1)
-// CHECK:               aie.use_lock(%[[OF_PROD_LOCK]], AcquireGreaterEqual, 1)
-// CHECK:               func.call @some_work(%[[OF_BUFF_1]]) : (memref<16xi32>) -> ()
-// CHECK:               aie.use_lock(%[[OF_CONS_LOCK]], Release, 1)
-// CHECK:             }
-// CHECK:             aie.end
-// CHECK:           }
-// CHECK:           %[[CORE_3_3:.*]] = aie.core(%[[TILE_3_3]]) {
-// CHECK-DAG:         %[[C0:.*]] = arith.constant 0 : index
-// CHECK-DAG:         %[[C2:.*]] = arith.constant 2 : index
-// CHECK-DAG:         %[[C12:.*]] = arith.constant 12 : index
-// CHECK:             scf.for %[[ARG0:.*]] = %[[C0]] to %[[C12]] step %[[C2]] {
-// CHECK:               aie.use_lock(%[[OF_CONS_CONS_LOCK]], AcquireGreaterEqual, 1)
-// CHECK:               func.call @some_work(%[[OF_CONS_BUFF_0]]) : (memref<16xi32>) -> ()
-// CHECK:               aie.use_lock(%[[OF_CONS_PROD_LOCK]], Release, 1)
-// CHECK:               aie.use_lock(%[[OF_CONS_CONS_LOCK]], AcquireGreaterEqual, 1)
-// CHECK:               func.call @some_work(%[[OF_CONS_BUFF_1]]) : (memref<16xi32>) -> ()
-// CHECK:               aie.use_lock(%[[OF_CONS_PROD_LOCK]], Release, 1)
-// CHECK:             }
-// CHECK:             aie.end
-// CHECK:           }
-// CHECK:           %[[MEM_1_2:.*]] = aie.mem(%[[TILE_1_2]]) {
-// CHECK:             %[[VAL_0:.*]] = aie.dma_start(MM2S, 0, ^bb1, ^bb3)
-// CHECK:           ^bb1:
-// CHECK:             aie.use_lock(%[[OF_CONS_LOCK]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[OF_BUFF_0]] : memref<16xi32>) {len = 16 : i32}
-// CHECK:             aie.use_lock(%[[OF_PROD_LOCK]], Release, 1)
-// CHECK:             aie.next_bd ^bb2
-// CHECK:           ^bb2:
-// CHECK:             aie.use_lock(%[[OF_CONS_LOCK]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[OF_BUFF_1]] : memref<16xi32>) {len = 16 : i32}
-// CHECK:             aie.use_lock(%[[OF_PROD_LOCK]], Release, 1)
-// CHECK:             aie.next_bd ^bb1
-// CHECK:           ^bb3:
-// CHECK:             aie.end
-// CHECK:           }
-// CHECK:           %[[MEM_3_3:.*]] = aie.mem(%[[TILE_3_3]]) {
-// CHECK:             %[[VAL_1:.*]] = aie.dma_start(S2MM, 0, ^bb1, ^bb3)
-// CHECK:           ^bb1:
-// CHECK:             aie.use_lock(%[[OF_CONS_PROD_LOCK]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[OF_CONS_BUFF_0]] : memref<16xi32>) {len = 16 : i32}
-// CHECK:             aie.use_lock(%[[OF_CONS_CONS_LOCK]], Release, 1)
-// CHECK:             aie.next_bd ^bb2
-// CHECK:           ^bb2:
-// CHECK:             aie.use_lock(%[[OF_CONS_PROD_LOCK]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[OF_CONS_BUFF_1]] : memref<16xi32>) {len = 16 : i32}
-// CHECK:             aie.use_lock(%[[OF_CONS_CONS_LOCK]], Release, 1)
-// CHECK:             aie.next_bd ^bb1
-// CHECK:           ^bb3:
-// CHECK:             aie.end
-// CHECK:           }
-// CHECK:         }
-
-module @non_adjacency_AIE2 {
-    aie.device(xcve2302) {
-        %tile12 = aie.tile(1, 2)
-        %tile33 = aie.tile(3, 3)
-        aie.flow(%tile12, DMA : 0, %tile33, DMA : 0) {symbol = @of}
-        aie.objectfifo @of (%tile12, {%tile33}, 2 : i32) : !aie.objectfifo<memref<16xi32>>
-        func.func @some_work(%lineOut : memref<16xi32>) -> () {
-            return
-        }
-        %core12 = aie.core(%tile12) {
-            %c0 = arith.constant 0 : index
-            %c2 = arith.constant 2 : index
-            %height = arith.constant 12 : index
-            scf.for %indexInHeight = %c0 to %height step %c2 {
-                %subview = aie.objectfifo.acquire @of (Produce, 1) : !aie.objectfifosubview<memref<16xi32>>
-                %elem0 = aie.objectfifo.subview.access %subview[0] : !aie.objectfifosubview<memref<16xi32>> -> memref<16xi32>
-                func.call @some_work(%elem0) : (memref<16xi32>) -> ()
-                aie.objectfifo.release @of (Produce, 1)
-                %subview1 = aie.objectfifo.acquire @of (Produce, 1) : !aie.objectfifosubview<memref<16xi32>>
-                %elem1 = aie.objectfifo.subview.access %subview1[0] : !aie.objectfifosubview<memref<16xi32>> -> memref<16xi32>
-                func.call @some_work(%elem1) : (memref<16xi32>) -> ()
-                aie.objectfifo.release @of (Produce, 1)
-            }
-            aie.end
-        }
-        %core33 = aie.core(%tile33) {
-            %c0 = arith.constant 0 : index
-            %c2 = arith.constant 2 : index
-            %height = arith.constant 12 : index
-            scf.for %indexInHeight = %c0 to %height step %c2 {
-                %subview = aie.objectfifo.acquire @of (Consume, 1) : !aie.objectfifosubview<memref<16xi32>>
-                %elem0 = aie.objectfifo.subview.access %subview[0] : !aie.objectfifosubview<memref<16xi32>> -> memref<16xi32>
-                func.call @some_work(%elem0) : (memref<16xi32>) -> ()
-                aie.objectfifo.release @of (Consume, 1)
-                %subview1 = aie.objectfifo.acquire @of (Consume, 1) : !aie.objectfifosubview<memref<16xi32>>
-                %elem1 = aie.objectfifo.subview.access %subview1[0] : !aie.objectfifosubview<memref<16xi32>> -> memref<16xi32>
-                func.call @some_work(%elem1) : (memref<16xi32>) -> ()
-                aie.objectfifo.release @of (Consume, 1)
-            }
-            aie.end
-        }
-    }
-}
diff --git a/compiler/plugins/target/AMD-AIE/aie/test/register_external_buffers_test.mlir b/compiler/plugins/target/AMD-AIE/aie/test/register_external_buffers_test.mlir
deleted file mode 100644
index 05a734695..000000000
--- a/compiler/plugins/target/AMD-AIE/aie/test/register_external_buffers_test.mlir
+++ /dev/null
@@ -1,75 +0,0 @@
-
-// RUN: iree-opt --amdaie-objectFifo-stateful-transform %s | FileCheck %s
-
-// CHECK-LABEL:   aie.device(npu1_4col) {
-// CHECK:           memref.global "public" @ext_of : memref<16xi32>
-// CHECK-DAG:       %[[TILE_3_2:.*]] = aie.tile(3, 2)
-// CHECK-DAG:       %[[TILE_3_0:.*]] = aie.tile(3, 0)
-// CHECK-DAG:       %[[LOCK_3_0:.*]] = aie.lock(%[[TILE_3_0]]) {init = 0 : i8, sym_name = "ext_of_prod_prod_lock_0"}
-// CHECK-DAG:       %[[LOCK_3_0_0:.*]] = aie.lock(%[[TILE_3_0]]) {init = 0 : i8, sym_name = "ext_of_prod_cons_lock_0"}
-// CHECK-DAG:       %[[BUFFER_3_2:.*]] = aie.buffer(%[[TILE_3_2]]) {sym_name = "ext_of_cons_buff_0_0"} : memref<16xi32>
-// CHECK-DAG:       %[[BUFFER_3_2_1:.*]] = aie.buffer(%[[TILE_3_2]]) {sym_name = "ext_of_cons_buff_0_1"} : memref<16xi32>
-// CHECK-DAG:       %[[BUFFER_3_2_2:.*]] = aie.buffer(%[[TILE_3_2]]) {sym_name = "ext_of_cons_buff_0_2"} : memref<16xi32>
-// CHECK-DAG:       %[[LOCK_3_2:.*]] = aie.lock(%[[TILE_3_2]]) {init = 3 : i8, sym_name = "ext_of_cons_prod_lock_0"}
-// CHECK-DAG:       %[[LOCK_3_2_3:.*]] = aie.lock(%[[TILE_3_2]]) {init = 0 : i8, sym_name = "ext_of_cons_cons_lock_0"}
-// CHECK-DAG:       aie.flow(%[[TILE_3_0]], DMA : 0, %[[TILE_3_2]], DMA : 0) {symbol = @ext_of}
-// CHECK:           %[[VAL_0:.*]] = aie.external_buffer {sym_name = "ext_buffer_in"} : memref<64xi32>
-// CHECK:           aie.objectfifo.register_external_buffers @ext_of(%[[TILE_3_0]], {%[[VAL_0]]}) : (memref<64xi32>)
-// CHECK:           func.func @some_work(%[[ARG0:.*]]: memref<16xi32>, %[[ARG1:.*]]: memref<16xi32>) {
-// CHECK:             return
-// CHECK:           }
-// CHECK:           aie.shim_dma_allocation @ext_of(MM2S, 0, 3)
-// CHECK:           %[[CORE_3_2:.*]] = aie.core(%[[TILE_3_2]]) {
-// CHECK:             %[[C0:.*]] = arith.constant 0 : index
-// CHECK:             %[[C1:.*]] = arith.constant 1 : index
-// CHECK:             %[[C12:.*]] = arith.constant 12 : index
-// CHECK:             aie.use_lock(%[[LOCK_3_2_3]], AcquireGreaterEqual, 3)
-// CHECK:             func.call @some_work(%[[BUFFER_3_2]], %[[BUFFER_3_2_1]]) : (memref<16xi32>, memref<16xi32>) -> ()
-// CHECK:             aie.use_lock(%[[LOCK_3_2]], Release, 3)
-// CHECK:             aie.end
-// CHECK:           }
-// CHECK:           %[[MEM_3_2:.*]] = aie.mem(%[[TILE_3_2]]) {
-// CHECK:             %[[VAL_1:.*]] = aie.dma_start(S2MM, 0, ^bb1, ^bb4)
-// CHECK:           ^bb1:
-// CHECK:             aie.use_lock(%[[LOCK_3_2]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[BUFFER_3_2]] : memref<16xi32>) {len = 16 : i32}
-// CHECK:             aie.use_lock(%[[LOCK_3_2_3]], Release, 1)
-// CHECK:             aie.next_bd ^bb2
-// CHECK:           ^bb2:
-// CHECK:             aie.use_lock(%[[LOCK_3_2]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[BUFFER_3_2_1]] : memref<16xi32>) {len = 16 : i32}
-// CHECK:             aie.use_lock(%[[LOCK_3_2_3]], Release, 1)
-// CHECK:             aie.next_bd ^bb3
-// CHECK:           ^bb3:
-// CHECK:             aie.use_lock(%[[LOCK_3_2]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[BUFFER_3_2_2]] : memref<16xi32>) {len = 16 : i32}
-// CHECK:             aie.use_lock(%[[LOCK_3_2_3]], Release, 1)
-// CHECK:             aie.next_bd ^bb1
-// CHECK:           ^bb4:
-// CHECK:             aie.end
-// CHECK:           }
-// CHECK:         }
-module @register_external_buffers {
-  aie.device(npu1_4col) {
-    %tile32 = aie.tile(3, 2)
-    %tile30 = aie.tile(3, 0)
-    aie.flow(%tile30, DMA : 0, %tile32, DMA : 0) {symbol = @ext_of}
-    aie.objectfifo @ext_of (%tile30, {%tile32}, 3 : i32) : !aie.objectfifo<memref<16xi32>>
-    %ext_buffer_in = aie.external_buffer {sym_name = "ext_buffer_in"}: memref<64xi32>
-    aie.objectfifo.register_external_buffers @ext_of (%tile30, {%ext_buffer_in}) : (memref<64xi32>)
-    func.func @some_work(%a : memref<16xi32>, %b : memref<16xi32>) -> () {
-      return
-    }
-    %core71 = aie.core(%tile32) {
-      %c0 = arith.constant 0 : index
-      %c1 = arith.constant 1 : index
-      %height = arith.constant 12 : index
-      %subview = aie.objectfifo.acquire @ext_of (Consume, 3) : !aie.objectfifosubview<memref<16xi32>>
-      %elem0 = aie.objectfifo.subview.access %subview[0] : !aie.objectfifosubview<memref<16xi32>> -> memref<16xi32>
-      %elem1 = aie.objectfifo.subview.access %subview[1] : !aie.objectfifosubview<memref<16xi32>> -> memref<16xi32>
-      func.call @some_work(%elem0, %elem1) : (memref<16xi32>, memref<16xi32>) -> ()
-      aie.objectfifo.release @ext_of (Consume, 3)
-      aie.end
-    }
-  }
-}
diff --git a/compiler/plugins/target/AMD-AIE/aie/test/same_core_producer_consumer_test.mlir b/compiler/plugins/target/AMD-AIE/aie/test/same_core_producer_consumer_test.mlir
deleted file mode 100644
index 3bec7a260..000000000
--- a/compiler/plugins/target/AMD-AIE/aie/test/same_core_producer_consumer_test.mlir
+++ /dev/null
@@ -1,103 +0,0 @@
-
-// RUN: iree-opt --amdaie-objectFifo-stateful-transform %s | FileCheck %s
-
-// CHECK-LABEL:   aie.device(xcve2302) {
-// CHECK:           memref.global "public" @of : memref<16xi32>
-// CHECK-DAG:       %[[TILE_1_2:.*]] = aie.tile(1, 2)
-// CHECK-DAG:       %[[BUFFER_1_2:.*]] = aie.buffer(%[[TILE_1_2]]) {sym_name = "of_prod_buff_0_0"} : memref<16xi32>
-// CHECK-DAG:       %[[BUFFER_1_2_0:.*]] = aie.buffer(%[[TILE_1_2]]) {sym_name = "of_prod_buff_0_1"} : memref<16xi32>
-// CHECK-DAG:       %[[BUFFER_1_2_1:.*]] = aie.buffer(%[[TILE_1_2]]) {sym_name = "of_prod_buff_0_2"} : memref<16xi32>
-// CHECK-DAG:       %[[LOCK_1_2:.*]] = aie.lock(%[[TILE_1_2]]) {init = 3 : i8, sym_name = "of_prod_prod_lock_0"}
-// CHECK-DAG:       %[[LOCK_1_2_2:.*]] = aie.lock(%[[TILE_1_2]]) {init = 0 : i8, sym_name = "of_prod_cons_lock_0"}
-// CHECK-DAG:       %[[BUFFER_1_2_3:.*]] = aie.buffer(%[[TILE_1_2]]) {sym_name = "of_cons_buff_0_0"} : memref<16xi32>
-// CHECK-DAG:       %[[BUFFER_1_2_4:.*]] = aie.buffer(%[[TILE_1_2]]) {sym_name = "of_cons_buff_0_1"} : memref<16xi32>
-// CHECK-DAG:       %[[BUFFER_1_2_5:.*]] = aie.buffer(%[[TILE_1_2]]) {sym_name = "of_cons_buff_0_2"} : memref<16xi32>
-// CHECK-DAG:       %[[LOCK_1_2_6:.*]] = aie.lock(%[[TILE_1_2]]) {init = 3 : i8, sym_name = "of_cons_prod_lock_0"}
-// CHECK-DAG:       %[[LOCK_1_2_7:.*]] = aie.lock(%[[TILE_1_2]]) {init = 0 : i8, sym_name = "of_cons_cons_lock_0"}
-// CHECK-DAG:       aie.flow(%[[TILE_1_2]], DMA : 0, %[[TILE_1_2]], DMA : 0) {symbol = @of}
-// CHECK:           func.func @some_work(%[[ARG0:.*]]: memref<16xi32>) {
-// CHECK:             return
-// CHECK:           }
-// CHECK:           %[[CORE_1_2:.*]] = aie.core(%[[TILE_1_2]]) {
-// CHECK:             aie.use_lock(%[[LOCK_1_2]], AcquireGreaterEqual, 1)
-// CHECK:             func.call @some_work(%[[BUFFER_1_2]]) : (memref<16xi32>) -> ()
-// CHECK:             aie.use_lock(%[[LOCK_1_2_2]], Release, 1)
-// CHECK:             aie.use_lock(%[[LOCK_1_2_7]], AcquireGreaterEqual, 1)
-// CHECK:             func.call @some_work(%[[BUFFER_1_2_4]]) : (memref<16xi32>) -> ()
-// CHECK:             aie.use_lock(%[[LOCK_1_2_6]], Release, 1)
-// CHECK:             aie.use_lock(%[[LOCK_1_2]], AcquireGreaterEqual, 1)
-// CHECK:             func.call @some_work(%[[BUFFER_1_2_1]]) : (memref<16xi32>) -> ()
-// CHECK:             aie.use_lock(%[[LOCK_1_2_2]], Release, 1)
-// CHECK:             aie.use_lock(%[[LOCK_1_2_7]], AcquireGreaterEqual, 1)
-// CHECK:             func.call @some_work(%[[BUFFER_1_2_3]]) : (memref<16xi32>) -> ()
-// CHECK:             aie.use_lock(%[[LOCK_1_2_6]], Release, 1)
-// CHECK:             aie.end
-// CHECK:           }
-// CHECK:           %[[MEM_1_2:.*]] = aie.mem(%[[TILE_1_2]]) {
-// CHECK:             %[[VAL_0:.*]] = aie.dma_start(MM2S, 0, ^bb1, ^bb4)
-// CHECK:           ^bb1:
-// CHECK:             aie.use_lock(%[[LOCK_1_2_2]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[BUFFER_1_2]] : memref<16xi32>) {len = 16 : i32}
-// CHECK:             aie.use_lock(%[[LOCK_1_2]], Release, 1)
-// CHECK:             aie.next_bd ^bb2
-// CHECK:           ^bb2:
-// CHECK:             aie.use_lock(%[[LOCK_1_2_2]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[BUFFER_1_2_0]] : memref<16xi32>) {len = 16 : i32}
-// CHECK:             aie.use_lock(%[[LOCK_1_2]], Release, 1)
-// CHECK:             aie.next_bd ^bb3
-// CHECK:           ^bb3:
-// CHECK:             aie.use_lock(%[[LOCK_1_2_2]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[BUFFER_1_2_1]] : memref<16xi32>) {len = 16 : i32}
-// CHECK:             aie.use_lock(%[[LOCK_1_2]], Release, 1)
-// CHECK:             aie.next_bd ^bb1
-// CHECK:           ^bb4:
-// CHECK:             %[[VAL_1:.*]] = aie.dma_start(S2MM, 0, ^bb5, ^bb8)
-// CHECK:           ^bb5:
-// CHECK:             aie.use_lock(%[[LOCK_1_2_6]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[BUFFER_1_2_3]] : memref<16xi32>) {len = 16 : i32}
-// CHECK:             aie.use_lock(%[[LOCK_1_2_7]], Release, 1)
-// CHECK:             aie.next_bd ^bb6
-// CHECK:           ^bb6:
-// CHECK:             aie.use_lock(%[[LOCK_1_2_6]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[BUFFER_1_2_4]] : memref<16xi32>) {len = 16 : i32}
-// CHECK:             aie.use_lock(%[[LOCK_1_2_7]], Release, 1)
-// CHECK:             aie.next_bd ^bb7
-// CHECK:           ^bb7:
-// CHECK:             aie.use_lock(%[[LOCK_1_2_6]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[BUFFER_1_2_5]] : memref<16xi32>) {len = 16 : i32}
-// CHECK:             aie.use_lock(%[[LOCK_1_2_7]], Release, 1)
-// CHECK:             aie.next_bd ^bb5
-// CHECK:           ^bb8:
-// CHECK:             aie.end
-// CHECK:           }
-// CHECK:         }
-module @same_core {
-  aie.device(xcve2302) {
-    %tile12 = aie.tile(1, 2)
-    aie.flow(%tile12, DMA : 0, %tile12, DMA : 0) {symbol = @of}
-    aie.objectfifo @of (%tile12, {%tile12}, 3 : i32) : !aie.objectfifo<memref<16xi32>>
-    func.func @some_work(%line_in:memref<16xi32>) -> () {
-      return
-    }
-    %core12 = aie.core(%tile12) {
-      // this acquires 2 elements
-      %subview0 = aie.objectfifo.acquire @of (Produce, 1) : !aie.objectfifosubview<memref<16xi32>>
-      %elem00 = aie.objectfifo.subview.access %subview0[0] : !aie.objectfifosubview<memref<16xi32>> -> memref<16xi32>
-      func.call @some_work(%elem00) : (memref<16xi32>) -> ()
-      aie.objectfifo.release @of (Produce, 1)
-      %subview1 = aie.objectfifo.acquire @of (Consume, 1) : !aie.objectfifosubview<memref<16xi32>>
-      %elem10 = aie.objectfifo.subview.access %subview1[0] : !aie.objectfifosubview<memref<16xi32>> -> memref<16xi32>
-      func.call @some_work(%elem10) : (memref<16xi32>) -> ()
-      aie.objectfifo.release @of (Consume, 1)
-      %subview2 = aie.objectfifo.acquire @of (Produce, 1) : !aie.objectfifosubview<memref<16xi32>>
-      %elem20 = aie.objectfifo.subview.access %subview2[0] : !aie.objectfifosubview<memref<16xi32>> -> memref<16xi32>
-      func.call @some_work(%elem20) : (memref<16xi32>) -> ()
-      aie.objectfifo.release @of (Produce, 1)
-      %subview3 = aie.objectfifo.acquire @of (Consume, 1) : !aie.objectfifosubview<memref<16xi32>>
-      %elem30 = aie.objectfifo.subview.access %subview3[0] : !aie.objectfifosubview<memref<16xi32>> -> memref<16xi32>
-      func.call @some_work(%elem30) : (memref<16xi32>) -> ()
-      aie.objectfifo.release @of (Consume, 1)
-      aie.end
-    }
-  }
-}
diff --git a/compiler/plugins/target/AMD-AIE/aie/test/shimRow_mem_test.mlir b/compiler/plugins/target/AMD-AIE/aie/test/shimRow_mem_test.mlir
deleted file mode 100644
index 3d636db89..000000000
--- a/compiler/plugins/target/AMD-AIE/aie/test/shimRow_mem_test.mlir
+++ /dev/null
@@ -1,75 +0,0 @@
-
-// RUN: iree-opt --amdaie-objectFifo-stateful-transform %s | FileCheck %s
-
-// CHECK-LABEL:   aie.device(npu1_4col) {
-// CHECK:           memref.global "public" @objfifo : memref<16xi32>
-// CHECK-DAG:       %[[TILE_3_2:.*]] = aie.tile(3, 2)
-// CHECK-DAG:       %[[TILE_3_0:.*]] = aie.tile(3, 0)
-// CHECK-DAG:       %[[LOCK_3_0:.*]] = aie.lock(%[[TILE_3_0]]) {init = 0 : i8, sym_name = "objfifo_prod_prod_lock_0"}
-// CHECK-DAG:       %[[LOCK_3_0_0:.*]] = aie.lock(%[[TILE_3_0]]) {init = 0 : i8, sym_name = "objfifo_prod_cons_lock_0"}
-// CHECK-DAG:       %[[BUFFER_3_2:.*]] = aie.buffer(%[[TILE_3_2]]) {sym_name = "objfifo_cons_buff_0_0"} : memref<16xi32>
-// CHECK-DAG:       %[[BUFFER_3_2_1:.*]] = aie.buffer(%[[TILE_3_2]]) {sym_name = "objfifo_cons_buff_0_1"} : memref<16xi32>
-// CHECK-DAG:       %[[BUFFER_3_2_2:.*]] = aie.buffer(%[[TILE_3_2]]) {sym_name = "objfifo_cons_buff_0_2"} : memref<16xi32>
-// CHECK-DAG:       %[[LOCK_3_2:.*]] = aie.lock(%[[TILE_3_2]]) {init = 3 : i8, sym_name = "objfifo_cons_prod_lock_0"}
-// CHECK-DAG:       %[[LOCK_3_2_3:.*]] = aie.lock(%[[TILE_3_2]]) {init = 0 : i8, sym_name = "objfifo_cons_cons_lock_0"}
-// CHECK-DAG:       aie.flow(%[[TILE_3_0]], DMA : 0, %[[TILE_3_2]], DMA : 0) {symbol = @objfifo}
-// CHECK-DAG:       %[[VAL_0:.*]] = aie.external_buffer {sym_name = "ext_buffer_in"} : memref<64xi32>
-// CHECK-DAG:       aie.objectfifo.register_external_buffers @objfifo(%[[TILE_3_0]], {%[[VAL_0]]}) : (memref<64xi32>)
-// CHECK:           func.func @some_work(%[[ARG0:.*]]: memref<16xi32>, %[[ARG1:.*]]: memref<16xi32>) {
-// CHECK:             return
-// CHECK:           }
-// CHECK:           aie.shim_dma_allocation @objfifo(MM2S, 0, 3)
-// CHECK:           %[[CORE_3_2:.*]] = aie.core(%[[TILE_3_2]]) {
-// CHECK:             %[[C0:.*]] = arith.constant 0 : index
-// CHECK:             %[[C1:.*]] = arith.constant 1 : index
-// CHECK:             %[[C12:.*]] = arith.constant 12 : index
-// CHECK:             aie.use_lock(%[[LOCK_3_2_3]], AcquireGreaterEqual, 1)
-// CHECK:             func.call @some_work(%[[BUFFER_3_2]], %[[BUFFER_3_2_1]]) : (memref<16xi32>, memref<16xi32>) -> ()
-// CHECK:             aie.use_lock(%[[LOCK_3_2]], Release, 1)
-// CHECK:             aie.end
-// CHECK:           }
-// CHECK:           %[[MEM_3_2:.*]] = aie.mem(%[[TILE_3_2]]) {
-// CHECK:             %[[VAL_1:.*]] = aie.dma_start(S2MM, 0, ^bb1, ^bb4)
-// CHECK:           ^bb1:
-// CHECK:             aie.use_lock(%[[LOCK_3_2]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[BUFFER_3_2]] : memref<16xi32>) {len = 16 : i32}
-// CHECK:             aie.use_lock(%[[LOCK_3_2_3]], Release, 1)
-// CHECK:             aie.next_bd ^bb2
-// CHECK:           ^bb2:
-// CHECK:             aie.use_lock(%[[LOCK_3_2]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[BUFFER_3_2_1]] : memref<16xi32>) {len = 16 : i32}
-// CHECK:             aie.use_lock(%[[LOCK_3_2_3]], Release, 1)
-// CHECK:             aie.next_bd ^bb3
-// CHECK:           ^bb3:
-// CHECK:             aie.use_lock(%[[LOCK_3_2]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[BUFFER_3_2_2]] : memref<16xi32>) {len = 16 : i32}
-// CHECK:             aie.use_lock(%[[LOCK_3_2_3]], Release, 1)
-// CHECK:             aie.next_bd ^bb1
-// CHECK:           ^bb4:
-// CHECK:             aie.end
-// CHECK:           }
-// CHECK:         }
-module @shimRow_mem {
-  aie.device(npu1_4col) {
-    %tile32 = aie.tile(3, 2)
-    %tile30 = aie.tile(3, 0)
-    aie.flow(%tile30, DMA : 0, %tile32, DMA : 0) {symbol = @objfifo}
-    aie.objectfifo @objfifo (%tile30, {%tile32}, 3 : i32) : !aie.objectfifo<memref<16xi32>>
-    %ext_buffer_in  = aie.external_buffer {sym_name = "ext_buffer_in"}: memref<64xi32>
-    aie.objectfifo.register_external_buffers @objfifo (%tile30, {%ext_buffer_in}) : (memref<64xi32>)
-    func.func @some_work(%a : memref<16xi32>, %b : memref<16xi32>) -> () {
-      return
-    }
-    %core71 = aie.core(%tile32) {
-      %c0 = arith.constant 0 : index
-      %c1 = arith.constant 1 : index
-      %height = arith.constant 12 : index
-      %subview = aie.objectfifo.acquire @objfifo (Consume, 1) : !aie.objectfifosubview<memref<16xi32>>
-      %elem0 = aie.objectfifo.subview.access %subview[0] : !aie.objectfifosubview<memref<16xi32>> -> memref<16xi32>
-      %elem1 = aie.objectfifo.subview.access %subview[1] : !aie.objectfifosubview<memref<16xi32>> -> memref<16xi32>
-      func.call @some_work(%elem0, %elem1) : (memref<16xi32>, memref<16xi32>) -> ()
-      aie.objectfifo.release @objfifo (Consume, 1)
-      aie.end
-    }
-  }
-}
diff --git a/compiler/plugins/target/AMD-AIE/aie/test/shim_AIE2_test.mlir b/compiler/plugins/target/AMD-AIE/aie/test/shim_AIE2_test.mlir
deleted file mode 100644
index 7047e8b69..000000000
--- a/compiler/plugins/target/AMD-AIE/aie/test/shim_AIE2_test.mlir
+++ /dev/null
@@ -1,68 +0,0 @@
-
-// RUN: iree-opt --amdaie-objectFifo-stateful-transform %s | FileCheck %s
-
-// CHECK-LABEL:   aie.device(xcve2302) {
-// CHECK:           memref.global "public" @of_out : memref<16xi32>
-// CHECK:           memref.global "public" @of_in : memref<16xi32>
-// CHECK-DAG:       %[[TILE_2_2:.*]] = aie.tile(2, 2)
-// CHECK-DAG:       %[[TILE_2_0:.*]] = aie.tile(2, 0)
-// CHECK-DAG:       %[[OF_OUT_CONS_PROD_LOCK:.*]] = aie.lock(%[[TILE_2_0]]) {init = 0 : i8, sym_name = "of_out_cons_prod_lock_0"}
-// CHECK-DAG:       %[[OF_OUT_CONS_CONS_LOCK:.*]] = aie.lock(%[[TILE_2_0]]) {init = 0 : i8, sym_name = "of_out_cons_cons_lock_0"}
-// CHECK-DAG:       %[[OF_OUT_BUFF_0:.*]] = aie.buffer(%[[TILE_2_2]]) {sym_name = "of_out_prod_buff_0_0"} : memref<16xi32>
-// CHECK-DAG:       %[[OF_OUT_BUFF_1:.*]] = aie.buffer(%[[TILE_2_2]]) {sym_name = "of_out_prod_buff_0_1"} : memref<16xi32>
-// CHECK-DAG:       %[[OF_OUT_PROD_LOCK:.*]] = aie.lock(%[[TILE_2_2]]) {init = 2 : i8, sym_name = "of_out_prod_prod_lock_0"}
-// CHECK-DAG:       %[[OF_OUT_CONS_LOCK:.*]] = aie.lock(%[[TILE_2_2]]) {init = 0 : i8, sym_name = "of_out_prod_cons_lock_0"}
-// CHECK-DAG:       %[[OF_IN_CONS_BUFF_0:.*]] = aie.buffer(%[[TILE_2_2]]) {sym_name = "of_in_cons_buff_0_0"} : memref<16xi32>
-// CHECK-DAG:       %[[OF_IN_CONS_BUFF_1:.*]] = aie.buffer(%[[TILE_2_2]]) {sym_name = "of_in_cons_buff_0_1"} : memref<16xi32>
-// CHECK-DAG:       %[[OF_IN_CONS_PROD_LOCK:.*]] = aie.lock(%[[TILE_2_2]]) {init = 2 : i8, sym_name = "of_in_cons_prod_lock_0"}
-// CHECK-DAG:       %[[OF_IN_CONS_CONS_LOCK:.*]] = aie.lock(%[[TILE_2_2]]) {init = 0 : i8, sym_name = "of_in_cons_cons_lock_0"}
-// CHECK-DAG:       %[[OF_IN_PROD_LOCK:.*]] = aie.lock(%[[TILE_2_0]]) {init = 0 : i8, sym_name = "of_in_prod_prod_lock_0"}
-// CHECK-DAG:       %[[OF_IN_CONS_LOCK:.*]] = aie.lock(%[[TILE_2_0]]) {init = 0 : i8, sym_name = "of_in_prod_cons_lock_0"}
-// CHECK-DAG:       aie.flow(%[[TILE_2_0]], DMA : 0, %[[TILE_2_2]], DMA : 0)
-// CHECK-DAG:       aie.flow(%[[TILE_2_2]], DMA : 0, %[[TILE_2_0]], DMA : 0)
-// CHECK-DAG:       %[[EXT_BUFFER_IN:.*]] = aie.external_buffer {sym_name = "ext_buffer_in"} : memref<64xi32>
-// CHECK-DAG:       %[[EXT_BUFFER_OUT:.*]] = aie.external_buffer {sym_name = "ext_buffer_out"} : memref<64xi32>
-// CHECK-DAG:       aie.shim_dma_allocation @of_in(MM2S, 0, 2)
-// CHECK-DAG:       aie.shim_dma_allocation @of_out(S2MM, 0, 2)
-// CHECK:           %[[MEM_2_2:.*]] = aie.mem(%[[TILE_2_2]]) {
-// CHECK:             %[[VAL_0:.*]] = aie.dma_start(S2MM, 0, ^bb1, ^bb3)
-// CHECK:           ^bb1:
-// CHECK:             aie.use_lock(%[[OF_IN_CONS_PROD_LOCK]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[OF_IN_CONS_BUFF_0]] : memref<16xi32>) {len = 16 : i32}
-// CHECK:             aie.use_lock(%[[OF_IN_CONS_CONS_LOCK]], Release, 1)
-// CHECK:             aie.next_bd ^bb2
-// CHECK:           ^bb2:
-// CHECK:             aie.use_lock(%[[OF_IN_CONS_PROD_LOCK]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[OF_IN_CONS_BUFF_1]] : memref<16xi32>) {len = 16 : i32}
-// CHECK:             aie.use_lock(%[[OF_IN_CONS_CONS_LOCK]], Release, 1)
-// CHECK:             aie.next_bd ^bb1
-// CHECK:           ^bb3:
-// CHECK:             %[[VAL_1:.*]] = aie.dma_start(MM2S, 0, ^bb4, ^bb6)
-// CHECK:           ^bb4:
-// CHECK:             aie.use_lock(%[[OF_OUT_CONS_LOCK]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[OF_OUT_BUFF_0]] : memref<16xi32>) {len = 16 : i32}
-// CHECK:             aie.use_lock(%[[OF_OUT_PROD_LOCK]], Release, 1)
-// CHECK:             aie.next_bd ^bb5
-// CHECK:           ^bb5:
-// CHECK:             aie.use_lock(%[[OF_OUT_CONS_LOCK]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[OF_OUT_BUFF_1]] : memref<16xi32>) {len = 16 : i32}
-// CHECK:             aie.use_lock(%[[OF_OUT_PROD_LOCK]], Release, 1)
-// CHECK:             aie.next_bd ^bb4
-// CHECK:           ^bb6:
-// CHECK:             aie.end
-// CHECK:           }
-// CHECK:         }
-module @shim_AIE2 {
-   aie.device(xcve2302) {
-      %tile22 = aie.tile(2, 2)
-      %tile20 = aie.tile(2, 0)
-      aie.flow(%tile20, DMA : 0, %tile22, DMA : 0) {symbol = @of_in}
-      aie.flow(%tile22, DMA : 0, %tile20, DMA : 0) {symbol = @of_out}
-      aie.objectfifo @of_in (%tile20, {%tile22}, 2 : i32) : !aie.objectfifo<memref<16xi32>>
-      aie.objectfifo @of_out (%tile22, {%tile20}, 2 : i32) : !aie.objectfifo<memref<16xi32>>
-      %ext_buffer_in  = aie.external_buffer {sym_name = "ext_buffer_in"}: memref<64xi32>
-      %ext_buffer_out  = aie.external_buffer {sym_name = "ext_buffer_out"}: memref<64xi32>
-      aie.objectfifo.register_external_buffers @of_in (%tile20, {%ext_buffer_in}) : (memref<64xi32>)
-      aie.objectfifo.register_external_buffers @of_out (%tile20, {%ext_buffer_out}) : (memref<64xi32>)
-   }
-}
diff --git a/compiler/plugins/target/AMD-AIE/aie/test/shim_broadcast_test.mlir b/compiler/plugins/target/AMD-AIE/aie/test/shim_broadcast_test.mlir
deleted file mode 100644
index 8d0179c40..000000000
--- a/compiler/plugins/target/AMD-AIE/aie/test/shim_broadcast_test.mlir
+++ /dev/null
@@ -1,88 +0,0 @@
-
-// RUN: iree-opt --amdaie-objectFifo-stateful-transform %s | FileCheck %s
-
-// CHECK-LABEL:   aie.device(xcve2302) {
-// CHECK:           memref.global "public" @of_in : memref<16xi32>
-// CHECK-DAG:       %[[TILE_2_0:.*]] = aie.tile(2, 0)
-// CHECK-DAG:       %[[TILE_2_2:.*]] = aie.tile(2, 2)
-// CHECK-DAG:       %[[TILE_2_3:.*]] = aie.tile(2, 3)
-// CHECK-DAG:       %[[TILE_3_3:.*]] = aie.tile(3, 3)
-// CHECK-DAG:       %[[OF_IN_0_CONS_BUFF_0:.*]] = aie.buffer(%[[TILE_2_2]]) {sym_name = "of_in_cons_buff_0_0"} : memref<16xi32>
-// CHECK-DAG:       %[[OF_IN_0_CONS_BUFF_1:.*]] = aie.buffer(%[[TILE_2_2]]) {sym_name = "of_in_cons_buff_0_1"} : memref<16xi32>
-// CHECK-DAG:       %[[OF_IN_0_CONS_PROD_LOCK:.*]] = aie.lock(%[[TILE_2_2]]) {init = 2 : i8, sym_name = "of_in_cons_prod_lock_0"}
-// CHECK-DAG:       %[[OF_IN_0_CONS_CONS_LOCK:.*]] = aie.lock(%[[TILE_2_2]]) {init = 0 : i8, sym_name = "of_in_cons_cons_lock_0"}
-// CHECK-DAG:       %[[OF_IN_1_CONS_BUFF_0:.*]] = aie.buffer(%[[TILE_2_3]]) {sym_name = "of_in_cons_buff_1_0"} : memref<16xi32>
-// CHECK-DAG:       %[[OF_IN_1_CONS_BUFF_1:.*]] = aie.buffer(%[[TILE_2_3]]) {sym_name = "of_in_cons_buff_1_1"} : memref<16xi32>
-// CHECK-DAG:       %[[OF_IN_1_CONS_PROD_LOCK:.*]] = aie.lock(%[[TILE_2_3]]) {init = 2 : i8, sym_name = "of_in_cons_prod_lock_1"}
-// CHECK-DAG:       %[[OF_IN_1_CONS_CONS_LOCK:.*]] = aie.lock(%[[TILE_2_3]]) {init = 0 : i8, sym_name = "of_in_cons_cons_lock_1"}
-// CHECK-DAG:       %[[OF_IN_2_CONS_BUFF_0:.*]] = aie.buffer(%[[TILE_3_3]]) {sym_name = "of_in_cons_buff_2_0"} : memref<16xi32>
-// CHECK-DAG:       %[[OF_IN_2_CONS_BUFF_1:.*]] = aie.buffer(%[[TILE_3_3]]) {sym_name = "of_in_cons_buff_2_1"} : memref<16xi32>
-// CHECK-DAG:       %[[OF_IN_2_CONS_PROD_LOCK:.*]] = aie.lock(%[[TILE_3_3]]) {init = 2 : i8, sym_name = "of_in_cons_prod_lock_2"}
-// CHECK-DAG:       %[[OF_IN_2_CONS_CONS_LOCK:.*]] = aie.lock(%[[TILE_3_3]]) {init = 0 : i8, sym_name = "of_in_cons_cons_lock_2"}
-// CHECK-DAG:       %[[OF_IN_PROD_LOCK:.*]] = aie.lock(%[[TILE_2_0]]) {init = 0 : i8, sym_name = "of_in_prod_prod_lock_0"}
-// CHECK-DAG:       %[[OF_IN_CONS_LOCK:.*]] = aie.lock(%[[TILE_2_0]]) {init = 0 : i8, sym_name = "of_in_prod_cons_lock_0"}
-// CHECK-DAG:       aie.flow(%[[TILE_2_0]], DMA : 0, %[[TILE_3_3]], DMA : 0)
-// CHECK-DAG:       aie.flow(%[[TILE_2_0]], DMA : 0, %[[TILE_2_3]], DMA : 0)
-// CHECK-DAG:       aie.flow(%[[TILE_2_0]], DMA : 0, %[[TILE_2_2]], DMA : 0)
-// CHECK-DAG:       %[[EXT_BUFFER_IN:.*]] = aie.external_buffer {sym_name = "ext_buffer_in"} : memref<64xi32>
-// CHECK-DAG:       aie.shim_dma_allocation @of_in(MM2S, 0, 2)
-// CHECK:           %[[MEM_2_2:.*]] = aie.mem(%[[TILE_2_2]]) {
-// CHECK:             %[[VAL_0:.*]] = aie.dma_start(S2MM, 0, ^bb1, ^bb3)
-// CHECK:           ^bb1:
-// CHECK:             aie.use_lock(%[[OF_IN_0_CONS_PROD_LOCK]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[OF_IN_0_CONS_BUFF_0]] : memref<16xi32>) {len = 16 : i32}
-// CHECK:             aie.use_lock(%[[OF_IN_0_CONS_CONS_LOCK]], Release, 1)
-// CHECK:             aie.next_bd ^bb2
-// CHECK:           ^bb2:
-// CHECK:             aie.use_lock(%[[OF_IN_0_CONS_PROD_LOCK]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[OF_IN_0_CONS_BUFF_1]] : memref<16xi32>) {len = 16 : i32}
-// CHECK:             aie.use_lock(%[[OF_IN_0_CONS_CONS_LOCK]], Release, 1)
-// CHECK:             aie.next_bd ^bb1
-// CHECK:           ^bb3:
-// CHECK:             aie.end
-// CHECK:           }
-// CHECK:           %[[MEM_2_3:.*]] = aie.mem(%[[TILE_2_3]]) {
-// CHECK:             %[[VAL_1:.*]] = aie.dma_start(S2MM, 0, ^bb1, ^bb3)
-// CHECK:           ^bb1:
-// CHECK:             aie.use_lock(%[[OF_IN_1_CONS_PROD_LOCK]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[OF_IN_1_CONS_BUFF_0]] : memref<16xi32>) {len = 16 : i32}
-// CHECK:             aie.use_lock(%[[OF_IN_1_CONS_CONS_LOCK]], Release, 1)
-// CHECK:             aie.next_bd ^bb2
-// CHECK:           ^bb2:
-// CHECK:             aie.use_lock(%[[OF_IN_1_CONS_PROD_LOCK]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[OF_IN_1_CONS_BUFF_1]] : memref<16xi32>) {len = 16 : i32}
-// CHECK:             aie.use_lock(%[[OF_IN_1_CONS_CONS_LOCK]], Release, 1)
-// CHECK:             aie.next_bd ^bb1
-// CHECK:           ^bb3:
-// CHECK:             aie.end
-// CHECK:           }
-// CHECK:           %[[MEM_3_3:.*]] = aie.mem(%[[TILE_3_3]]) {
-// CHECK:             %[[VAL_2:.*]] = aie.dma_start(S2MM, 0, ^bb1, ^bb3)
-// CHECK:           ^bb1:
-// CHECK:             aie.use_lock(%[[OF_IN_2_CONS_PROD_LOCK]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[OF_IN_2_CONS_BUFF_0]] : memref<16xi32>) {len = 16 : i32}
-// CHECK:             aie.use_lock(%[[OF_IN_2_CONS_CONS_LOCK]], Release, 1)
-// CHECK:             aie.next_bd ^bb2
-// CHECK:           ^bb2:
-// CHECK:             aie.use_lock(%[[OF_IN_2_CONS_PROD_LOCK]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[OF_IN_2_CONS_BUFF_1]] : memref<16xi32>) {len = 16 : i32}
-// CHECK:             aie.use_lock(%[[OF_IN_2_CONS_CONS_LOCK]], Release, 1)
-// CHECK:             aie.next_bd ^bb1
-// CHECK:           ^bb3:
-// CHECK:             aie.end
-// CHECK:           }
-// CHECK:         }
-module @shim_broadcast {
-   aie.device(xcve2302) {
-      %tile20 = aie.tile(2, 0)
-      %tile22 = aie.tile(2, 2)
-      %tile23 = aie.tile(2, 3)
-      %tile33 = aie.tile(3, 3)
-      aie.flow(%tile20, DMA : 0, %tile33, DMA : 0) {symbol = @of_in}
-      aie.flow(%tile20, DMA : 0, %tile23, DMA : 0) {symbol = @of_in}
-      aie.flow(%tile20, DMA : 0, %tile22, DMA : 0) {symbol = @of_in}
-      aie.objectfifo @of_in (%tile20, {%tile22, %tile23, %tile33}, 2 : i32) : !aie.objectfifo<memref<16xi32>>
-      %ext_buffer_in  = aie.external_buffer {sym_name = "ext_buffer_in"}: memref<64xi32>
-      aie.objectfifo.register_external_buffers @of_in (%tile20, {%ext_buffer_in}) : (memref<64xi32>)
-   }
-}
diff --git a/compiler/plugins/target/AMD-AIE/aie/test/subview_test_1.mlir b/compiler/plugins/target/AMD-AIE/aie/test/subview_test_1.mlir
deleted file mode 100644
index 44ef00a9a..000000000
--- a/compiler/plugins/target/AMD-AIE/aie/test/subview_test_1.mlir
+++ /dev/null
@@ -1,132 +0,0 @@
-
-// RUN: iree-opt --amdaie-objectFifo-stateful-transform %s | FileCheck %s
-
-// CHECK-LABEL:   aie.device(npu1_4col) {
-// CHECK:           memref.global "public" @objfifo : memref<16xi32>
-// CHECK-DAG:       %[[TILE_1_2:.*]] = aie.tile(1, 2)
-// CHECK-DAG:       %[[TILE_1_3:.*]] = aie.tile(1, 3)
-// CHECK-DAG:       %[[BUFFER_1_2:.*]] = aie.buffer(%[[TILE_1_2]]) {sym_name = "objfifo_prod_buff_0_0"} : memref<16xi32>
-// CHECK-DAG:       %[[BUFFER_1_2_0:.*]] = aie.buffer(%[[TILE_1_2]]) {sym_name = "objfifo_prod_buff_0_1"} : memref<16xi32>
-// CHECK-DAG:       %[[BUFFER_1_2_1:.*]] = aie.buffer(%[[TILE_1_2]]) {sym_name = "objfifo_prod_buff_0_2"} : memref<16xi32>
-// CHECK-DAG:       %[[BUFFER_1_2_2:.*]] = aie.buffer(%[[TILE_1_2]]) {sym_name = "objfifo_prod_buff_0_3"} : memref<16xi32>
-// CHECK-DAG:       %[[LOCK_1_2:.*]] = aie.lock(%[[TILE_1_2]]) {init = 4 : i8, sym_name = "objfifo_prod_prod_lock_0"}
-// CHECK-DAG:       %[[LOCK_1_2_3:.*]] = aie.lock(%[[TILE_1_2]]) {init = 0 : i8, sym_name = "objfifo_prod_cons_lock_0"}
-// CHECK-DAG:       %[[BUFFER_1_3:.*]] = aie.buffer(%[[TILE_1_3]]) {sym_name = "objfifo_cons_buff_0_0"} : memref<16xi32>
-// CHECK-DAG:       %[[BUFFER_1_3_4:.*]] = aie.buffer(%[[TILE_1_3]]) {sym_name = "objfifo_cons_buff_0_1"} : memref<16xi32>
-// CHECK-DAG:       %[[BUFFER_1_3_5:.*]] = aie.buffer(%[[TILE_1_3]]) {sym_name = "objfifo_cons_buff_0_2"} : memref<16xi32>
-// CHECK-DAG:       %[[BUFFER_1_3_6:.*]] = aie.buffer(%[[TILE_1_3]]) {sym_name = "objfifo_cons_buff_0_3"} : memref<16xi32>
-// CHECK-DAG:       %[[LOCK_1_3:.*]] = aie.lock(%[[TILE_1_3]]) {init = 4 : i8, sym_name = "objfifo_cons_prod_lock_0"}
-// CHECK-DAG:       %[[LOCK_1_3_7:.*]] = aie.lock(%[[TILE_1_3]]) {init = 0 : i8, sym_name = "objfifo_cons_cons_lock_0"}
-// CHECK-DAG:       aie.flow(%[[TILE_1_2]], DMA : 0, %[[TILE_1_3]], DMA : 0) {symbol = @objfifo}
-// CHECK:           func.func @some_work(%[[ARG0:.*]]: memref<16xi32>) {
-// CHECK:             return
-// CHECK:           }
-// CHECK:           %[[CORE_1_2:.*]] = aie.core(%[[TILE_1_2]]) {
-// CHECK:             aie.use_lock(%[[LOCK_1_2]], AcquireGreaterEqual, 3)
-// CHECK:             func.call @some_work(%[[BUFFER_1_2]]) : (memref<16xi32>) -> ()
-// CHECK:             func.call @some_work(%[[BUFFER_1_2_0]]) : (memref<16xi32>) -> ()
-// CHECK:             func.call @some_work(%[[BUFFER_1_2_1]]) : (memref<16xi32>) -> ()
-// CHECK:             aie.use_lock(%[[LOCK_1_2]], AcquireGreaterEqual, 1)
-// CHECK:             func.call @some_work(%[[BUFFER_1_2_2]]) : (memref<16xi32>) -> ()
-// CHECK:             aie.use_lock(%[[LOCK_1_2_3]], Release, 3)
-// CHECK:             aie.use_lock(%[[LOCK_1_2_3]], Release, 1)
-// CHECK:             aie.use_lock(%[[LOCK_1_2]], AcquireGreaterEqual, 2)
-// CHECK:             func.call @some_work(%[[BUFFER_1_2]]) : (memref<16xi32>) -> ()
-// CHECK:             func.call @some_work(%[[BUFFER_1_2_0]]) : (memref<16xi32>) -> ()
-// CHECK:             aie.use_lock(%[[LOCK_1_2]], AcquireGreaterEqual, 2)
-// CHECK:             func.call @some_work(%[[BUFFER_1_2_1]]) : (memref<16xi32>) -> ()
-// CHECK:             func.call @some_work(%[[BUFFER_1_2_2]]) : (memref<16xi32>) -> ()
-// CHECK:             aie.end
-// CHECK:           }
-// CHECK:           %[[MEM_1_2:.*]] = aie.mem(%[[TILE_1_2]]) {
-// CHECK:             %[[VAL_0:.*]] = aie.dma_start(MM2S, 0, ^bb1, ^bb5)
-// CHECK:           ^bb1:
-// CHECK:             aie.use_lock(%[[LOCK_1_2_3]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[BUFFER_1_2]] : memref<16xi32>) {len = 16 : i32}
-// CHECK:             aie.use_lock(%[[LOCK_1_2]], Release, 1)
-// CHECK:             aie.next_bd ^bb2
-// CHECK:           ^bb2:
-// CHECK:             aie.use_lock(%[[LOCK_1_2_3]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[BUFFER_1_2_0]] : memref<16xi32>) {len = 16 : i32}
-// CHECK:             aie.use_lock(%[[LOCK_1_2]], Release, 1)
-// CHECK:             aie.next_bd ^bb3
-// CHECK:           ^bb3:
-// CHECK:             aie.use_lock(%[[LOCK_1_2_3]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[BUFFER_1_2_1]] : memref<16xi32>) {len = 16 : i32}
-// CHECK:             aie.use_lock(%[[LOCK_1_2]], Release, 1)
-// CHECK:             aie.next_bd ^bb4
-// CHECK:           ^bb4:
-// CHECK:             aie.use_lock(%[[LOCK_1_2_3]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[BUFFER_1_2_2]] : memref<16xi32>) {len = 16 : i32}
-// CHECK:             aie.use_lock(%[[LOCK_1_2]], Release, 1)
-// CHECK:             aie.next_bd ^bb1
-// CHECK:           ^bb5:
-// CHECK:             aie.end
-// CHECK:           }
-// CHECK:           %[[MEM_1_3:.*]] = aie.mem(%[[TILE_1_3]]) {
-// CHECK:             %[[VAL_1:.*]] = aie.dma_start(S2MM, 0, ^bb1, ^bb5)
-// CHECK:           ^bb1:
-// CHECK:             aie.use_lock(%[[LOCK_1_3]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[BUFFER_1_3]] : memref<16xi32>) {len = 16 : i32}
-// CHECK:             aie.use_lock(%[[LOCK_1_3_7]], Release, 1)
-// CHECK:             aie.next_bd ^bb2
-// CHECK:           ^bb2:
-// CHECK:             aie.use_lock(%[[LOCK_1_3]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[BUFFER_1_3_4]] : memref<16xi32>) {len = 16 : i32}
-// CHECK:             aie.use_lock(%[[LOCK_1_3_7]], Release, 1)
-// CHECK:             aie.next_bd ^bb3
-// CHECK:           ^bb3:
-// CHECK:             aie.use_lock(%[[LOCK_1_3]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[BUFFER_1_3_5]] : memref<16xi32>) {len = 16 : i32}
-// CHECK:             aie.use_lock(%[[LOCK_1_3_7]], Release, 1)
-// CHECK:             aie.next_bd ^bb4
-// CHECK:           ^bb4:
-// CHECK:             aie.use_lock(%[[LOCK_1_3]], AcquireGreaterEqual, 1)
-// CHECK:             aie.dma_bd(%[[BUFFER_1_3_6]] : memref<16xi32>) {len = 16 : i32}
-// CHECK:             aie.use_lock(%[[LOCK_1_3_7]], Release, 1)
-// CHECK:             aie.next_bd ^bb1
-// CHECK:           ^bb5:
-// CHECK:             aie.end
-// CHECK:           }
-// CHECK:         }
-module @singleFifo {
-    aie.device(npu1_4col) {
-        %tile12 = aie.tile(1, 2)
-        %tile13 = aie.tile(1, 3)
-        aie.flow(%tile12, DMA : 0, %tile13, DMA : 0) {symbol = @objfifo}
-        aie.objectfifo @objfifo (%tile12, {%tile13}, 4 : i32) : !aie.objectfifo<memref<16xi32>>
-        func.func @some_work(%line_in:memref<16xi32>) -> () {
-            return
-        }
-        %core12 = aie.core(%tile12) {
-            // this acquires 2 elements
-            %subview0 = aie.objectfifo.acquire @objfifo (Produce, 3) : !aie.objectfifosubview<memref<16xi32>>
-            %elem00 = aie.objectfifo.subview.access %subview0[0] : !aie.objectfifosubview<memref<16xi32>> -> memref<16xi32>
-            %elem01 = aie.objectfifo.subview.access %subview0[1] : !aie.objectfifosubview<memref<16xi32>> -> memref<16xi32>
-            %elem02 = aie.objectfifo.subview.access %subview0[2] : !aie.objectfifosubview<memref<16xi32>> -> memref<16xi32>
-            func.call @some_work(%elem00) : (memref<16xi32>) -> ()
-            func.call @some_work(%elem01) : (memref<16xi32>) -> ()
-            func.call @some_work(%elem02) : (memref<16xi32>) -> ()
-            // this should only acquire one new element, previous two are still acquired
-            %subview1 = aie.objectfifo.acquire @objfifo (Produce, 1) : !aie.objectfifosubview<memref<16xi32>>
-            %elem10 = aie.objectfifo.subview.access %subview1[0] : !aie.objectfifosubview<memref<16xi32>> -> memref<16xi32>
-            func.call @some_work(%elem10) : (memref<16xi32>) -> ()
-            // one new acquire should take place
-            aie.objectfifo.release @objfifo (Produce, 3)
-            aie.objectfifo.release @objfifo (Produce, 1)
-            %subview2 = aie.objectfifo.acquire @objfifo (Produce, 2) : !aie.objectfifosubview<memref<16xi32>>
-            %elem20 = aie.objectfifo.subview.access %subview2[0] : !aie.objectfifosubview<memref<16xi32>> -> memref<16xi32>
-            %elem21 = aie.objectfifo.subview.access %subview2[1] : !aie.objectfifosubview<memref<16xi32>> -> memref<16xi32>
-            func.call @some_work(%elem20) : (memref<16xi32>) -> ()
-            func.call @some_work(%elem21) : (memref<16xi32>) -> ()
-            // no new acquires should take place, elem30 should be third element of objFifo (with index 2)
-            %subview3 = aie.objectfifo.acquire @objfifo (Produce, 2) : !aie.objectfifosubview<memref<16xi32>>
-            %elem30 = aie.objectfifo.subview.access %subview3[0] : !aie.objectfifosubview<memref<16xi32>> -> memref<16xi32>
-            %elem31 = aie.objectfifo.subview.access %subview3[1] : !aie.objectfifosubview<memref<16xi32>> -> memref<16xi32>
-            //%elem32 = aie.subview.access %subview3[2] : !aie.subview<memref<16xi32>> -> memref<16xi32> // expected to fail if this line is uncommented
-            func.call @some_work(%elem30) : (memref<16xi32>) -> ()
-            func.call @some_work(%elem31) : (memref<16xi32>) -> ()
-            aie.end
-        }
-    }
-}
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEAttrs.td b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEAttrs.td
index 5a63a7a2a..651384926 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEAttrs.td
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEAttrs.td
@@ -21,6 +21,17 @@ def AMDAIE_CopyOpOperateOn: I32EnumAttr<"CopyOpOperateOn",
 {
 }
 
+def AMDAIE_LockAction: I32EnumAttr<"LockAction",
+  "The action to be performed on a lock",
+  [
+    I32EnumAttrCase<"Acquire", 0>,
+    I32EnumAttrCase<"AcquireGreaterOrEqual", 1>,
+    I32EnumAttrCase<"Release", 2> 
+  ]
+  > {
+  let cppNamespace = "mlir::iree_compiler::AMDAIE";
+}
+
 def LogicalObjectFifoPort: I32EnumAttr<"LogicalObjectFifoPort",
   "The logical objectfifo ports.",
   [
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.cpp
index da869deed..78d4d05ed 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.cpp
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.cpp
@@ -50,6 +50,12 @@ void ChannelOp::getAsmResultNames(
   setNameFn(getResult(), "channel");
 }
 
+TileOp ChannelOp::getTileOp() {
+  auto res = dyn_cast_if_present<TileOp>(getTile().getDefiningOp());
+  assert(res && "`amdaie.channel` expects an `amdaie.tile` as tile operand");
+  return res;
+}
+
 //===----------------------------------------------------------------------===//
 // AMDAIE_ControlCodeOp
 //===----------------------------------------------------------------------===//
@@ -101,7 +107,9 @@ LogicalResult CoreOp::verify() {
 }
 
 TileOp CoreOp::getTileOp() {
-  return dyn_cast_if_present<TileOp>(getTile().getDefiningOp());
+  auto res = dyn_cast_if_present<TileOp>(getTile().getDefiningOp());
+  assert(res && "`amdaie.core` expects an `amdaie.tile` as tile operand");
+  return res;
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.td b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.td
index 7bdfabb7e..271b596b3 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.td
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.td
@@ -229,6 +229,40 @@ def AMDAIE_LockOp: AMDAIE_Op<"lock", [
   let assemblyFormat = [{ `(` $tile `(` $value `)` (`,` $init_value^)? `)` attr-dict }];
 }
 
+def AMDAIE_UseLockOp: AMDAIE_Op<"use_lock"> {
+  let summary = "Represents the use of a semaphore lock with a specified "
+                "action (acquire/release).";
+  let description = [{
+    This operation represents the use of a semaphore lock with a specified lock 
+    `action` and `value`. The lock action could for example be `Acquire`, 
+    `AcquireGreaterOrEqual` or `Release`. The specified `value` argument 
+    determines the value to be used in the lock action, for example:
+    - `Acquire(1)`: Acquire the lock if its value is equal to 1, then subtract 1
+      from it.
+    - `AcquireGreaterOrEqual(1)`: Acquire the lock if its value is greater or 
+      equal to 1, then subtract 1 from it.
+    - `Release(1)`: Add 1 to the value of this lock.
+    
+
+    Example:
+
+    ```mlir
+    %lock = amdaie.lock(%tile(0))
+    amdaie.use_lock(%lock, AcquireGreaterOrEqual(1))
+    ```
+  }];
+
+  let arguments = (
+    ins Index:$lock,
+        AMDAIE_LockAction:$action,
+        I8Attr:$value
+  );
+
+  let assemblyFormat = [{ 
+    `(` $lock `,` $action `(` $value `)` `)` attr-dict
+  }];
+}
+
 //===----------------------------------------------------------------------===//
 // IREE AMDAIE DMA Utility Ops
 //===----------------------------------------------------------------------===//
@@ -294,6 +328,10 @@ def AMDAIE_ChannelOp: AMDAIE_Op<"channel", [
         ConfinedAttr<I8Attr, [IntMinValue<0>]>:$value
   );
 
+  let extraClassDeclaration = [{
+    TileOp getTileOp();
+  }];
+
   let assemblyFormat = [{ `(` $tile `,` $value `)` attr-dict }];
 }
 
@@ -876,7 +914,7 @@ def AMDAIE_LogicalObjectFifoFromBuffersOp
       return cast<LogicalObjectFifoType>(getOutput().getType())
         .getElementType();
     }
-
+  
     // Return the encapsulated buffers on the requested tile.
     llvm::SmallVector<BufferOp> getBuffersOnTile(TileOp tileOp);
 
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/PluginRegistration.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/PluginRegistration.cpp
index 50d72b077..39c044d59 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/PluginRegistration.cpp
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/PluginRegistration.cpp
@@ -27,11 +27,9 @@ struct AMDAIESession
     AMDAIE::registerAMDAIEPasses();
     AMDAIE::registerAMDAIEAssignBufferAddressesBasic();
     AMDAIE::registerAMDAIEAssignBufferDescriptorIDs();
-    AMDAIE::registerAMDAIEAssignLockIDs();
     AMDAIE::registerAMDAIECoreToStandard();
     AMDAIE::registerAMDAIELocalizeLocks();
     AMDAIE::registerAMDAIENormalizeAddressSpaces();
-    AMDAIE::registerAMDAIEObjectFifoStatefulTransform();
     AMDAIE::registerAMDAIERoutePathfinderFlows();
     AMDAIE::registerAMDAIEDmaToNpu();
     AMDAIE::registerAIRConversionPasses();
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEAcquireReleaseToUseLock.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEAcquireReleaseToUseLock.cpp
new file mode 100644
index 000000000..652cb2efb
--- /dev/null
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEAcquireReleaseToUseLock.cpp
@@ -0,0 +1,234 @@
+// Copyright 2024 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include <numeric>
+
+#include "iree-amd-aie/IR/AMDAIEOps.h"
+#include "iree-amd-aie/Transforms/Passes.h"
+#include "iree-amd-aie/Transforms/Transforms.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/Support/MathExtras.h"
+#include "mlir/Dialect/SCF/Transforms/Transforms.h"
+#include "mlir/Dialect/SCF/Utils/Utils.h"
+
+#define DEBUG_TYPE "iree-amdaie-acquire-release-to-use-lock"
+
+namespace mlir::iree_compiler::AMDAIE {
+
+/// Resolve the `from_buffers` logical objectFifo that acquire/release `op` acts
+/// on: the target of its DMA copy for the `Consume` port, otherwise the source.
+template <typename T>
+FailureOr<AMDAIE::LogicalObjectFifoFromBuffersOp> getLogicalObjFifoOperatedOn(
+    T op) {
+  auto dmaCopy =
+      dyn_cast_if_present<CopyOpInterface>(op.getDma().getDefiningOp());
+  if (!dmaCopy)
+    return op.emitOpError() << "should operate on a copy-like operation";
+  const bool onTarget = op.getPort() == LogicalObjectFifoPort::Consume;
+  Operation *endpointDef = onTarget ? dmaCopy.getTarget().getDefiningOp()
+                                    : dmaCopy.getSource().getDefiningOp();
+  if (auto objFifo =
+          dyn_cast_if_present<AMDAIE::LogicalObjectFifoFromBuffersOp>(
+              endpointDef))
+    return objFifo;
+  return dmaCopy.emitOpError()
+         << "should operate on an `amdaie.logicalobjectfifo.from_buffers` op";
+}
+
+/// Unroll each scf.for loop nested in the core by the least common multiple of
+/// the depths of the objFifos that are acquired directly inside its body.
+LogicalResult coreLoopUnroll(RewriterBase &rewriter, AMDAIE::CoreOp coreOp) {
+  WalkResult walkResult = coreOp.walk([&](scf::ForOp forOp) {
+    llvm::SmallDenseSet<uint8_t> objFifoDepths;
+    auto acquires = forOp.getBody()->getOps<AMDAIE::LogicalObjectFifoAcquire>();
+    for (AMDAIE::LogicalObjectFifoAcquire acquireOp : acquires) {
+      FailureOr<AMDAIE::LogicalObjectFifoFromBuffersOp> maybeObjFifo =
+          getLogicalObjFifoOperatedOn<AMDAIE::LogicalObjectFifoAcquire>(
+              acquireOp);
+      if (failed(maybeObjFifo)) return WalkResult::interrupt();
+      objFifoDepths.insert(maybeObjFifo.value().getDepth());
+    }
+    // Fold the distinct depths into their least common multiple.
+    int unrollFactor = 1;
+    for (uint8_t depth : objFifoDepths)
+      unrollFactor = std::lcm<int, int>(unrollFactor, depth);
+    if (unrollFactor <= 1) return WalkResult::advance();
+    if (failed(mlir::loopUnrollByFactor(forOp, unrollFactor))) {
+      forOp.emitOpError() << "could not be unrolled with unrollFactor: "
+                          << unrollFactor << "\n";
+      return WalkResult::interrupt();
+    }
+    return WalkResult::advance();
+  });
+  return failure(walkResult.wasInterrupted());
+}
+
+/// Pick the lock on `tileOp` to use for `lockAction` on `port` of the objFifo,
+/// which must have exactly one consumer and one producer lock on that tile.
+/// Acquires on `Consume` and releases on the other port use the consumer lock;
+/// the remaining combinations use the producer lock.
+FailureOr<AMDAIE::LockOp> getLockToBeUsed(
+    AMDAIE::LogicalObjectFifoFromBuffersOp logicalObjFifo,
+    AMDAIE::TileOp tileOp, LogicalObjectFifoPort port, LockAction lockAction) {
+  SmallVector<AMDAIE::LockOp> consLocks =
+      logicalObjFifo.getConsumerLocksOnTile(tileOp);
+  if (consLocks.size() != 1) {
+    return logicalObjFifo.emitOpError()
+           << "expected a single consumer lock for tile: "
+           << tileOp.getResult();
+  }
+  SmallVector<AMDAIE::LockOp> prodLocks =
+      logicalObjFifo.getProducerLocksOnTile(tileOp);
+  if (prodLocks.size() != 1) {
+    return logicalObjFifo.emitOpError()
+           << "expected a single producer lock for tile: "
+           << tileOp.getResult();
+  }
+  const bool isConsume = port == LogicalObjectFifoPort::Consume;
+  switch (lockAction) {
+    case LockAction::Acquire:
+    case LockAction::AcquireGreaterOrEqual:
+      return isConsume ? consLocks[0] : prodLocks[0];
+    case LockAction::Release:
+      return isConsume ? prodLocks[0] : consLocks[0];
+  }
+  return logicalObjFifo.emitOpError()
+         << "used in unsupported lock action: " << stringifyEnum(lockAction);
+}
+
+/// Convert each `amdaie.logicalobjectfifo.acquire` in `coreOp` into an
+/// `amdaie.use_lock` and rewire its access users to the tile's buffers in turn.
+LogicalResult acquireToUseLock(RewriterBase &rewriter, AMDAIE::CoreOp coreOp) {
+  OpBuilder::InsertionGuard g(rewriter);
+  AMDAIE::TileOp tileOp = coreOp.getTileOp();
+  DenseMap<AMDAIE::LogicalObjectFifoFromBuffersOp, size_t>
+      logicalObjFifoToIndex;
+  SmallVector<Operation *> toBeErased;
+  WalkResult res = coreOp.walk([&](AMDAIE::LogicalObjectFifoAcquire acqOp) {
+    LLVM_DEBUG(llvm::dbgs()
+               << "Convert acquire op: " << acqOp.getOutput() << "\n");
+    // Diagnose rather than assert, which is compiled out in NDEBUG builds.
+    std::optional<int> maybeAcqSize = acqOp.getSize();
+    if (!maybeAcqSize || maybeAcqSize.value() != 1) {
+      acqOp.emitOpError() << "currently only supports size set and equal to 1";
+      return WalkResult::interrupt();
+    }
+    int acqSize = maybeAcqSize.value();
+    FailureOr<AMDAIE::LogicalObjectFifoFromBuffersOp> maybeLogicalObjFifo =
+        getLogicalObjFifoOperatedOn<AMDAIE::LogicalObjectFifoAcquire>(acqOp);
+    if (failed(maybeLogicalObjFifo)) return WalkResult::interrupt();
+    AMDAIE::LogicalObjectFifoFromBuffersOp logicalObjFifo =
+        maybeLogicalObjFifo.value();
+    FailureOr<AMDAIE::LockOp> maybeLockOp =
+        getLockToBeUsed(logicalObjFifo, tileOp, acqOp.getPort(),
+                        LockAction::AcquireGreaterOrEqual);
+    if (failed(maybeLockOp)) return WalkResult::interrupt();
+    rewriter.setInsertionPoint(acqOp);
+    rewriter.create<AMDAIE::UseLockOp>(acqOp.getLoc(), maybeLockOp.value(),
+                                       LockAction::AcquireGreaterOrEqual,
+                                       acqSize);
+
+    // Rotate through buffers by access index; `operator[]` zero-inits new keys.
+    SmallVector<AMDAIE::BufferOp> buffers =
+        logicalObjFifo.getBuffersOnTile(tileOp);
+    size_t bufferIndex = logicalObjFifoToIndex[logicalObjFifo] % buffers.size();
+    for (Operation *userOp : acqOp->getUsers()) {
+      auto accessOp = dyn_cast<AMDAIE::LogicalObjectFifoAccessOp>(userOp);
+      if (!accessOp) {
+        acqOp.emitOpError() << "currently only supports "
+                               "`amdaie.logicalobjectfifo.access` users";
+        return WalkResult::interrupt();
+      }
+      AMDAIE::BufferOp bufferOp = buffers[bufferIndex];
+      accessOp.getResult().replaceAllUsesWith(bufferOp.getResult());
+      toBeErased.push_back(accessOp);
+    }
+    logicalObjFifoToIndex[logicalObjFifo] += acqSize;
+    toBeErased.push_back(acqOp);
+    return WalkResult::advance();
+  });
+  if (res.wasInterrupted()) return failure();
+  for (Operation *op : toBeErased) {
+    op->dropAllUses();
+    rewriter.eraseOp(op);
+  }
+  return success();
+}
+
+LogicalResult releaseToUseLock(RewriterBase &rewriter, AMDAIE::CoreOp coreOp) {
+  OpBuilder::InsertionGuard g(rewriter);
+  AMDAIE::TileOp tileOp = coreOp.getTileOp();
+  SmallVector<Operation *> toBeErased;
+  WalkResult res = coreOp.walk([&](AMDAIE::LogicalObjectFifoRelease relOp) {
+    LLVM_DEBUG(llvm::dbgs() << "Convert release op: " << relOp << "\n");
+    std::optional<int> maybeRelSize = relOp.getSize();
+    assert(maybeRelSize && maybeRelSize.value() == 1 &&
+           "logic currently only handles size set and equal to 1");
+    int relSize = maybeRelSize.value();
+    // Find the logical objectFifo that this release op operates on.
+    FailureOr<AMDAIE::LogicalObjectFifoFromBuffersOp> maybeLogicalObjFifo =
+        getLogicalObjFifoOperatedOn<AMDAIE::LogicalObjectFifoRelease>(relOp);
+    if (failed(maybeLogicalObjFifo)) return WalkResult::interrupt();
+    // Look up the lock to use for a `Release` action on this port and tile.
+    FailureOr<AMDAIE::LockOp> maybeLockOp =
+        getLockToBeUsed(maybeLogicalObjFifo.value(), tileOp, relOp.getPort(),
+                        LockAction::Release);
+    if (failed(maybeLockOp)) return WalkResult::interrupt();
+    // Insert the use_lock op before the release op; the latter is erased below.
+    rewriter.setInsertionPoint(relOp);
+    rewriter.create<AMDAIE::UseLockOp>(relOp.getLoc(), maybeLockOp.value(),
+                                       LockAction::Release, relSize);
+    toBeErased.push_back(relOp);
+    return WalkResult::advance();
+  });
+  if (res.wasInterrupted()) return failure();
+  for (Operation *op : toBeErased) {
+    op->dropAllUses();
+    rewriter.eraseOp(op);
+  }
+  return success();
+}
+
+namespace {
+
+struct AMDAIEAcquireReleaseToUseLockPass
+    : public impl::AMDAIEAcquireReleaseToUseLockBase<
+          AMDAIEAcquireReleaseToUseLockPass> {
+  void getDependentDialects(DialectRegistry &registry) const override {
+    registry.insert<AMDAIEDialect>();
+  }
+
+  void runOnOperation() override {
+    Operation *parentOp = getOperation();
+    IRRewriter rewriter(parentOp->getContext());
+
+    WalkResult res = parentOp->walk([&](AMDAIE::CoreOp coreOp) {
+      // Loops need to be unrolled based on the depths of the logical
+      // objectFifos so `amdaie.use_lock` ops can be inserted correctly for
+      // double buffering purposes, without need for a dependency on the loop
+      // induction variable.
+      if (failed(coreLoopUnroll(rewriter, coreOp))) {
+        return WalkResult::interrupt();
+      }
+      if (failed(acquireToUseLock(rewriter, coreOp))) {
+        return WalkResult::interrupt();
+      }
+      if (failed(releaseToUseLock(rewriter, coreOp))) {
+        return WalkResult::interrupt();
+      }
+      return WalkResult::advance();
+    });
+    if (res.wasInterrupted()) return signalPassFailure();
+  }
+};
+
+}  // namespace
+
+std::unique_ptr<Pass> createAMDAIEAcquireReleaseToUseLockPass() {
+  return std::make_unique<AMDAIEAcquireReleaseToUseLockPass>();
+}
+
+}  // namespace mlir::iree_compiler::AMDAIE
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIECoreLoopUnroll.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIECoreLoopUnroll.cpp
deleted file mode 100644
index b25fe3553..000000000
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIECoreLoopUnroll.cpp
+++ /dev/null
@@ -1,84 +0,0 @@
-// Copyright 2024 The IREE Authors
-//
-// Licensed under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-
-#include <numeric>
-
-#include "iree-amd-aie/IR/AMDAIEOps.h"
-#include "iree-amd-aie/Transforms/Passes.h"
-#include "iree-amd-aie/Transforms/Transforms.h"
-#include "llvm/ADT/DenseSet.h"
-#include "llvm/Support/MathExtras.h"
-#include "mlir/Dialect/SCF/Transforms/Transforms.h"
-#include "mlir/Dialect/SCF/Utils/Utils.h"
-
-#define DEBUG_TYPE "iree-amdaie-core-loop-unroll"
-
-namespace mlir::iree_compiler::AMDAIE {
-
-/// Unroll the scf.for loops inside the core operations based on the depths of
-/// the acquired objFifos.
-LogicalResult coreLoopUnroll(RewriterBase &rewriter, AMDAIE::CoreOp coreOp) {
-  WalkResult res = coreOp.walk([&](scf::ForOp forOp) {
-    llvm::SmallDenseSet<uint8_t> depths;
-    for (auto acqOp :
-         forOp.getBody()->getOps<AMDAIE::LogicalObjectFifoAcquire>()) {
-      auto copyOp =
-          dyn_cast_if_present<CopyOpInterface>(acqOp.getDma().getDefiningOp());
-      if (!copyOp) {
-        acqOp.emitOpError() << "should operate on a copy-like operation";
-        return WalkResult::interrupt();
-      }
-      auto logicalObjFifo =
-          acqOp.getPort() == LogicalObjectFifoPort::Consume
-              ? dyn_cast_if_present<AMDAIE::LogicalObjFifoOpInterface>(
-                    copyOp.getTarget().getDefiningOp())
-              : dyn_cast_if_present<AMDAIE::LogicalObjFifoOpInterface>(
-                    copyOp.getSource().getDefiningOp());
-      depths.insert(logicalObjFifo.getDepth());
-    }
-    int unrollFactor =
-        std::accumulate(depths.begin(), depths.end(), 1, std::lcm<int, int>);
-    if (unrollFactor > 1 &&
-        failed(mlir::loopUnrollByFactor(forOp, unrollFactor))) {
-      forOp.emitOpError() << "could not be unrolled with unrollFactor: "
-                          << unrollFactor << "\n";
-      return WalkResult::interrupt();
-    }
-    return WalkResult::advance();
-  });
-  if (res.wasInterrupted()) return failure();
-  return success();
-}
-
-namespace {
-
-struct AMDAIECoreLoopUnrollPass
-    : public impl::AMDAIECoreLoopUnrollBase<AMDAIECoreLoopUnrollPass> {
-  void getDependentDialects(DialectRegistry &registry) const override {
-    registry.insert<AMDAIEDialect>();
-  }
-
-  void runOnOperation() override {
-    Operation *parentOp = getOperation();
-    IRRewriter rewriter(parentOp->getContext());
-
-    WalkResult res = parentOp->walk([&](AMDAIE::CoreOp coreOp) {
-      if (failed(coreLoopUnroll(rewriter, coreOp))) {
-        return WalkResult::interrupt();
-      }
-      return WalkResult::advance();
-    });
-    if (res.wasInterrupted()) return signalPassFailure();
-  }
-};
-
-}  // namespace
-
-std::unique_ptr<Pass> createAMDAIECoreLoopUnrollPass() {
-  return std::make_unique<AMDAIECoreLoopUnrollPass>();
-}
-
-}  // namespace mlir::iree_compiler::AMDAIE
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELowerToAIE.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELowerToAIE.cpp
index 1736f0879..ad04e6ecf 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELowerToAIE.cpp
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELowerToAIE.cpp
@@ -11,13 +11,17 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "AMDAIELowerToAIE.h"
+
 #include <memory>
 #include <numeric>
 
 #include "aie/AIEDialect.h"
 #include "aie/AIEXDialect.h"
+#include "iree-amd-aie/IR/AMDAIEAttrs.h"
 #include "iree-amd-aie/IR/AMDAIEDialect.h"
 #include "iree-amd-aie/IR/AMDAIEOps.h"
+#include "iree-amd-aie/Transforms/AMDAIEDmaUtils.h"
 #include "iree-amd-aie/Transforms/AMDAIEUtils.h"
 #include "iree-amd-aie/Transforms/Passes.h"
 #include "llvm/ADT/STLExtras.h"
@@ -33,55 +37,38 @@ using namespace xilinx;
 
 namespace mlir::iree_compiler::AMDAIE {
 
-namespace {
-
-/// Utility to remap the provided operation's operands.
-void remapOperands(Operation *op, IRMapping &mapper) {
-  for (int i = 0; i < op->getNumOperands(); ++i) {
-    Value operand = op->getOperand(i);
-    if (mapper.contains(operand)) {
-      op->setOperand(i, mapper.lookup(operand));
-    }
-  }
-}
-
-/// It is dangerous to erase ops with `rewriter` without erasing them from
-/// `mapper` too, as addresses of Operations/Values can be reused, resulting in
-/// unexpected key-value pairs in `mapper`. Use this utility if `mapper` might
-/// be used after `op` is erased.
-void eraseOp(IRRewriter &rewriter, IRMapping &mapper, Operation *op) {
-  for (Value result : op->getResults()) {
-    mapper.erase(result);
-  }
-  mapper.erase(op);
-  op->dropAllUses();
-  rewriter.eraseOp(op);
-}
-
 //===----------------------------------------------------------------------===//
-// Convert amdaie.core operation to aie.core
+// AIEDeviceBuilder utilities
 //===----------------------------------------------------------------------===//
 
-/// Utility to convert vectors of `size` and `stride` into an
-/// `AIE::BDDimLayoutArrayAttr`.
-AIE::BDDimLayoutArrayAttr convertSizeStrideToBDDimLayoutArrayAttr(
-    IRRewriter &rewriter, const SmallVector<OpFoldResult> &sizes,
+AIE::BDDimLayoutArrayAttr
+AIEDeviceBuilder::convertSizeStrideToBDDimLayoutArrayAttr(
+    const SmallVector<OpFoldResult> &sizes,
     const SmallVector<OpFoldResult> &strides) {
   assert(sizes.size() == strides.size() &&
          "expected stride and size vectors of same size");
+  // Fold remaining dimensions, assuming zero offsets as offsets should be taken
+  // care of separately.
+  SmallVector<OpFoldResult> offsets(
+      strides.size(), getAsIndexOpFoldResult(rewriter.getContext(), 0));
+  SmallVector<OpFoldResult> newOffsets;
+  SmallVector<OpFoldResult> newSizes;
+  SmallVector<OpFoldResult> newStrides;
+  foldDims(offsets, sizes, strides, newOffsets, newSizes, newStrides);
+
   SmallVector<AIE::BDDimLayoutAttr, 4> bdDimLayoutAttr;
   // If the access pattern (strides/sizes) have a single dimension, make it
   // implicit with an empty `BDDimLayoutAttr` as this is what the AIE dialect
   // expects.
-  if (strides.size() == 1) {
-    std::optional<int64_t> stride = getConstantIntValue(strides[0]);
+  if (newStrides.size() == 1) {
+    std::optional<int64_t> stride = getConstantIntValue(newStrides[0]);
     if (stride && stride.value() == 1) {
       return AIE::BDDimLayoutArrayAttr::get(rewriter.getContext(),
                                             ArrayRef(bdDimLayoutAttr));
     }
   }
-  bdDimLayoutAttr.reserve(sizes.size());
-  for (auto [size, stride] : llvm::zip(sizes, strides)) {
+  bdDimLayoutAttr.reserve(newSizes.size());
+  for (auto [size, stride] : llvm::zip(newSizes, newStrides)) {
     bdDimLayoutAttr.push_back(AIE::BDDimLayoutAttr::get(
         rewriter.getContext(), getConstantIntValue(size).value(),
         getConstantIntValue(stride).value()));
@@ -90,226 +77,128 @@ AIE::BDDimLayoutArrayAttr convertSizeStrideToBDDimLayoutArrayAttr(
                                         ArrayRef(bdDimLayoutAttr));
 }
 
-/// Utility to create an `aie.objectfifo` operation from
-/// `amdaie.circular_dma_cpy_nd`.
-FailureOr<AIE::ObjectFifoCreateOp> createObjectFifo(
-    IRRewriter &rewriter, AMDAIE::ConnectionOp connectionOp, IRMapping &mapper,
-    AMDAIE::NpuCircularDmaCpyNdOp dmaOp, Value srcTile, ValueRange dstTiles,
-    StringAttr &symName) {
-  OpBuilder::InsertionGuard guard(rewriter);
-  auto sourceType =
-      cast<AMDAIE::LogicalObjectFifoType>(connectionOp.getSource().getType());
-  auto targetType =
-      cast<AMDAIE::LogicalObjectFifoType>(connectionOp.getTarget().getType());
-  uint8_t sourceMemSpace = sourceType.getMemorySpaceAsUInt();
-  uint8_t targetMemSpace = targetType.getMemorySpaceAsUInt();
-  unsigned depth;
-  unsigned sourceDepth = sourceType.getDepth();
-  unsigned targetDepth = targetType.getDepth();
-  if (sourceMemSpace == 0 && targetMemSpace == 0) {
-    return connectionOp.emitOpError()
-           << "both source and target on main memory not supported";
-  } else if (sourceMemSpace == 0) {
-    depth = targetDepth;
-  } else if (targetMemSpace == 0) {
-    depth = sourceDepth;
-  } else {
-    if (sourceDepth != targetDepth)
-      return connectionOp.emitOpError()
-             << "unsupported sourceDepth != targetDepth";
-    depth = sourceDepth;
-  }
-
-  SmallVector<AMDAIE::ChannelOp> producerChannels;
-  SmallVector<AMDAIE::ChannelOp> consumerChannels;
-  for (Value producerChannel : connectionOp.getSourceChannels()) {
-    auto channelOp =
-        dyn_cast<AMDAIE::ChannelOp>(producerChannel.getDefiningOp());
-    if (!channelOp) {
-      return connectionOp.emitOpError()
-             << "found non-`amdaie.channel` source channel";
+/// Create a new `aie.dma_start` op with a sequence of DMA BD blocks within the
+/// provided `memOp`.
+///
+/// Example of an S2MM DMA start op being created with two DMA blocks performing
+/// a circular double buffering DMA operation:
+///
+///  %memtile_dma_0_1 = aie.memtile_dma(%tile_0_1) {
+///    %0 = aie.dma_start(S2MM, 0, ^bb1, ^bb3)
+///  ^bb1:  // 2 preds: ^bb0, ^bb2
+///    aie.use_lock(%lock_0_1_51, AcquireGreaterEqual, 2)
+///    aie.dma_bd(%buffer_0_1_49 : memref<2048xi32, 1 : i32>) {len = 2048 : i32}
+///    aie.use_lock(%lock_0_1_52, Release, 2)
+///    aie.next_bd ^bb2
+///  ^bb2:  // pred: ^bb1
+///    aie.use_lock(%lock_0_1_51, AcquireGreaterEqual, 2)
+///    aie.dma_bd(%buffer_0_1_50 : memref<2048xi32, 1 : i32>) {len = 2048 : i32}
+///    aie.use_lock(%lock_0_1_52, Release, 2)
+///    aie.next_bd ^bb1
+void AIEDeviceBuilder::createDMA(
+    Operation *memOp, AIE::DMAChannelDir channelDir, int channelIndex,
+    AIE::BDDimLayoutArrayAttr dims, size_t acqNum, size_t relNum, int64_t len,
+    int64_t offset, const SmallVector<AIE::BufferOp> &bufferOps,
+    const std::pair<AIE::LockOp, AIE::LockOp> &locks) {
+  OpBuilder::InsertionGuard g(rewriter);
+  Block &endBlock = memOp->getRegion(0).getBlocks().back();
+  assert(!endBlock.getOps<AIE::EndOp>().empty() &&
+         "expected last block to have aie.end");
+  Block *lastDmaBlock = endBlock.getSinglePredecessor(),
+        *dmaBlock = rewriter.createBlock(&endBlock),
+        *bdBlock = rewriter.createBlock(&endBlock);
+
+  // Create DMA channel.
+  rewriter.setInsertionPointToStart(dmaBlock);
+  rewriter.create<AIE::DMAStartOp>(rewriter.getUnknownLoc(), channelDir,
+                                   channelIndex, /*repeatCount*/ 0, bdBlock,
+                                   &endBlock);
+  if (lastDmaBlock) lastDmaBlock->getTerminator()->setSuccessor(dmaBlock, 1);
+
+  auto createBdBlockOps = [&](AIE::BufferOp buff, Block *succ) {
+    AIE::LockOp acqLock = locks.first, relLock = locks.second;
+    rewriter.create<AIE::UseLockOp>(rewriter.getUnknownLoc(), acqLock,
+                                    AIE::LockAction::AcquireGreaterEqual,
+                                    acqNum);
+    if (!dims.getValue().empty()) {
+      rewriter.create<AIE::DMABDOp>(rewriter.getUnknownLoc(), buff, offset, len,
+                                    dims);
+    } else {
+      rewriter.create<AIE::DMABDOp>(rewriter.getUnknownLoc(), buff, offset,
+                                    len);
     }
-    producerChannels.push_back(channelOp);
-  }
-  for (Value consumerChannel : connectionOp.getTargetChannels()) {
-    auto channelOp =
-        dyn_cast<AMDAIE::ChannelOp>(consumerChannel.getDefiningOp());
-    if (!channelOp) {
-      return connectionOp.emitOpError()
-             << "found non-`amdaie.channel` source channel";
+    rewriter.create<AIE::UseLockOp>(rewriter.getUnknownLoc(), relLock,
+                                    AIE::LockAction::Release, relNum);
+    rewriter.create<AIE::NextBDOp>(rewriter.getUnknownLoc(), succ);
+  };
+
+  // Create Bd blocks.
+  Block *succ = nullptr, *curr = bdBlock;
+  for (size_t blockIndex = 0; blockIndex < bufferOps.size(); ++blockIndex) {
+    if (blockIndex == bufferOps.size() - 1) {
+      succ = bdBlock;
+    } else {
+      succ = rewriter.createBlock(&endBlock);
     }
-    consumerChannels.push_back(channelOp);
+    rewriter.setInsertionPointToStart(curr);
+    createBdBlockOps(bufferOps[blockIndex], succ);
+    curr = succ;
   }
+}
 
-  // Convert source and target sizes and strides to `BDDimLayoutArrayAttr`s,
-  // which the `aie.objectfifo` works with.
-  AIE::BDDimLayoutArrayAttr sourceDims =
-      convertSizeStrideToBDDimLayoutArrayAttr(
-          rewriter, dmaOp.getSourceMixedSizes(), dmaOp.getSourceMixedStrides());
-
-  AIE::BDDimLayoutArrayAttr layoutAttr =
-      convertSizeStrideToBDDimLayoutArrayAttr(
-          rewriter, dmaOp.getTargetMixedSizes(), dmaOp.getTargetMixedStrides());
-  // The aie.objectfifo expects a `BDDimLayoutArrayAttr` for each consumer. A
-  // single one for all consumers will error out.
-  SmallVector<AIE::BDDimLayoutArrayAttr> targetDimsVec(dstTiles.size(),
-                                                       layoutAttr);
-
-  AIE::BDDimLayoutArrayArrayAttr targetDims =
-      AIE::BDDimLayoutArrayArrayAttr::get(rewriter.getContext(),
-                                          ArrayRef(targetDimsVec));
-
-  // For now, set data type based on source and target memory space. Use
-  // L2/MemTile type if either source or target is located on L2. Otherwise, use
-  // the most local type.
-  // TODO(jornt): Not very clear and clean, but this is to mimic how AIE
-  // objectfifos are set up and it is probably better to adjust AIE objectfifos
-  // directly to make this more clean.
-  // TODO(jornt): I think objectfifos should support source type != dest type.
-  MemRefType srcType = cast<LogicalObjectFifoType>(connectionOp.getSourceType())
-                           .getElementType();
-  MemRefType dstType = cast<LogicalObjectFifoType>(connectionOp.getTargetType())
-                           .getElementType();
-  ArrayRef<int64_t> sourceShape = srcType.getShape();
-  ArrayRef<int64_t> targetShape = dstType.getShape();
-  int64_t sourceSize = std::accumulate(sourceShape.begin(), sourceShape.end(),
-                                       1, std::multiplies<>());
-  int64_t targetSize = std::accumulate(targetShape.begin(), targetShape.end(),
-                                       1, std::multiplies<>());
-  MemRefType memrefType =
-      sourceSize < targetSize
-          ? MemRefType::get({sourceSize}, srcType.getElementType(),
-                            MemRefLayoutAttrInterface{},
-                            srcType.getMemorySpace())
-          : MemRefType::get({targetSize}, dstType.getElementType(),
-                            MemRefLayoutAttrInterface{},
-                            dstType.getMemorySpace());
-  AIE::AIEObjectFifoType dtype = AIE::AIEObjectFifoType::get(memrefType);
-  auto fifo = rewriter.create<AIE::ObjectFifoCreateOp>(
-      rewriter.getUnknownLoc(), symName, srcTile, dstTiles,
-      rewriter.getIntegerAttr(rewriter.getI32Type(), depth), dtype, sourceDims,
-      targetDims);
-
-  // Insert flow ops
-  rewriter.setInsertionPoint(fifo);
-  for (AMDAIE::ChannelOp producerChannel : producerChannels) {
-    for (AMDAIE::ChannelOp consumerChannel : consumerChannels) {
-      Value aieProducerTile = mapper.lookup(producerChannel.getTile());
-      Value aieConsumerTile = mapper.lookup(consumerChannel.getTile());
-      rewriter.create<AIE::FlowOp>(
-          rewriter.getUnknownLoc(), aieProducerTile, AIE::WireBundle::DMA,
-          producerChannel.getValue(), aieConsumerTile, AIE::WireBundle::DMA,
-          consumerChannel.getValue(), FlatSymbolRefAttr::get(fifo->getContext(), fifo.getName()));
-    }
-  }
+AIE::ShimDMAAllocationOp AIEDeviceBuilder::createShimDmaAllocation(
+    Block *deviceBlock, AMDAIE::TileOp tileOp, AIE::DMAChannelDir dmaChannelDir,
+    uint8_t channel, MemRefType memrefType, int &connectionIndex) {
+  OpBuilder::InsertionGuard g(rewriter);
+  auto shimDmaAllocOp = rewriter.create<AIE::ShimDMAAllocationOp>(
+      rewriter.getUnknownLoc(), "shim_" + std::to_string(connectionIndex++),
+      dmaChannelDir, channel, getConstantIndexOrAssert(tileOp.getCol()));
+  rewriter.setInsertionPointToStart(deviceBlock);
+  StringRef symName = shimDmaAllocOp.getSymName();
+  rewriter.create<memref::GlobalOp>(rewriter.getUnknownLoc(), symName,
+                                    rewriter.getStringAttr("public"),
+                                    memrefType, nullptr, false, nullptr);
+  return shimDmaAllocOp;
+}
 
-  return fifo;
+void AIEDeviceBuilder::eraseOp(Operation *op) {
+  for (Value result : op->getResults()) mapper.erase(result);
+  mapper.erase(op);
+  op->dropAllUses();
+  rewriter.eraseOp(op);
 }
 
-/// Convert `amdaie.logicalobjectfifo.access` to
-/// `aie.objectfifo.subview.access`, and refactor the memory space for
-/// `memref.reinterpret_cast` ops.
-LogicalResult accessOpToAIE(IRRewriter &rewriter,
-                            AMDAIE::LogicalObjectFifoAccessOp accessOp,
-                            IRMapping &mapper,
-                            SmallVector<Operation *> &toBeErased) {
-  LLVM_DEBUG(llvm::dbgs() << "Convert [AMDAIE::LogicalObjectFifoAccessOp]\n");
-  OpBuilder::InsertionGuard guard(rewriter);
-  rewriter.setInsertionPoint(accessOp);
-  if (!mapper.contains(accessOp.getInput())) {
-    return accessOp.emitError()
-           << "this access operation's input has not been mapped";
-  }
-  auto subviewOp = dyn_cast_if_present<AIE::ObjectFifoSubviewAccessOp>(
-      mapper.lookup(accessOp.getInput()).getDefiningOp());
-  if (!subviewOp) {
-    return accessOp.emitError()
-           << "access doesn't operate on an input that has been mapped to an "
-              "`aie.objectfifo.acquire` + subview operation";
-  }
+void AIEDeviceBuilder::foldDims(const SmallVector<OpFoldResult> &offsets,
+                                const SmallVector<OpFoldResult> &sizes,
+                                const SmallVector<OpFoldResult> &strides,
+                                SmallVector<OpFoldResult> &newOffsets,
+                                SmallVector<OpFoldResult> &newSizes,
+                                SmallVector<OpFoldResult> &newStrides) {
+  SmallVector<OpFoldResult> tmpOffsets;
+  SmallVector<OpFoldResult> tmpSizes;
+  SmallVector<OpFoldResult> tmpStrides;
+  (void)foldUnitDims(offsets, sizes, strides, tmpOffsets, tmpSizes, tmpStrides);
+  (void)foldLinearDims(rewriter.getContext(), tmpOffsets, tmpSizes, tmpStrides,
+                       newOffsets, newSizes, newStrides);
+  (void)foldSingleDim(newOffsets, newSizes, newStrides);
+}
 
-  SmallVector<memref::ReinterpretCastOp> oldReinterpretOps;
-  for (Operation *user : accessOp->getUsers()) {
-    if (isa<memref::ReinterpretCastOp>(user)) {
-      oldReinterpretOps.push_back(cast<memref::ReinterpretCastOp>(user));
+void AIEDeviceBuilder::remapOperands(Operation *op) {
+  for (int i = 0; i < op->getNumOperands(); ++i) {
+    Value operand = op->getOperand(i);
+    if (mapper.contains(operand)) {
+      op->setOperand(i, mapper.lookup(operand));
     }
   }
-  if (oldReinterpretOps.empty()) {
-    return accessOp.emitError() << "reinterpret-cast op has not been generated";
-  }
-  assert(oldReinterpretOps.size() == 1 &&
-         "expected a single reinterpret-cast op");
-  auto oldReinterpretOp = oldReinterpretOps[0];
-
-  auto type = cast<MemRefType>(oldReinterpretOp.getResult().getType());
-  MemRefType newType = MemRefType::Builder(type);
-  ArrayRef<int64_t> sizes = newType.getShape();
-  auto [strides, baseOffset] = getStridesAndOffset(newType);
-  auto reinterpretOp = rewriter.create<memref::ReinterpretCastOp>(
-      rewriter.getUnknownLoc(), newType, subviewOp.getOutput(), baseOffset,
-      sizes, strides);
-
-  mapper.map(oldReinterpretOp.getOperation(), reinterpretOp.getOperation());
-  mapper.map(oldReinterpretOp.getResult(), reinterpretOp.getResult());
-  toBeErased.push_back(accessOp);
-  toBeErased.push_back(oldReinterpretOp);
-  return success();
 }
 
-/// Convert `amdaie.logicalobjectfifo.acquire` to `aie.objectfifo.acquire`.
-/// Also insert `aie.objectfifo.subview.access` operations to access the
-/// underlying memref and bridge the gap to AIE.
-LogicalResult acquireOpToAIE(IRRewriter &rewriter,
-                             AMDAIE::LogicalObjectFifoAcquire acquireOp,
-                             IRMapping &mapper,
-                             SmallVector<Operation *> &toBeErased) {
-  LLVM_DEBUG(llvm::dbgs() << "Convert [AMDAIE::LogicalObjectFifoAcquire]\n");
-
-  OpBuilder::InsertionGuard guard(rewriter);
-  rewriter.setInsertionPoint(acquireOp);
-  auto connectionOp = dyn_cast_if_present<AMDAIE::ConnectionOp>(
-      acquireOp.getDma().getDefiningOp());
-  if (!connectionOp) {
-    return connectionOp.emitError()
-           << "acquire doesn't operate on a `amdaie.connection`";
-  }
-
-  auto objFifo = dyn_cast<AIE::ObjectFifoCreateOp>(
-      mapper.lookup(connectionOp.getOperation()));
-  if (!objFifo) {
-    return acquireOp.emitError()
-           << "input isn't mapped to an `aie.objectifo` operation";
-  }
-
-  auto acquireOpType = dyn_cast<LogicalObjectFifoType>(acquireOp.getType());
-  assert(acquireOpType &&
-         "Expected LogicalObjectFifoAcquire to have type "
-         "LogicalObjectFifoType");
-  MemRefType elementType = acquireOpType.getElementType();
-
-  auto subviewType = AIE::AIEObjectFifoSubviewType::get(elementType);
-  AIE::ObjectFifoPort port =
-      acquireOp.getPort() == LogicalObjectFifoPort::Produce
-          ? AIE::ObjectFifoPort::Produce
-          : AIE::ObjectFifoPort::Consume;
-  auto objFifoAquireOp = rewriter.create<AIE::ObjectFifoAcquireOp>(
-      rewriter.getUnknownLoc(), subviewType, port, objFifo.getName(), 1);
-
-  auto subviewOp = rewriter.create<AIE::ObjectFifoSubviewAccessOp>(
-      rewriter.getUnknownLoc(), elementType, objFifoAquireOp.getSubview(),
-      /* index = */ rewriter.getIntegerAttr(rewriter.getI32Type(), 0));
-
-  // Map acquire op to new acquire + subview op.
-  mapper.map(acquireOp.getOperation(), subviewOp.getOperation());
-  mapper.map(acquireOp.getResult(), subviewOp.getOutput());
-  toBeErased.push_back(acquireOp);
-  return success();
-}
+//===----------------------------------------------------------------------===//
+// Convert `amdaie.core` op to `aie.core` op.
+//===----------------------------------------------------------------------===//
 
-LogicalResult coreMemrefExtractStridedMetadataToAIE(
-    IRRewriter &rewriter,
+LogicalResult AIEDeviceBuilder::coreMemrefExtractStridedMetadataToAIE(
     memref::ExtractStridedMetadataOp extractStridedMetadataOp,
-    IRMapping &mapper, SmallVector<Operation *> &toBeErased) {
+    SmallVector<Operation *> &toBeErased) {
   LLVM_DEBUG(llvm::dbgs() << "Convert [memref.extract_strided_metadata]\n");
   OpBuilder::InsertionGuard guard(rewriter);
   rewriter.setInsertionPoint(extractStridedMetadataOp);
@@ -325,9 +214,8 @@ LogicalResult coreMemrefExtractStridedMetadataToAIE(
   return success();
 }
 
-LogicalResult coreFuncCallOpToAIE(IRRewriter &rewriter, func::CallOp oldCallOp,
-                                  IRMapping &mapper,
-                                  SmallVector<Operation *> &toBeErased) {
+LogicalResult AIEDeviceBuilder::coreFuncCallOpToAIE(
+    func::CallOp oldCallOp, SmallVector<Operation *> &toBeErased) {
   LLVM_DEBUG(llvm::dbgs() << "Convert [func.call / function declaration]\n");
   // Form new argument(s) and function type for the func.call op.
   SmallVector<Value> newArgs;
@@ -370,34 +258,32 @@ LogicalResult coreFuncCallOpToAIE(IRRewriter &rewriter, func::CallOp oldCallOp,
   return success();
 }
 
-LogicalResult coreReleaseOpToAIE(IRRewriter &rewriter,
-                                 AMDAIE::LogicalObjectFifoRelease releaseOp,
-                                 IRMapping &mapper,
-                                 SmallVector<Operation *> &toBeErased) {
-  LLVM_DEBUG(llvm::dbgs() << "Convert [AMDAIE::LogicalObjectFifoRelease]\n");
+LogicalResult AIEDeviceBuilder::coreUseLockToAIE(
+    AMDAIE::UseLockOp useLockOp, SmallVector<Operation *> &toBeErased) {
+  LLVM_DEBUG(llvm::dbgs() << "Convert [AMDAIE::UseLockOp]\n");
   OpBuilder::InsertionGuard guard(rewriter);
-  rewriter.setInsertionPoint(releaseOp);
-  Operation *dmaOp = releaseOp.getDma().getDefiningOp();
-  auto objFifo = dyn_cast<AIE::ObjectFifoCreateOp>(mapper.lookup(dmaOp));
-  if (!objFifo) {
-    return releaseOp.emitError()
-           << "input isn't mapped to an `aie.objectifo` operation";
+  AIE::LockAction lockAction;
+  if (useLockOp.getAction() == AMDAIE::LockAction::AcquireGreaterOrEqual) {
+    lockAction = AIE::LockAction::AcquireGreaterEqual;
+  } else if (useLockOp.getAction() == AMDAIE::LockAction::Acquire) {
+    lockAction = AIE::LockAction::Acquire;
+  } else if (useLockOp.getAction() == AMDAIE::LockAction::Release) {
+    lockAction = AIE::LockAction::Release;
+  } else {
+    return useLockOp.emitOpError() << "unsupported lock action in lowering to "
+           << "AIE: " << stringifyEnum(useLockOp.getAction());
   }
-  AIE::ObjectFifoPort port =
-      releaseOp.getPort() == LogicalObjectFifoPort::Produce
-          ? AIE::ObjectFifoPort::Produce
-          : AIE::ObjectFifoPort::Consume;
-  std::optional<unsigned> maybeSize = releaseOp.getSize();
-  unsigned size = maybeSize ? maybeSize.value() : 1;
-  rewriter.replaceOpWithNewOp<AIE::ObjectFifoReleaseOp>(
-      releaseOp, port, objFifo.getName(), size);
+  Value aieLock = mapper.lookup(useLockOp.getLock());
+  rewriter.create<AIE::UseLockOp>(useLockOp.getLoc(), aieLock, lockAction,
+                                  useLockOp.getValue());
+  toBeErased.push_back(useLockOp);
   return success();
 }
 
 /// Convert `amdaie.core` into `aie.core`.
-LogicalResult coreToAIE(IRRewriter &rewriter, AMDAIE::CoreOp coreOp,
-                        IRMapping &mapper, AIE::DeviceOp deviceOp,
-                        Block *deviceCoreBlock) {
+LogicalResult AIEDeviceBuilder::coreToAIE(AMDAIE::CoreOp coreOp,
+                                          AIE::DeviceOp deviceOp,
+                                          Block *deviceCoreBlock) {
   LLVM_DEBUG(llvm::dbgs() << "Convert [AMDAIE::CoreOp]\n");
   OpBuilder::InsertionGuard guard(rewriter);
   rewriter.setInsertionPointToEnd(deviceCoreBlock);
@@ -429,27 +315,19 @@ LogicalResult coreToAIE(IRRewriter &rewriter, AMDAIE::CoreOp coreOp,
   WalkResult walkResult = aieCoreOp.walk([&](Operation *op) {
     rewriter.setInsertionPoint(op);
     if (TypeSwitch<Operation *, LogicalResult>(op)
-            .Case<AMDAIE::LogicalObjectFifoAccessOp>([&](auto accessOp) {
-              return accessOpToAIE(rewriter, accessOp, mapper, toBeErased);
-            })
-            .Case<AMDAIE::LogicalObjectFifoAcquire>([&](auto acquireOp) {
-              return acquireOpToAIE(rewriter, acquireOp, mapper, toBeErased);
-            })
-            .Case<AMDAIE::LogicalObjectFifoRelease>([&](auto releaseOp) {
-              return coreReleaseOpToAIE(rewriter, releaseOp, mapper,
-                                        toBeErased);
-            })
             .Case<memref::ExtractStridedMetadataOp>(
                 [&](auto extractStridedMetadataOp) {
                   return coreMemrefExtractStridedMetadataToAIE(
-                      rewriter, extractStridedMetadataOp, mapper, toBeErased);
+                      extractStridedMetadataOp, toBeErased);
                 })
             .Case<func::CallOp>([&](auto oldCallOp) {
-              return coreFuncCallOpToAIE(rewriter, oldCallOp, mapper,
-                                         toBeErased);
+              return coreFuncCallOpToAIE(oldCallOp, toBeErased);
+            })
+            .Case<AMDAIE::UseLockOp>([&](auto useLockOp) {
+              return coreUseLockToAIE(useLockOp, toBeErased);
             })
             .Default([&](Operation *op) {
-              remapOperands(op, mapper);
+              remapOperands(op);
               return success();
             })
             .failed()) {
@@ -461,84 +339,28 @@ LogicalResult coreToAIE(IRRewriter &rewriter, AMDAIE::CoreOp coreOp,
     coreOp.emitError("could not convert to AIEDialect ops");
     return failure();
   }
-  for (Operation *op : toBeErased) eraseOp(rewriter, mapper, op);
+  for (Operation *op : toBeErased) eraseOp(op);
 
   mapper.map(coreOp.getResult(), aieCoreOp.getResult());
   mapper.map(coreOp.getOperation(), aieCoreOp.getOperation());
   return success();
 }
 
-}  // namespace
-
-//===----------------------------------------------------------------------===//
-// Convert amdaie.circular_dma_cpy_nd operation to aie.objectfifo
-//===----------------------------------------------------------------------===//
-
-/// Convert the `amdaie.connection` operation into bidirectional object
-/// fifos.
-LogicalResult flowToAIE(IRRewriter &rewriter, AMDAIE::ConnectionOp connectionOp,
-                        IRMapping &mapper, Block *deviceBlock, int &dmaId) {
-  LLVM_DEBUG(llvm::dbgs() << "Convert [AMDAIE::CircularDmaCpyNdOp]\n");
-  rewriter.setInsertionPointToEnd(deviceBlock);
-  if (!connectionOp.getSource())
-    return connectionOp.emitOpError() << "expected a source";
-  auto sourceLogicalObjFifo =
-      dyn_cast_if_present<AMDAIE::LogicalObjFifoOpInterface>(
-          connectionOp.getSource().getDefiningOp());
-  if (!sourceLogicalObjFifo)
-    return connectionOp.emitOpError() << "expected a logical objectFifo source";
-  SmallVector<Value> newSourceTiles =
-      llvm::map_to_vector(sourceLogicalObjFifo.getTiles(),
-                          [&](Value tile) { return mapper.lookup(tile); });
-  if (newSourceTiles.size() != 1) {
-    return connectionOp.emitError()
-           << "Can't create an `aie.objectfifo` from this flow operation as "
-              "`ObjectFifoCreateOp` only handles a single source tile for now, "
-              "but got: ";
-  }
-  Value newSourceTile = newSourceTiles[0];
-
-  if (!connectionOp.getTarget())
-    return connectionOp.emitOpError() << "expected a source";
-  auto targetLogicalObjFifo =
-      dyn_cast_if_present<AMDAIE::LogicalObjFifoOpInterface>(
-          connectionOp.getTarget().getDefiningOp());
-  if (!targetLogicalObjFifo)
-    return connectionOp.emitOpError() << "expected a logical objectFifo source";
-  SmallVector<Value> newTargetTiles =
-      llvm::map_to_vector(targetLogicalObjFifo.getTiles(),
-                          [&](Value tile) { return mapper.lookup(tile); });
-
-  FailureOr<AMDAIE::NpuCircularDmaCpyNdOp> npuDmaUserOp =
-      connectionOp.getNpuCircularDmaCpyNdUser();
-  if (failed(npuDmaUserOp)) return failure();
-
-  auto symName = "obj" + std::to_string(dmaId++);
-  StringAttr symAttr = rewriter.getStringAttr(symName);
-  FailureOr<AIE::ObjectFifoCreateOp> objFifo =
-      createObjectFifo(rewriter, connectionOp, mapper, npuDmaUserOp.value(),
-                       newSourceTile, newTargetTiles, symAttr);
-  if (failed(objFifo)) return failure();
-  mapper.map(connectionOp.getOperation(), objFifo.value().getOperation());
-  return success();
-}
-
 //===----------------------------------------------------------------------===//
 // Convert amdaie.controlcode operation to NPU instruction func
 //===----------------------------------------------------------------------===//
 
 /// Convert the `amdaie.npu.dma_cpy_nd` operation to `aiex.npu.dma_memcpy_nd`.
-LogicalResult npuDmaCpyNdOpToAIE(IRRewriter &rewriter,
-                                 AMDAIE::NpuDmaCpyNdOp dmaOp,
-                                 SmallVector<Operation *> &toBeErased,
-                                 IRMapping &mapper, IRMapping &bindingsMapper) {
+LogicalResult AIEDeviceBuilder::npuDmaCpyNdOpToAIE(
+    AMDAIE::NpuDmaCpyNdOp dmaOp, SmallVector<Operation *> &toBeErased) {
+  LLVM_DEBUG(llvm::dbgs() << "Convert [AMDAIE::NpuDmaCpyNdOp]\n");
   AMDAIE::ConnectionOp connectionOp = dmaOp.getConnectionOp();
 
   SmallVector<Value> offsets, sizes, strides;
   ArrayRef<int64_t> staticOffsets, staticSizes, staticStrides;
   AMDAIE::BdIdOp bdIdOp;
   LogicalObjectFifoFromMemrefOp logicalObjFifo;
-
+  SmallVector<Operation *> memOps;
   // Convert bidirectional `amdaie.npu.dma_cpy_nd` op into two halves.
   if (dmaOp.getSource()) {
     offsets = dmaOp.getSourceOffsets();
@@ -558,9 +380,8 @@ LogicalResult npuDmaCpyNdOpToAIE(IRRewriter &rewriter,
       return dmaOp.emitOpError() << "expected source to be an "
                                     "`amdaie.logicalobjectfifo.from_memref`";
     }
-  }
-
-  else if (dmaOp.getTarget()) {
+    memOps = connectionToSourceTargetMemOps[connectionOp].first;
+  } else if (dmaOp.getTarget()) {
     offsets = dmaOp.getTargetOffsets();
     sizes = dmaOp.getTargetSizes();
     strides = dmaOp.getTargetStrides();
@@ -578,23 +399,21 @@ LogicalResult npuDmaCpyNdOpToAIE(IRRewriter &rewriter,
       return dmaOp.emitOpError() << "expected target to be an "
                                     "`amdaie.logicalobjectfifo.from_memref`";
     }
-  }
-
-  else {
+    memOps = connectionToSourceTargetMemOps[connectionOp].second;
+  } else {
     return dmaOp.emitOpError()
            << "has neither source not target memory space as L3.";
   }
 
   Value memref = bindingsMapper.lookup(logicalObjFifo.getMemref());
 
-  auto objFifo = dyn_cast<AIE::ObjectFifoCreateOp>(
-      mapper.lookup(connectionOp.getOperation()));
-
-  uint32_t bdId = bdIdOp.getValue();
-
-  if (!objFifo) {
-    return dmaOp.emitError()
-           << "input isn't mapped to an `aie.objectifo` operation";
+  if (memOps.size() != 1) {
+    return dmaOp.emitOpError() << "only a single connection op source expected";
+  }
+  auto shimDmaAllocOp = dyn_cast<AIE::ShimDMAAllocationOp>(memOps[0]);
+  if (!shimDmaAllocOp) {
+    return dmaOp.emitOpError() << "expected the source of the connection to "
+                                  "be mapped to a `AIE::ShimDMAAllocationOp`";
   }
 
   if (!offsets.empty() || !sizes.empty() || !strides.empty()) {
@@ -607,41 +426,52 @@ LogicalResult npuDmaCpyNdOpToAIE(IRRewriter &rewriter,
               "aiex.npu.dma_memcpy_nd.";
   }
 
+  uint32_t bdId = bdIdOp.getValue();
   bool issueToken = dmaOp.hasDmaWaitOpUser();
 
   rewriter.setInsertionPoint(dmaOp);
   rewriter.create<AIEX::NpuDmaMemcpyNdOp>(
       dmaOp.getLoc(), SmallVector<Type, 1>{}, 0, 0, memref, offsets, sizes,
       strides, staticOffsets, staticSizes, staticStrides, nullptr,
-      objFifo.getName(), bdId, issueToken);
+      shimDmaAllocOp.getSymName(), bdId, issueToken);
 
   toBeErased.push_back(dmaOp);
   return success();
 }
 
 /// Convert the `amdaie.npu.dma_wait` operation to `aiex.npu.dma_wait`.
-LogicalResult npuDmaWaitToAIE(IRRewriter &rewriter, AMDAIE::NpuDmaWaitOp waitOp,
-                              SmallVector<Operation *> &toBeErased,
-                              IRMapping &mapper, IRMapping &bindingsMapper) {
+LogicalResult AIEDeviceBuilder::npuDmaWaitToAIE(
+    AMDAIE::NpuDmaWaitOp waitOp, SmallVector<Operation *> &toBeErased) {
+  LLVM_DEBUG(llvm::dbgs() << "Convert [AMDAIE::NpuDmaWaitOp]\n");
   rewriter.setInsertionPoint(waitOp);
   AMDAIE::ConnectionOp connectionOp = waitOp.getDmaOp().getConnectionOp();
-  auto objFifo = dyn_cast<xilinx::AIE::ObjectFifoCreateOp>(
-      mapper.lookup(connectionOp.getOperation()));
-  if (!objFifo) {
-    return waitOp.emitError()
-           << "input isn't mapped to an `aie.objectifo` operation";
+  if (!connectionToSourceTargetMemOps.contains(connectionOp)) {
+    return connectionOp.emitOpError()
+           << "should be found in the connection to source/target mem ops map";
+  }
+  SmallVector<Operation *> memOps =
+      waitOp.getDirection() == AMDAIE::DMAChannelDir::MM2S
+          ? connectionToSourceTargetMemOps[connectionOp].first
+          : connectionToSourceTargetMemOps[connectionOp].second;
+  if (memOps.size() != 1) {
+    return waitOp.emitOpError()
+           << "only a single connection op source expected";
+  }
+  auto shimDmaAllocOp = dyn_cast<AIE::ShimDMAAllocationOp>(memOps[0]);
+  if (!shimDmaAllocOp) {
+    return waitOp.emitOpError() << "expected the source of the connection to "
+                                   "be mapped to a `AIE::ShimDMAAllocationOp`";
   }
   rewriter.create<AIEX::NpuDmaWaitOp>(rewriter.getUnknownLoc(),
-                                      objFifo.getName());
+                                      shimDmaAllocOp.getSymName());
   toBeErased.push_back(waitOp);
   return success();
 }
 
 /// Insert the control code operations into the NPU instruction function.
-LogicalResult controlCodeToAie(IRRewriter &rewriter,
-                               AMDAIE::ControlCodeOp controlCodeOp,
-                               xilinx::AIEX::RuntimeSequenceOp funcOp,
-                               IRMapping &mapper, IRMapping &bindingsMapper) {
+LogicalResult AIEDeviceBuilder::controlCodeToAIE(
+    AMDAIE::ControlCodeOp controlCodeOp,
+    xilinx::AIEX::RuntimeSequenceOp funcOp) {
   LLVM_DEBUG(llvm::dbgs() << "Convert [AMDAIE::ControlCodeOp]\n");
   Block *funcBlock = &funcOp.getBody().front();
   rewriter.setInsertionPointToEnd(funcBlock);
@@ -667,23 +497,21 @@ LogicalResult controlCodeToAie(IRRewriter &rewriter,
                   // TODO(jornt): This is temporarily handled already by
                   // combining with `ConnectionOp` to create `aie.objectfifo`
                   // until we get rid of those.
-                  eraseOp(rewriter, mapper, dmaOp);
+                  eraseOp(dmaOp);
                   return success();
                 })
                 .Case<AMDAIE::NpuDmaCpyNdOp>([&](auto dmaOp) {
-                  return npuDmaCpyNdOpToAIE(rewriter, dmaOp, toBeErased, mapper,
-                                            bindingsMapper);
+                  return npuDmaCpyNdOpToAIE(dmaOp, toBeErased);
                 })
                 .Case<AMDAIE::NpuDmaWaitOp>([&](auto waitOp) {
-                  return npuDmaWaitToAIE(rewriter, waitOp, toBeErased, mapper,
-                                         bindingsMapper);
+                  return npuDmaWaitToAIE(waitOp, toBeErased);
                 })
                 .Case<AMDAIE::EndOp>([&](auto endOp) {
-                  eraseOp(rewriter, mapper, endOp);
+                  eraseOp(endOp);
                   return success();
                 })
                 .Default([&](Operation *op) {
-                  remapOperands(op, mapper);
+                  remapOperands(op);
                   return success();
                 })
                 .failed()) {
@@ -692,55 +520,339 @@ LogicalResult controlCodeToAie(IRRewriter &rewriter,
         return WalkResult::advance();
       });
   if (res.wasInterrupted()) return failure();
-  for (Operation *op : toBeErased) eraseOp(rewriter, mapper, op);
+  for (Operation *op : toBeErased) eraseOp(op);
   return success();
 }
 
 //===----------------------------------------------------------------------===//
-// Convert amdaie.logicalobjectfifo.link operation to `aie.objectfifo.link`
+// Convert ops in Workgroup to AIE ops
 //===----------------------------------------------------------------------===//
 
-LogicalResult linkToAIE(IRRewriter &rewriter,
-                        AMDAIE::LogicalObjectFifoLink linkOp, IRMapping &mapper,
-                        Block *deviceBlock) {
-  LLVM_DEBUG(llvm::dbgs() << "Convert [AMDAIE::LogicalObjectFifoLink]\n");
+/// Convert `amdaie.buffer` to `aie.buffer`.
+LogicalResult AIEDeviceBuilder::bufferToAIE(AMDAIE::BufferOp bufferOp,
+                                            Block *deviceBlock, int &bufferId) {
+  LLVM_DEBUG(llvm::dbgs() << "Convert [AMDAIE::BufferOp]\n");
   OpBuilder::InsertionGuard guard(rewriter);
   rewriter.setInsertionPointToEnd(deviceBlock);
-  SmallVector<Attribute> inSyms;
-  for (auto in : linkOp.getIns()) {
-    auto objFifo = dyn_cast<xilinx::AIE::ObjectFifoCreateOp>(
-        mapper.lookup(in.getDefiningOp()));
-    if (!objFifo) {
-      return linkOp.emitError()
-             << "input isn't mapped to an `aie.objectifo` operation";
+  auto elemType = cast<MemRefType>(bufferOp.getType());
+  Value tile = mapper.lookup(bufferOp.getTile());
+  auto aieBufferOp = rewriter.create<AIE::BufferOp>(
+      bufferOp.getLoc(), elemType, tile,
+      rewriter.getStringAttr("buff_" + std::to_string(bufferId++)),
+      /*address*/ bufferOp.getAddressAttr(),
+      /*mem_bank*/ nullptr);
+  mapper.map(bufferOp.getResult(), aieBufferOp.getResult());
+  mapper.map(bufferOp.getOperation(), aieBufferOp.getOperation());
+  return success();
+}
+
+/// Convert the `amdaie.connection` operation into `aie.flow` ops and DMA
+/// operations. Depending on the location of the source/target of the
+/// connection, different DMA ops are created:
+/// 1. Source/target on a Shim tile: iterate through producer/consumer channels
+/// and create corresponding `aie.shim_dma_allocation` ops.
+/// 2. Source/target on MemTile: iterate through producer/consumer channels,
+/// lookup the correct `aie.memtile_dma` op and create new DMA BD blocks inside.
+/// 3. Source/target on L1 memory: iterate through producer/consumer channels,
+/// lookup the correct `aie.mem` op and create new DMA BD blocks inside.
+LogicalResult AIEDeviceBuilder::connectionToAIE(
+    AMDAIE::ConnectionOp connectionOp, Block *deviceBlock,
+    int &connectionIndex) {
+  LLVM_DEBUG(llvm::dbgs() << "Convert [AMDAIE::ConnectionOp]\n");
+  rewriter.setInsertionPointToEnd(deviceBlock);
+  SmallVector<AMDAIE::ChannelOp> producerChannels;
+  SmallVector<AMDAIE::ChannelOp> consumerChannels;
+  for (Value producerChannel : connectionOp.getSourceChannels()) {
+    auto channelOp =
+        dyn_cast<AMDAIE::ChannelOp>(producerChannel.getDefiningOp());
+    if (!channelOp) {
+      return connectionOp.emitOpError()
+             << "found non-`amdaie.channel` source channel";
     }
-    inSyms.push_back(
-        SymbolRefAttr::get(rewriter.getContext(), objFifo.getSymName()));
+    producerChannels.push_back(channelOp);
   }
-  SmallVector<Attribute> outSyms;
-  for (auto out : linkOp.getOuts()) {
-    auto objFifo = dyn_cast<xilinx::AIE::ObjectFifoCreateOp>(
-        mapper.lookup(out.getDefiningOp()));
-    if (!objFifo) {
-      return linkOp.emitError()
-             << "output isn't mapped to an `aie.objectifo` operation";
+  for (Value consumerChannel : connectionOp.getTargetChannels()) {
+    auto channelOp =
+        dyn_cast<AMDAIE::ChannelOp>(consumerChannel.getDefiningOp());
+    if (!channelOp) {
+      return connectionOp.emitOpError()
+             << "found non-`amdaie.channel` target channel";
     }
-    outSyms.push_back(
-        SymbolRefAttr::get(rewriter.getContext(), objFifo.getSymName()));
+    consumerChannels.push_back(channelOp);
+  }
+  // Insert flow ops.
+  rewriter.setInsertionPointToEnd(deviceBlock);
+  for (AMDAIE::ChannelOp producerChannel : producerChannels) {
+    for (AMDAIE::ChannelOp consumerChannel : consumerChannels) {
+      Value aieProducerTile = mapper.lookup(producerChannel.getTile());
+      Value aieConsumerTile = mapper.lookup(consumerChannel.getTile());
+      rewriter.create<AIE::FlowOp>(
+          rewriter.getUnknownLoc(), aieProducerTile, AIE::WireBundle::DMA,
+          producerChannel.getValue(), aieConsumerTile, AIE::WireBundle::DMA,
+          consumerChannel.getValue());
+    }
+  }
+
+  FailureOr<AMDAIE::NpuCircularDmaCpyNdOp> maybeNpuDmaUserOp =
+      connectionOp.getNpuCircularDmaCpyNdUser();
+  if (failed(maybeNpuDmaUserOp))
+    return connectionOp.emitOpError() << "has no circular NPU DMA op user";
+
+  SmallVector<Operation *> sourceMemOps;
+  Value source = connectionOp.getSource();
+  auto sourceObjFifoLikeOp =
+      dyn_cast_if_present<AMDAIE::LogicalObjFifoOpInterface>(
+          source.getDefiningOp());
+  if (!sourceObjFifoLikeOp) {
+    return connectionOp.emitOpError()
+           << "expected source to be an logical objFifo-like op";
+  }
+  if (sourceObjFifoLikeOp.getMemorySpaceAsUInt() == 0) {
+    for (AMDAIE::ChannelOp channel : producerChannels) {
+      AIE::ShimDMAAllocationOp shimDmaAllocOp = createShimDmaAllocation(
+          deviceBlock, channel.getTileOp(), AIE::DMAChannelDir::MM2S,
+          channel.getValue(), sourceObjFifoLikeOp.getMemrefType(),
+          connectionIndex);
+      sourceMemOps.push_back(shimDmaAllocOp.getOperation());
+    }
+  } else {
+    auto sourceObjFifo =
+        dyn_cast_if_present<AMDAIE::LogicalObjectFifoFromBuffersOp>(
+            source.getDefiningOp());
+    if (!sourceObjFifo) {
+      return connectionOp.emitOpError()
+             << "expected source to be an "
+                "`amdaie.logicalobjectfifo.from_buffers` op";
+    }
+    std::optional<size_t> maybeSize = maybeNpuDmaUserOp->getSourceStaticSize();
+    if (!maybeSize) {
+      return maybeNpuDmaUserOp->emitOpError()
+             << "could not compute a static access size for source";
+    }
+    std::optional<size_t> maybeOffset =
+        maybeNpuDmaUserOp->getSourceStaticBaseOffset();
+    if (!maybeOffset) {
+      return maybeNpuDmaUserOp->emitOpError()
+             << "could not compute a static base offset for source";
+    }
+    AIE::BDDimLayoutArrayAttr dims = convertSizeStrideToBDDimLayoutArrayAttr(
+        maybeNpuDmaUserOp->getSourceMixedSizes(),
+        maybeNpuDmaUserOp->getSourceMixedStrides());
+    SmallVector<CopyOpInterface> objFifoProducers =
+        sourceObjFifo.getCopyLikeProducers();
+    SmallVector<CopyOpInterface> objFifoConsumers =
+        sourceObjFifo.getCopyLikeConsumers();
+    // Default acquire/release value is 1. Will be adjusted depending on number
+    // of producers/consumers.
+    int acqNum{1};
+    if (objFifoConsumers.size() < objFifoProducers.size()) {
+      assert(objFifoProducers.size() % objFifoConsumers.size() == 0);
+      acqNum = objFifoProducers.size() / objFifoConsumers.size();
+    }
+    for (AMDAIE::ChannelOp channel : producerChannels) {
+      Operation *memOp = tileToMemOpMap.at(channel.getTile());
+      AMDAIE::TileOp tileOp = channel.getTileOp();
+      SmallVector<AIE::BufferOp> buffers = llvm::map_to_vector(
+          sourceObjFifo.getBuffersOnTile(tileOp),
+          [&](AMDAIE::BufferOp bufferOp) {
+            return cast<AIE::BufferOp>(mapper.lookup(bufferOp.getOperation()));
+          });
+      SmallVector<AIE::LockOp> producerLocks = llvm::map_to_vector(
+          sourceObjFifo.getProducerLocksOnTile(tileOp),
+          [&](AMDAIE::LockOp lockOp) {
+            return cast<AIE::LockOp>(mapper.lookup(lockOp.getOperation()));
+          });
+      SmallVector<AIE::LockOp> consumerLocks = llvm::map_to_vector(
+          sourceObjFifo.getConsumerLocksOnTile(tileOp),
+          [&](AMDAIE::LockOp lockOp) {
+            return cast<AIE::LockOp>(mapper.lookup(lockOp.getOperation()));
+          });
+      if (producerLocks.size() != 1) {
+        return sourceObjFifo.emitOpError()
+               << "expected a single producer lock for tile: "
+               << channel.getTile() << ", channel: " << channel.getResult();
+      }
+      if (consumerLocks.size() != 1) {
+        return sourceObjFifo.emitOpError()
+               << "expected a single consumer lock for tile: "
+               << channel.getTile() << ", channel: " << channel.getResult();
+      }
+      std::pair<AIE::LockOp, AIE::LockOp> lockPair =
+          std::make_pair(consumerLocks[0], producerLocks[0]);
+      rewriter.moveOpBefore(memOp, deviceBlock, deviceBlock->end());
+      createDMA(memOp, AIE::DMAChannelDir::MM2S, channel.getValue(), dims,
+                acqNum, acqNum, maybeSize.value(), maybeOffset.value(), buffers,
+                lockPair);
+    }
+  }
+
+  SmallVector<Operation *> targetMemOps;
+  Value target = connectionOp.getTarget();
+  auto targetObjFifoLikeOp =
+      dyn_cast_if_present<AMDAIE::LogicalObjFifoOpInterface>(
+          target.getDefiningOp());
+  if (!targetObjFifoLikeOp) {
+    return connectionOp.emitOpError()
+           << "expected target to be an logical objFifo-like op";
+  }
+  if (targetObjFifoLikeOp.getMemorySpaceAsUInt() == 0) {
+    for (AMDAIE::ChannelOp channel : consumerChannels) {
+      AIE::ShimDMAAllocationOp shimDmaAllocOp = createShimDmaAllocation(
+          deviceBlock, channel.getTileOp(), AIE::DMAChannelDir::S2MM,
+          channel.getValue(), targetObjFifoLikeOp.getMemrefType(),
+          connectionIndex);
+      targetMemOps.push_back(shimDmaAllocOp.getOperation());
+    }
+  } else {
+    auto targetObjFifo =
+        dyn_cast_if_present<AMDAIE::LogicalObjectFifoFromBuffersOp>(
+            target.getDefiningOp());
+    if (!targetObjFifo) {
+      return connectionOp.emitOpError()
+             << "expected target to be an "
+                "`amdaie.logicalobjectfifo.from_buffers` op";
+    }
+    std::optional<size_t> maybeSize = maybeNpuDmaUserOp->getTargetStaticSize();
+    if (!maybeSize) {
+      return maybeNpuDmaUserOp->emitOpError()
+             << "could not compute a static access size for target";
+    }
+    std::optional<size_t> maybeOffset =
+        maybeNpuDmaUserOp->getTargetStaticBaseOffset();
+    if (!maybeOffset) {
+      return maybeNpuDmaUserOp->emitOpError()
+             << "could not compute a static base offset for target";
+    }
+    AIE::BDDimLayoutArrayAttr dims = convertSizeStrideToBDDimLayoutArrayAttr(
+        maybeNpuDmaUserOp->getTargetMixedSizes(),
+        maybeNpuDmaUserOp->getTargetMixedStrides());
+    SmallVector<CopyOpInterface> objFifoProducers =
+        targetObjFifo.getCopyLikeProducers();
+    SmallVector<CopyOpInterface> objFifoConsumers =
+        targetObjFifo.getCopyLikeConsumers();
+    // Default acquire/release value is 1. Will be adjusted depending on number
+    // of producers/consumers.
+    int acqNum{1};
+    if (objFifoProducers.size() < objFifoConsumers.size()) {
+      assert(objFifoConsumers.size() % objFifoProducers.size() == 0);
+      acqNum = objFifoConsumers.size() / objFifoProducers.size();
+    }
+    for (AMDAIE::ChannelOp channel : consumerChannels) {
+      Operation *memOp = tileToMemOpMap.at(channel.getTile());
+      AMDAIE::TileOp tileOp = channel.getTileOp();
+      SmallVector<AIE::BufferOp> buffers = llvm::map_to_vector(
+          targetObjFifo.getBuffersOnTile(tileOp),
+          [&](AMDAIE::BufferOp bufferOp) {
+            return cast<AIE::BufferOp>(mapper.lookup(bufferOp.getOperation()));
+          });
+      SmallVector<AIE::LockOp> producerLocks = llvm::map_to_vector(
+          targetObjFifo.getProducerLocksOnTile(tileOp),
+          [&](AMDAIE::LockOp lockOp) {
+            return cast<AIE::LockOp>(mapper.lookup(lockOp.getOperation()));
+          });
+      SmallVector<AIE::LockOp> consumerLocks = llvm::map_to_vector(
+          targetObjFifo.getConsumerLocksOnTile(tileOp),
+          [&](AMDAIE::LockOp lockOp) {
+            return cast<AIE::LockOp>(mapper.lookup(lockOp.getOperation()));
+          });
+      if (producerLocks.size() != 1) {
+        return targetObjFifo.emitOpError()
+               << "expected a single producer lock for tile: "
+               << channel.getTile();
+      }
+      if (consumerLocks.size() != 1) {
+        return targetObjFifo.emitOpError()
+               << "expected a single consumer lock for tile: "
+               << channel.getTile();
+      }
+      std::pair<AIE::LockOp, AIE::LockOp> lockPair =
+          std::make_pair(producerLocks[0], consumerLocks[0]);
+      rewriter.moveOpBefore(memOp, deviceBlock, deviceBlock->end());
+      createDMA(memOp, AIE::DMAChannelDir::S2MM, channel.getValue(), dims,
+                acqNum, acqNum, maybeSize.value(), maybeOffset.value(), buffers,
+                lockPair);
+    }
+  }
+
+  // Keep track of source/target mem ops for this connection for later retrieval
+  // to create NPU ops.
+  connectionToSourceTargetMemOps[connectionOp] =
+      std::make_pair(sourceMemOps, targetMemOps);
+  return success();
+}
+
+LogicalResult AIEDeviceBuilder::lockToAIE(AMDAIE::LockOp lockOp,
+                                          Block *deviceBlock, int &lockIndex) {
+  LLVM_DEBUG(llvm::dbgs() << "Convert [AMDAIE::LockOp]\n");
+  OpBuilder::InsertionGuard guard(rewriter);
+  rewriter.setInsertionPointToEnd(deviceBlock);
+  Value tile = mapper.lookup(lockOp.getTile());
+  auto aieLockOp = rewriter.create<AIE::LockOp>(
+      lockOp.getLoc(), tile, lockOp.getValueAttr(), lockOp.getInitValueAttr(),
+      rewriter.getStringAttr("lock_" + std::to_string(lockIndex++)));
+  mapper.map(lockOp.getResult(), aieLockOp.getResult());
+  mapper.map(lockOp.getOperation(), aieLockOp.getOperation());
+  return success();
+}
+
+template <typename MemOp>
+LogicalResult logicalObjFifoFromBuffersToMemOp(
+    IRRewriter &rewriter, AMDAIE::LogicalObjectFifoFromBuffersOp logicalObjFifo,
+    IRMapping &mapper, Block *deviceBlock,
+    DenseMap<Value, Operation *> &tileToMemOpMap) {
+  LLVM_DEBUG(
+      llvm::dbgs() << "Convert [AMDAIE::LogicalObjectFifoFromBuffersOp]\n");
+  OpBuilder::InsertionGuard guard(rewriter);
+  SmallVector<CopyOpInterface> consumers =
+      logicalObjFifo.getCopyLikeConsumers();
+  SmallVector<CopyOpInterface> producers =
+      logicalObjFifo.getCopyLikeProducers();
+  if (producers.size() > 1 && consumers.size() > 1) {
+    return logicalObjFifo.emitOpError()
+           << "has a multi-producer, multi-consumer DMA "
+              "pattern, which is currently not supported";
+  }
+  // Create a memory op for every unique tile and fill it with DMA ops.
+  for (Value tile : logicalObjFifo.getTiles()) {
+    if (tileToMemOpMap.contains(tile)) continue;
+    Value aieTile = mapper.lookup(tile);
+    rewriter.setInsertionPointToEnd(deviceBlock);
+    auto newMemOp = rewriter.create<MemOp>(rewriter.getUnknownLoc(), aieTile);
+    rewriter.setInsertionPointToStart(&newMemOp.getRegion().emplaceBlock());
+    rewriter.create<AIE::EndOp>(rewriter.getUnknownLoc());
+    // Keep track of the MemOps on different tiles.
+    tileToMemOpMap[tile] = newMemOp.getOperation();
+  }
+  return success();
+}
+
+LogicalResult AIEDeviceBuilder::logicalObjFifoFromBuffersToAIE(
+    AMDAIE::LogicalObjectFifoFromBuffersOp logicalObjFifo, Block *deviceBlock) {
+  LLVM_DEBUG(
+      llvm::dbgs() << "Convert [AMDAIE::LogicalObjectFifoFromBuffersOp]\n");
+  uint8_t memSpaceUInt = logicalObjFifo.getMemorySpaceAsUInt();
+  if (memSpaceUInt == 1) {
+    // L2
+    return logicalObjFifoFromBuffersToMemOp<AIE::MemTileDMAOp>(
+        rewriter, logicalObjFifo, mapper, deviceBlock, tileToMemOpMap);
+  } else if (memSpaceUInt == 2) {
+    // L1
+    return logicalObjFifoFromBuffersToMemOp<AIE::MemOp>(
+        rewriter, logicalObjFifo, mapper, deviceBlock, tileToMemOpMap);
+  } else {
+    return logicalObjFifo.emitOpError()
+           << "has unsupported memory space for lowering to AIE: "
+           << std::to_string(memSpaceUInt);
   }
-  rewriter.create<AIE::ObjectFifoLinkOp>(
-      rewriter.getUnknownLoc(), rewriter.getArrayAttr(inSyms),
-      rewriter.getArrayAttr(outSyms), rewriter.getArrayAttr({}),
-      rewriter.getArrayAttr({}));
   return success();
 }
 
 //===----------------------------------------------------------------------===//
-// Convert amdaie.tile operation to aie.tile
+// Convert `amdaie.tile` operation to `aie.tile`
 //===----------------------------------------------------------------------===//
 
-LogicalResult tileToAIE(IRRewriter &rewriter, AMDAIE::TileOp tileOp,
-                        IRMapping &mapper, Block *deviceBlock) {
+LogicalResult AIEDeviceBuilder::tileToAIE(AMDAIE::TileOp tileOp,
+                                          Block *deviceBlock) {
   LLVM_DEBUG(llvm::dbgs() << "Convert [AMDAIE::TileOp]\n");
   OpBuilder::InsertionGuard guard(rewriter);
   int64_t col = getConstantIntValue(tileOp.getCol()).value();
@@ -757,18 +869,18 @@ LogicalResult tileToAIE(IRRewriter &rewriter, AMDAIE::TileOp tileOp,
 // Convert amdaie.workgroup operation and insert into aie.device
 //===----------------------------------------------------------------------===//
 
-LogicalResult workgroupToAIE(IRRewriter &rewriter,
-                             AMDAIE::WorkgroupOp workgroupOp,
-                             xilinx::AIE::DeviceOp deviceOp,
-                             xilinx::AIEX::RuntimeSequenceOp npuFuncOp,
-                             IRMapping &mapper, IRMapping &bindingsMapper) {
+LogicalResult AIEDeviceBuilder::workgroupToAIE(
+    AMDAIE::WorkgroupOp workgroupOp, xilinx::AIE::DeviceOp deviceOp,
+    xilinx::AIEX::RuntimeSequenceOp npuFuncOp) {
   OpBuilder::InsertionGuard guard(rewriter);
   Block *deviceBlock = &deviceOp.getRegion().front();
   Block *deviceCoreBlock = rewriter.createBlock(&deviceOp.getRegion());
   rewriter.setInsertionPoint(deviceBlock, deviceBlock->begin());
 
   // Walk all operations in the AIE region and convert to AIE ops
-  int dmaId = 0;
+  int bufferId{0};
+  int lockId{0};
+  int connectionIndex{0};
   WalkResult res = workgroupOp.walk<WalkOrder::PreOrder>([&](Operation *op) {
     return TypeSwitch<Operation *, WalkResult>(op)
         .Case<AMDAIE::BdIdOp>([&](auto bdIdOp) {
@@ -776,49 +888,78 @@ LogicalResult workgroupToAIE(IRRewriter &rewriter,
           // so don't convert to AIE dialect.
           return WalkResult::advance();
         })
+        .Case<AMDAIE::BufferOp>([&](auto bufferOp) {
+          if (failed(bufferToAIE(bufferOp, deviceBlock, bufferId))) {
+            return WalkResult::interrupt();
+          }
+          return WalkResult::advance();
+        })
+        .Case<AMDAIE::ChannelOp>([&](auto channelOp) {
+          // Channel ops are purely used for retrieving information in other ops
+          // so don't convert to AIE dialect.
+          return WalkResult::advance();
+        })
         .Case<AMDAIE::CircularDmaCpyNdOp>([&](auto dmaOp) {
           dmaOp.emitOpError()
               << "`amdaie.circular_dma_cpy_nd` unsupported in lowering to AIE";
           return WalkResult::interrupt();
         })
         .Case<AMDAIE::ConnectionOp>([&](auto dmaOp) {
-          if (failed(flowToAIE(rewriter, dmaOp, mapper, deviceBlock, dmaId))) {
+          if (failed(connectionToAIE(dmaOp, deviceBlock, connectionIndex))) {
             return WalkResult::interrupt();
           }
           return WalkResult::advance();
         })
         .Case<AMDAIE::ControlCodeOp>([&](auto controlCodeOp) {
-          if (failed(controlCodeToAie(rewriter, controlCodeOp, npuFuncOp,
-                                      mapper, bindingsMapper))) {
+          if (failed(controlCodeToAIE(controlCodeOp, npuFuncOp))) {
             controlCodeOp.emitError("could not convert to AIEDialect ops");
             return WalkResult::interrupt();
           }
           return WalkResult::skip();
         })
         .Case<AMDAIE::CoreOp>([&](auto coreOp) {
-          if (failed(coreToAIE(rewriter, coreOp, mapper, deviceOp,
-                               deviceCoreBlock))) {
+          if (failed(coreToAIE(coreOp, deviceOp, deviceCoreBlock))) {
             coreOp.emitError("could not convert to AIEDialect ops");
             return WalkResult::interrupt();
           }
           return WalkResult::skip();
         })
-        .Case<AMDAIE::LogicalObjectFifoLink>([&](auto linkOp) {
-          if (failed(linkToAIE(rewriter, linkOp, mapper, deviceBlock))) {
+        .Case<AMDAIE::LockOp>([&](auto lockOp) {
+          if (failed(lockToAIE(lockOp, deviceBlock, lockId))) {
+            return WalkResult::interrupt();
+          }
+          return WalkResult::advance();
+        })
+        .Case<AMDAIE::LogicalObjectFifoFromBuffersOp>([&](auto logicalObjFifo) {
+          if (failed(logicalObjFifoFromBuffersToAIE(logicalObjFifo,
+                                                    deviceBlock))) {
             return WalkResult::interrupt();
           }
           return WalkResult::advance();
         })
+        .Case<AMDAIE::LogicalObjectFifoPlaceholderOp>([&](auto logicalObjFifo) {
+          // Skip placeholder ops as they don't have an equivalent in the
+          // AIE dialect and shim dma allocations are created from
+          // connections directly currently.
+          return WalkResult::advance();
+        })
         .Case<AMDAIE::TileOp>([&](auto tileOp) {
-          if (failed(tileToAIE(rewriter, tileOp, mapper, deviceBlock))) {
+          if (failed(tileToAIE(tileOp, deviceBlock))) {
             return WalkResult::interrupt();
           }
           return WalkResult::advance();
         })
+        .Case<AMDAIE::WorkgroupOp>([&](auto workgroupOp) {
+          // Skip workgroup ops themselves.
+          return WalkResult::advance();
+        })
         .Default([&](Operation *op) {
           rewriter.setInsertionPointToEnd(deviceBlock);
           if (!isa_and_present<AMDAIEDialect>(op->getDialect())) {
             rewriter.clone(*op, mapper);
+          } else {
+            op->emitOpError() << "is unsupported in lowering to AIE dialect";
+            return WalkResult::interrupt();
           }
           return WalkResult::advance();
         });
@@ -838,8 +979,7 @@ LogicalResult workgroupToAIE(IRRewriter &rewriter,
 /// `AIE::DeviceOp` into the module for every encountered `FuncOp`, and then
 /// traverse the function build the AIE device operation and convert all AMDAIE
 /// dialect operations to AIE dialect operations.
-LogicalResult lowerToAIE(ModuleOp moduleOp) {
-  IRRewriter rewriter(moduleOp.getContext());
+LogicalResult AIEDeviceBuilder::lowerToAIE(ModuleOp moduleOp) {
   Block *moduleBlock = &moduleOp->getRegion(0).front();
 
   // Retrieve the AMDAIEDevice from the executable target attribute.
@@ -868,7 +1008,6 @@ LogicalResult lowerToAIE(ModuleOp moduleOp) {
     // of the aiex.runtime_sequence operation that replaces the
     // amdaie.controlcode. The HAL interface bindings are used to
     // order the function parameters correctly.
-    IRMapping bindingsMapper;
     SmallVector<IREE::HAL::InterfaceBindingSubspanOp> subspanOps;
     funcOp->walk([&](IREE::HAL::InterfaceBindingSubspanOp subspanOp) {
       subspanOps.push_back(subspanOp);
@@ -891,14 +1030,13 @@ LogicalResult lowerToAIE(ModuleOp moduleOp) {
     }
 
     // Walk the AIE regions ops and convert ops into pure AIEDialect ops.
-    IRMapping mapper;
+    // The IR mapping now lives in the `mapper` member of `AIEDeviceBuilder`.
     rewriter.setInsertionPointToStart(deviceBlock);
     WalkResult res = funcOp.walk<WalkOrder::PreOrder>([&](Operation *op) {
       if (isa<func::FuncOp, func::ReturnOp>(op)) {
         return WalkResult::advance();
       } else if (auto workgroupOp = dyn_cast<AMDAIE::WorkgroupOp>(op)) {
-        if (failed(workgroupToAIE(rewriter, workgroupOp, deviceOp, npuFuncOp,
-                                  mapper, bindingsMapper))) {
+        if (failed(workgroupToAIE(workgroupOp, deviceOp, npuFuncOp))) {
           return WalkResult::interrupt();
         }
         return WalkResult::skip();
@@ -915,7 +1053,7 @@ LogicalResult lowerToAIE(ModuleOp moduleOp) {
     rewriter.moveOpBefore(npuFuncOp, deviceBlock, deviceBlock->end());
     // After walking the FuncOp, it has been converted into a DeviceOp and can
     // safely be erased.
-    eraseOp(rewriter, mapper, funcOp);
+    eraseOp(funcOp);
     return WalkResult::advance();
   });
   if (funcRes.wasInterrupted()) return failure();
@@ -950,14 +1088,16 @@ class AMDAIELowerToAIEPass
     : public impl::AMDAIELowerToAIEBase<AMDAIELowerToAIEPass> {
  public:
   void getDependentDialects(DialectRegistry &registry) const override {
-    registry.insert<AMDAIEDialect, xilinx::AIE::AIEDialect,
-                    xilinx::AIEX::AIEXDialect>();
+    registry.insert<mlir::memref::MemRefDialect, AMDAIEDialect,
+                    xilinx::AIE::AIEDialect, xilinx::AIEX::AIEXDialect>();
   }
 
   void runOnOperation() override {
     // Main function call to convert all operations into AIE dialect
     // operations inside an AIE device.
-    if (failed(lowerToAIE(getOperation()))) return signalPassFailure();
+    ModuleOp moduleOp = getOperation();
+    AIEDeviceBuilder builder(moduleOp.getContext());
+    if (failed(builder.lowerToAIE(moduleOp))) return signalPassFailure();
   }
 };
 
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELowerToAIE.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELowerToAIE.h
new file mode 100644
index 000000000..871d0961f
--- /dev/null
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELowerToAIE.h
@@ -0,0 +1,132 @@
+// Copyright 2024 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the `AIEDeviceBuilder` class, which lowers the AMDAIE
+// dialect to the AIE and AIEX dialects.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef IREE_AMD_AIE_TRANSFORMS_AMDAIELOWERTOAIE_H_
+#define IREE_AMD_AIE_TRANSFORMS_AMDAIELOWERTOAIE_H_
+
+#include "aie/AIEDialect.h"
+#include "aie/AIEXDialect.h"
+#include "iree-amd-aie/IR/AMDAIEDialect.h"
+#include "iree-amd-aie/IR/AMDAIEOps.h"
+#include "llvm/ADT/DenseMap.h"
+#include "mlir/IR/IRMapping.h"
+#include "mlir/IR/PatternMatch.h"
+
+using namespace xilinx;
+
+namespace mlir::iree_compiler::AMDAIE {
+
+/// Class to build an `aie.device` from a `module` containing
+/// `amdaie.workgroup`.
+class AIEDeviceBuilder {
+ public:
+  explicit AIEDeviceBuilder(MLIRContext *ctx) : rewriter(ctx) {}
+
+  LogicalResult lowerToAIE(ModuleOp moduleOp);
+
+ private:
+  /// Core op conversion methods.
+  LogicalResult coreMemrefExtractStridedMetadataToAIE(
+      memref::ExtractStridedMetadataOp extractStridedMetadataOp,
+      SmallVector<Operation *> &toBeErased);
+  LogicalResult coreFuncCallOpToAIE(func::CallOp oldCallOp,
+                                    SmallVector<Operation *> &toBeErased);
+  LogicalResult coreUseLockToAIE(AMDAIE::UseLockOp useLockOp,
+                                 SmallVector<Operation *> &toBeErased);
+  LogicalResult coreToAIE(AMDAIE::CoreOp coreOp, AIE::DeviceOp deviceOp,
+                          Block *deviceCoreBlock);
+
+  /// Controlcode ops conversion methods.
+  LogicalResult npuDmaCpyNdOpToAIE(AMDAIE::NpuDmaCpyNdOp dmaOp,
+                                   SmallVector<Operation *> &toBeErased);
+  LogicalResult npuDmaWaitToAIE(AMDAIE::NpuDmaWaitOp waitOp,
+                                SmallVector<Operation *> &toBeErased);
+  LogicalResult controlCodeToAIE(AMDAIE::ControlCodeOp controlCodeOp,
+                                 xilinx::AIEX::RuntimeSequenceOp funcOp);
+
+  /// Workgroup ops conversion methods.
+  LogicalResult bufferToAIE(AMDAIE::BufferOp bufferOp, Block *deviceBlock,
+                            int &bufferId);
+  LogicalResult connectionToAIE(AMDAIE::ConnectionOp connectionOp,
+                                Block *deviceBlock, int &connectionIndex);
+  LogicalResult lockToAIE(AMDAIE::LockOp lockOp, Block *deviceBlock,
+                          int &lockIndex);
+  LogicalResult logicalObjFifoFromBuffersToAIE(
+      AMDAIE::LogicalObjectFifoFromBuffersOp logicalObjFifo,
+      Block *deviceBlock);
+  LogicalResult tileToAIE(AMDAIE::TileOp tileOp, Block *deviceBlock);
+  LogicalResult workgroupToAIE(AMDAIE::WorkgroupOp workgroupOp,
+                               xilinx::AIE::DeviceOp deviceOp,
+                               xilinx::AIEX::RuntimeSequenceOp npuFuncOp);
+
+  /// Utilities
+
+  /// Utility to convert vectors of `size` and `stride` into an
+  /// `AIE::BDDimLayoutArrayAttr`.
+  AIE::BDDimLayoutArrayAttr convertSizeStrideToBDDimLayoutArrayAttr(
+      const SmallVector<OpFoldResult> &sizes,
+      const SmallVector<OpFoldResult> &strides);
+
+  /// Utility to create DMA blocks and add them to `memOp`.
+  void createDMA(Operation *memOp, AIE::DMAChannelDir channelDir,
+                 int channelIndex, AIE::BDDimLayoutArrayAttr dims,
+                 size_t acqNum, size_t relNum, int64_t len, int64_t offset,
+                 const SmallVector<AIE::BufferOp> &bufferOps,
+                 const std::pair<AIE::LockOp, AIE::LockOp> &locks);
+
+  /// Utility to create `aie.shim_dma_allocation` ops and corresponding global
+  /// symbols.
+  AIE::ShimDMAAllocationOp createShimDmaAllocation(
+      Block *deviceBlock, AMDAIE::TileOp tileOp,
+      AIE::DMAChannelDir dmaChannelDir, uint8_t channel, MemRefType memrefType,
+      int &connectionIndex);
+
+  /// It is dangerous to erase ops with `rewriter` without erasing them from
+  /// `mapper` too, as addresses of Operations/Values can be reused, resulting
+  /// in unexpected key-value pairs in `mapper`. Use this utility if `mapper`
+  /// might be used after `op` is erased.
+  void eraseOp(Operation *op);
+
+  /// Utility to fold linear dims, unit dims and single dims in the provided
+  /// `offsets`, `sizes` and `strides` access patterns.
+  void foldDims(const SmallVector<OpFoldResult> &offsets,
+                const SmallVector<OpFoldResult> &sizes,
+                const SmallVector<OpFoldResult> &strides,
+                SmallVector<OpFoldResult> &newOffsets,
+                SmallVector<OpFoldResult> &newSizes,
+                SmallVector<OpFoldResult> &newStrides);
+
+  /// Utility to remap the provided operation's operands.
+  void remapOperands(Operation *op);
+
+  /// Members
+
+  IRRewriter rewriter;
+  IRMapping mapper;
+  /// Dedicated mapper for the HAL bindings.
+  IRMapping bindingsMapper;
+  /// Map from tile values to AIE memory op (`aie.mem` or `aie.memtile_dma`).
+  /// This is used to look up and add new DMA patterns to those memory ops.
+  DenseMap<Value, Operation *> tileToMemOpMap;
+  /// Map from connections to source and target AIE memory ops (`aie.mem` or
+  /// `aie.memtile_dma`, or `aie.shim_dma_allocation`). This is mainly used for
+  /// looking up the global symbols from `aie.shim_dma_allocation` ops needed
+  /// to create AIEX NPU ops.
+  DenseMap<AMDAIE::ConnectionOp,
+           std::pair<SmallVector<Operation *>, SmallVector<Operation *>>>
+      connectionToSourceTargetMemOps;
+};
+
+}  // namespace mlir::iree_compiler::AMDAIE
+
+#endif  // IREE_AMD_AIE_TRANSFORMS_AMDAIELOWERTOAIE_H_
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/CMakeLists.txt b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/CMakeLists.txt
index 002a9bcec..7fdad4b25 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/CMakeLists.txt
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/CMakeLists.txt
@@ -39,12 +39,14 @@ iree_cc_library(
     "Passes.h"
     "AMDAIECreateAIEWorkgroup.h"
     "AMDAIEDmaUtils.h"
+    "AMDAIELowerToAIE.h"
     "AMDAIEOpUtils.h"
     "AMDAIEUtils.h"
     "Transforms.h"
   SRCS
     "AMDAIEAccessToAcquireRelease.cpp"
     "AMDAIEAddLoweringStrategy.cpp"
+    "AMDAIEAcquireReleaseToUseLock.cpp"
     "AMDAIEAssignChannels.cpp"
     "AMDAIEAssignLogicalObjectFifoDepth.cpp"
     "AMDAIEAssignNpuDmaBdIds.cpp"
@@ -54,7 +56,6 @@ iree_cc_library(
     "AMDAIECombineStridedOps.cpp"
     "AMDAIEControlCodeLoopUnroll.cpp"
     "AMDAIEConvertCoreForallToFor.cpp"
-    "AMDAIECoreLoopUnroll.cpp"
     "AMDAIECreateAIEWorkgroup.cpp"
     "AMDAIECreateLogicalObjectFifoLink.cpp"
     "AMDAIECreateReferenceToAllocation.cpp"
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h
index 1e0ba9bfa..06172300d 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h
@@ -21,6 +21,7 @@ namespace mlir::iree_compiler::AMDAIE {
 
 #define GEN_PASS_DECL
 #define GEN_PASS_DEF_AMDAIEACCESSTOACQUIRERELEASE
+#define GEN_PASS_DEF_AMDAIEACQUIRERELEASETOUSELOCK
 #define GEN_PASS_DEF_AMDAIEASSIGNCHANNELS
 #define GEN_PASS_DEF_AMDAIEASSIGNLOGICALOBJECTFIFODEPTH
 #define GEN_PASS_DEF_AMDAIEASSIGNNPUDMABDIDS
@@ -32,7 +33,6 @@ namespace mlir::iree_compiler::AMDAIE {
 #define GEN_PASS_DEF_AMDAIECOMBINESTRIDEDOPS
 #define GEN_PASS_DEF_AMDAIECONTROLCODELOOPUNROLL
 #define GEN_PASS_DEF_AMDAIECONVERTCOREFORALLTOFOR
-#define GEN_PASS_DEF_AMDAIECORELOOPUNROLL
 #define GEN_PASS_DEF_AMDAIECREATEAIEWORKGROUP
 #define GEN_PASS_DEF_AMDAIECREATELOGICALOBJECTFIFOLINK
 #define GEN_PASS_DEF_AMDAIECREATEREFERENCETOALLOCATION
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp
index c1309c9b9..e2797cb6a 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp
@@ -142,6 +142,7 @@ static void addAMDAIEBufferizePasses(OpPassManager &pm) {
 }
 
 void addAMDAIEToAIEPasses(OpPassManager &passManager) {
+  passManager.addPass(createAMDAIEAcquireReleaseToUseLockPass());
   passManager.addPass(createAMDAIECanonicalizeNpuDmaCpyNdPass());
   passManager.addPass(createCanonicalizerPass());
   passManager.addPass(createAMDAIESinkIntoCorePass());
@@ -625,18 +626,19 @@ void addAMDAIEObjectFifoLoweringPasses(OpPassManager &passManager) {
   passManager.addPass(createCanonicalizerPass());
   passManager.addPass(createAMDAIEDmaCSEPass());
 
-  passManager.addPass(createAMDAIECreateLogicalObjectFifoLinkPass());
+  // passManager.addPass(createAMDAIECreateLogicalObjectFifoLinkPass());
   passManager.addPass(createAMDAIECanonicalizeDoublyStridedOpPass());
   passManager.addPass(createCanonicalizerPass());
 
   passManager.addPass(createAMDAIEConvertCoreForallToForPass());
   passManager.addPass(createCanonicalizerPass());
-  passManager.addPass(createAMDAIECoreLoopUnrollPass());
 
   passManager.addPass(createAMDAIEAssignChannelsPass());
   passManager.addPass(createCSEPass());
   passManager.addPass(createCanonicalizerPass());
 
+  passManager.addPass(createAMDAIEObjFifoBufferizationPass());
+
   addAMDAIEToAIEPasses(passManager);
 
   // Now lower using the AIE passes from MLIR-AIE.
@@ -819,10 +821,8 @@ void addMLIRAIRLoweringPasses(OpPassManager &passManager, AMDAIEDevice device) {
 void addMLIRAIELoweringPasses(OpPassManager &passManager) {
   {
     OpPassManager &devicePM = passManager.nest<xilinx::AIE::DeviceOp>();
-    devicePM.addPass(createAMDAIEObjectFifoStatefulTransformPass());
     devicePM.addPass(createCanonicalizerPass());
     devicePM.addPass(createAMDAIEDmaToNpuPass());
-    devicePM.addPass(createAMDAIEAssignLockIDsPass());
     devicePM.addPass(createAMDAIEAssignBufferDescriptorIDsPass());
     devicePM.addPass(createAMDAIEAssignBufferAddressesBasicPass());
     devicePM.addPass(createAMDAIEPathfinderPass());
@@ -839,7 +839,6 @@ void addMLIRAIELoweringPasses(OpPassManager &passManager) {
     devicePM.addPass(createAMDAIENormalizeAddressSpacesPass());
     devicePM.addPass(createCanonicalizerPass());
   }
-
 }
 
 // NOTE: this runs on the top-level program module containing all hal.executable
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h
index c3867d009..8039fe5a2 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h
@@ -53,6 +53,10 @@ void buildAMDAIELinkingPassPipeline(OpPassManager &passManager);
 /// semaphore operations.
 std::unique_ptr<Pass> createAMDAIEAccessToAcquireReleasePass();
 
+/// Create a pass to convert logical objectFifo acquire/release ops to
+/// `amdaie.use_lock` ops.
+std::unique_ptr<Pass> createAMDAIEAcquireReleaseToUseLockPass();
+
 /// Create a pass to assign channels to connections.
 std::unique_ptr<Pass> createAMDAIEAssignChannelsPass();
 
@@ -106,10 +110,6 @@ std::unique_ptr<Pass> createAMDAIECleanupPass();
 /// are compatible.
 std::unique_ptr<Pass> createAMDAIECombineStridedOpsPass();
 
-/// Create a pass to unroll `scf.for` with synchronization ops based on
-/// objectFifo buffer depths.
-std::unique_ptr<Pass> createAMDAIECoreLoopUnrollPass();
-
 /// Create a pass decomposing iree_linalg_ext.pack and unpack ops to AIR
 /// dialect.
 std::unique_ptr<Pass> createAMDAIEDecomposeLinalgExtPackUnPackToAIRPass();
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td
index 339ed9651..f1d7f54d4 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td
@@ -17,6 +17,12 @@ def AMDAIEAccessToAcquireRelease :
   let constructor = "mlir::iree_compiler::AMDAIE::createAMDAIEAccessToAcquireReleasePass()";
 }
 
+def AMDAIEAcquireReleaseToUseLock :
+    Pass<"iree-amdaie-acquire-release-to-use-lock", ""> {
+  let summary = "Convert acquire/release synchronization stubs to `amdaie.use_lock`";
+  let constructor = "mlir::iree_compiler::AMDAIE::createAMDAIEAcquireReleaseToUseLockPass()";
+}
+
 def AMDAIEAssignChannels :
     Pass<"iree-amdaie-assign-channels", ""> {
   let summary = "Assign channels to `amdaie.connection` ops.";
@@ -128,13 +134,6 @@ def AMDAIEConvertCoreForallToFor :
   let constructor = "mlir::iree_compiler::AMDAIE::createAMDAIEConvertCoreForallToForPass()";
 }
 
-def AMDAIECoreLoopUnroll :
-    Pass<"iree-amdaie-core-loop-unroll", ""> {
-  let summary = "Within core ops, unroll `scf.for` with synchronization ops based on "
-                "objectFifo buffer depths.";
-  let constructor = "mlir::iree_compiler::AMDAIE::createAMDAIECoreLoopUnrollPass()";
-}
-
 def AMDAIECreateAIEWorkgroup :
   Pass<"iree-amdaie-create-aie-workgroup", "func::FuncOp"> {
   let summary = "Creates a single AIE workgroup.";
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt
index 61071df29..8a86c6e82 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt
@@ -10,6 +10,7 @@ iree_lit_test_suite(
   SRCS
     "access_to_acquire_release.mlir"
     "aie_link_executables.mlir"
+    "acquire_release_to_use_lock.mlir"
     "assign_channels.mlir"
     "assign_logical_objectfifo_depth.mlir"
     "assign_npu_dma_bd_ids.mlir"
@@ -20,7 +21,6 @@ iree_lit_test_suite(
     "combine_strided_ops.mlir"
     "controlcode_loop_unrolling.mlir"
     "convert_core_forall_to_for.mlir"
-    "core_loop_unroll.mlir"
     "create_aie_workgroup.mlir"
     "create_logical_objectfifo_link.mlir"
     "create_reference_to_allocation.mlir"
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/acquire_release_to_use_lock.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/acquire_release_to_use_lock.mlir
new file mode 100644
index 000000000..7da636291
--- /dev/null
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/acquire_release_to_use_lock.mlir
@@ -0,0 +1,214 @@
+// RUN: iree-opt --pass-pipeline="builtin.module(func.func(iree-amdaie-acquire-release-to-use-lock,canonicalize,cse))" --split-input-file %s | FileCheck %s
+
+// CHECK-LABEL: @depth_1
+// CHECK-DAG:   %[[C0:.+]] = arith.constant 0 : index
+// CHECK-DAG:   %[[C1:.+]] = arith.constant 1 : index
+// CHECK-DAG:   %[[C2:.+]] = arith.constant 2 : index
+// CHECK-DAG:   %[[C4:.+]] = arith.constant 4 : index
+// CHECK-DAG:   %[[TILE_0_2:.+]] = amdaie.tile(%[[C0]], %[[C2]])
+// CHECK:       %[[BUFFER:.+]] = amdaie.buffer(%[[TILE_0_2]]) : memref<1024xi32, 2 : i32>
+// CHECK:       %[[LOCK:.+]] = amdaie.lock(%[[TILE_0_2]](0))
+// CHECK:       %[[LOCK_1:.+]] = amdaie.lock(%[[TILE_0_2]](1))
+// CHECK:       amdaie.core
+// CHECK:         scf.for %{{.+}} = %[[C0]] to %[[C4]] step %[[C1]] {
+// CHECK:           amdaie.use_lock(%[[LOCK]], AcquireGreaterOrEqual(1))
+// CHECK:           memref.reinterpret_cast %[[BUFFER]]
+// CHECK:           linalg.fill
+// CHECK:           amdaie.use_lock(%[[LOCK_1]], Release(1))
+// CHECK:         }
+#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}>
+module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} {
+  func.func @depth_1() {
+    amdaie.workgroup {
+      %c0_i32 = arith.constant 0 : i32
+      %c0 = arith.constant 0 : index
+      %c1 = arith.constant 1 : index
+      %c2 = arith.constant 2 : index
+      %c4 = arith.constant 4 : index
+      %tile = amdaie.tile(%c0, %c1)
+      %tile_0 = amdaie.tile(%c0, %c2)
+      %buffer = amdaie.buffer(%tile) : memref<1024xi32, 1 : i32>
+      %lock = amdaie.lock(%tile(0))
+      %lock_2 = amdaie.lock(%tile(1))
+      %buffer_1 = amdaie.buffer(%tile_0) : memref<1024xi32, 2 : i32>
+      %lock_5 = amdaie.lock(%tile_0(0))
+      %lock_6 = amdaie.lock(%tile_0(1))
+      %0 = amdaie.logicalobjectfifo.from_buffers({%buffer}, {%lock}, {%lock_2}) : memref<1024xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1024xi32, 1 : i32>, 1>
+      %1 = amdaie.logicalobjectfifo.from_buffers({%buffer_1}, {%lock_5}, {%lock_6}) : memref<1024xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1024xi32, 2 : i32>, 1>
+      %2 = amdaie.connection(%0, %1) : (!amdaie.logicalobjectfifo<memref<1024xi32, 1 : i32>, 1>, !amdaie.logicalobjectfifo<memref<1024xi32, 2 : i32>, 1>)
+      %3 = amdaie.core(%tile_0, in : [], out : [%2]) {
+        scf.for %arg0 = %c0 to %c4 step %c1 {
+          %4 = amdaie.logicalobjectfifo.acquire(%2, Produce) {size = 1 : i32} -> !amdaie.logicalobjectfifo<memref<1024xi32, 2 : i32>>
+          %5 = amdaie.logicalobjectfifo.access(%4, Write) : !amdaie.logicalobjectfifo<memref<1024xi32, 2 : i32>> -> memref<1024xi32, 2 : i32>
+          %reinterpret_cast = memref.reinterpret_cast %5 to offset: [0], sizes: [32, 32], strides: [32, 1] : memref<1024xi32, 2 : i32> to memref<32x32xi32, 2 : i32>
+          linalg.fill ins(%c0_i32 : i32) outs(%reinterpret_cast : memref<32x32xi32, 2 : i32>)
+          amdaie.logicalobjectfifo.release(%2, Produce) {size = 1 : i32}
+        }
+        amdaie.end
+      }
+      amdaie.controlcode {
+        amdaie.end
+      }
+    }
+    return
+  }
+}
+
+// -----
+
+// CHECK-LABEL: @depth_2
+// CHECK-DAG:   %[[C0:.+]] = arith.constant 0 : index
+// CHECK-DAG:   %[[C1:.+]] = arith.constant 1 : index
+// CHECK-DAG:   %[[C2:.+]] = arith.constant 2 : index
+// CHECK-DAG:   %[[C4:.+]] = arith.constant 4 : index
+// CHECK-DAG:   %[[TILE_0_2:.+]] = amdaie.tile(%[[C0]], %[[C2]])
+// CHECK:       %[[BUFFER:.+]] = amdaie.buffer(%[[TILE_0_2]]) : memref<1024xi32, 2 : i32>
+// CHECK:       %[[BUFFER_1:.+]] = amdaie.buffer(%[[TILE_0_2]]) : memref<1024xi32, 2 : i32>
+// CHECK:       %[[LOCK:.+]] = amdaie.lock(%[[TILE_0_2]](0))
+// CHECK:       %[[LOCK_1:.+]] = amdaie.lock(%[[TILE_0_2]](1))
+// CHECK:       amdaie.core
+// CHECK:         scf.for %{{.+}} = %[[C0]] to %[[C4]] step %[[C2]] {
+// CHECK:           amdaie.use_lock(%[[LOCK]], AcquireGreaterOrEqual(1))
+// CHECK:           memref.reinterpret_cast %[[BUFFER]]
+// CHECK:           linalg.fill
+// CHECK:           amdaie.use_lock(%[[LOCK_1]], Release(1))
+// CHECK:           amdaie.use_lock(%[[LOCK]], AcquireGreaterOrEqual(1))
+// CHECK:           memref.reinterpret_cast %[[BUFFER_1]]
+// CHECK:           amdaie.use_lock(%[[LOCK_1]], Release(1))
+// CHECK:         }
+#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}>
+module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} {
+  func.func @depth_2() {
+    amdaie.workgroup {
+      %c0_i32 = arith.constant 0 : i32
+      %c0 = arith.constant 0 : index
+      %c1 = arith.constant 1 : index
+      %c2 = arith.constant 2 : index
+      %c4 = arith.constant 4 : index
+      %tile = amdaie.tile(%c0, %c1)
+      %tile_0 = amdaie.tile(%c0, %c2)
+      %buffer = amdaie.buffer(%tile) : memref<1024xi32, 1 : i32>
+      %buffer_1 = amdaie.buffer(%tile) : memref<1024xi32, 1 : i32>
+      %lock = amdaie.lock(%tile(0))
+      %lock_2 = amdaie.lock(%tile(1))
+      %buffer_3 = amdaie.buffer(%tile_0) : memref<1024xi32, 2 : i32>
+      %buffer_4 = amdaie.buffer(%tile_0) : memref<1024xi32, 2 : i32>
+      %lock_5 = amdaie.lock(%tile_0(0))
+      %lock_6 = amdaie.lock(%tile_0(1))
+      %0 = amdaie.logicalobjectfifo.from_buffers({%buffer, %buffer_1}, {%lock}, {%lock_2}) : memref<1024xi32, 1 : i32>, memref<1024xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1024xi32, 1 : i32>, 2>
+      %1 = amdaie.logicalobjectfifo.from_buffers({%buffer_3, %buffer_4}, {%lock_5}, {%lock_6}) : memref<1024xi32, 2 : i32>, memref<1024xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1024xi32, 2 : i32>, 2>
+      %2 = amdaie.connection(%0, %1) : (!amdaie.logicalobjectfifo<memref<1024xi32, 1 : i32>, 2>, !amdaie.logicalobjectfifo<memref<1024xi32, 2 : i32>, 2>)
+      %3 = amdaie.core(%tile_0, in : [], out : [%2]) {
+        scf.for %arg0 = %c0 to %c4 step %c1 {
+          %4 = amdaie.logicalobjectfifo.acquire(%2, Produce) {size = 1 : i32} -> !amdaie.logicalobjectfifo<memref<1024xi32, 2 : i32>>
+          %5 = amdaie.logicalobjectfifo.access(%4, Write) : !amdaie.logicalobjectfifo<memref<1024xi32, 2 : i32>> -> memref<1024xi32, 2 : i32>
+          %reinterpret_cast = memref.reinterpret_cast %5 to offset: [0], sizes: [32, 32], strides: [32, 1] : memref<1024xi32, 2 : i32> to memref<32x32xi32, 2 : i32>
+          linalg.fill ins(%c0_i32 : i32) outs(%reinterpret_cast : memref<32x32xi32, 2 : i32>)
+          amdaie.logicalobjectfifo.release(%2, Produce) {size = 1 : i32}
+        }
+        amdaie.end
+      }
+      amdaie.controlcode {
+        amdaie.end
+      }
+    }
+    return
+  }
+}
+
+// -----
+
+// CHECK-LABEL: @depth_4
+// CHECK-DAG:   %[[C0:.+]] = arith.constant 0 : index
+// CHECK-DAG:   %[[C1:.+]] = arith.constant 1 : index
+// CHECK-DAG:   %[[C2:.+]] = arith.constant 2 : index
+// CHECK-DAG:   %[[C21:.+]] = arith.constant 21 : index
+// CHECK-DAG:   %[[TILE_0_2:.+]] = amdaie.tile(%[[C0]], %[[C2]])
+// CHECK:       %[[BUFFER:.+]] = amdaie.buffer(%[[TILE_0_2]]) : memref<1024xi32, 2 : i32>
+// CHECK:       %[[BUFFER_1:.+]] = amdaie.buffer(%[[TILE_0_2]]) : memref<1024xi32, 2 : i32>
+// CHECK:       %[[BUFFER_2:.+]] = amdaie.buffer(%[[TILE_0_2]]) : memref<1024xi32, 2 : i32>
+// CHECK:       %[[BUFFER_3:.+]] = amdaie.buffer(%[[TILE_0_2]]) : memref<1024xi32, 2 : i32>
+// CHECK:       %[[LOCK:.+]] = amdaie.lock(%[[TILE_0_2]](0))
+// CHECK:       %[[LOCK_1:.+]] = amdaie.lock(%[[TILE_0_2]](1))
+// CHECK:       amdaie.core
+// CHECK-DAG:     %[[C4:.+]] = arith.constant 4 : index
+// CHECK-DAG:     %[[C6:.+]] = arith.constant 6 : index
+// CHECK-DAG:     %[[C8:.+]] = arith.constant 8 : index
+// CHECK-DAG:     %[[C17:.+]] = arith.constant 17 : index
+// CHECK:         scf.for %[[ARG0:.+]] = %[[C1]] to %[[C17]] step %[[C8]] {
+// CHECK:           amdaie.use_lock(%[[LOCK]], AcquireGreaterOrEqual(1))
+// CHECK:           memref.reinterpret_cast %[[BUFFER]]
+// CHECK:           index_cast
+// CHECK:           linalg.fill
+// CHECK:           amdaie.use_lock(%[[LOCK_1]], Release(1))
+// CHECK:           arith.addi %[[ARG0]], %[[C2]] : index
+// CHECK:           amdaie.use_lock(%[[LOCK]], AcquireGreaterOrEqual(1))
+// CHECK:           memref.reinterpret_cast %[[BUFFER_1]]
+// CHECK:           index_cast
+// CHECK:           linalg.fill
+// CHECK:           amdaie.use_lock(%[[LOCK_1]], Release(1))
+// CHECK:           arith.addi %[[ARG0]], %[[C4]] : index
+// CHECK:           amdaie.use_lock(%[[LOCK]], AcquireGreaterOrEqual(1))
+// CHECK:           memref.reinterpret_cast %[[BUFFER_2]]
+// CHECK:           index_cast
+// CHECK:           linalg.fill
+// CHECK:           amdaie.use_lock(%[[LOCK_1]], Release(1))
+// CHECK:           arith.addi %[[ARG0]], %[[C6]] : index
+// CHECK:           amdaie.use_lock(%[[LOCK]], AcquireGreaterOrEqual(1))
+// CHECK:           memref.reinterpret_cast %[[BUFFER_3]]
+// CHECK:           index_cast
+// CHECK:           linalg.fill
+// CHECK:           amdaie.use_lock(%[[LOCK_1]], Release(1))
+// CHECK:         }
+// CHECK:         scf.for %[[ARG1:.+]] = %[[C17]] to %[[C21]] step %[[C2]] {
+// CHECK:           amdaie.use_lock(%[[LOCK]], AcquireGreaterOrEqual(1))
+// CHECK:           memref.reinterpret_cast %[[BUFFER]]
+// CHECK:           index_cast %[[ARG1]]
+// CHECK:           linalg.fill
+// CHECK:           amdaie.use_lock(%[[LOCK_1]], Release(1))
+// CHECK:         }
+#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}>
+module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} {
+  func.func @depth_4() {
+    amdaie.workgroup {
+      %c0_i32 = arith.constant 0 : i32
+      %c0 = arith.constant 0 : index
+      %c1 = arith.constant 1 : index
+      %c2 = arith.constant 2 : index
+      %c4 = arith.constant 4 : index
+      %c21 = arith.constant 21 : index
+      %tile = amdaie.tile(%c0, %c1)
+      %tile_0 = amdaie.tile(%c0, %c2)
+      %buffer = amdaie.buffer(%tile) : memref<1024xi32, 1 : i32>
+      %buffer_1 = amdaie.buffer(%tile) : memref<1024xi32, 1 : i32>
+      %buffer_2 = amdaie.buffer(%tile) : memref<1024xi32, 1 : i32>
+      %buffer_3 = amdaie.buffer(%tile) : memref<1024xi32, 1 : i32>
+      %lock = amdaie.lock(%tile(0))
+      %lock_2 = amdaie.lock(%tile(1))
+      %buffer_4 = amdaie.buffer(%tile_0) : memref<1024xi32, 2 : i32>
+      %buffer_5 = amdaie.buffer(%tile_0) : memref<1024xi32, 2 : i32>
+      %buffer_6 = amdaie.buffer(%tile_0) : memref<1024xi32, 2 : i32>
+      %buffer_7 = amdaie.buffer(%tile_0) : memref<1024xi32, 2 : i32>
+      %lock_5 = amdaie.lock(%tile_0(0))
+      %lock_6 = amdaie.lock(%tile_0(1))
+      %0 = amdaie.logicalobjectfifo.from_buffers({%buffer, %buffer_1, %buffer_2, %buffer_3}, {%lock}, {%lock_2}) : memref<1024xi32, 1 : i32>, memref<1024xi32, 1 : i32>, memref<1024xi32, 1 : i32>, memref<1024xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1024xi32, 1 : i32>, 4>
+      %1 = amdaie.logicalobjectfifo.from_buffers({%buffer_4, %buffer_5, %buffer_6, %buffer_7}, {%lock_5}, {%lock_6}) : memref<1024xi32, 2 : i32>, memref<1024xi32, 2 : i32>, memref<1024xi32, 2 : i32>, memref<1024xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1024xi32, 2 : i32>, 4>
+      %2 = amdaie.connection(%0, %1) : (!amdaie.logicalobjectfifo<memref<1024xi32, 1 : i32>, 4>, !amdaie.logicalobjectfifo<memref<1024xi32, 2 : i32>, 4>)
+      %3 = amdaie.core(%tile_0, in : [], out : [%2]) {
+        scf.for %arg0 = %c1 to %c21 step %c2 {
+          %4 = amdaie.logicalobjectfifo.acquire(%2, Produce) {size = 1 : i32} -> !amdaie.logicalobjectfifo<memref<1024xi32, 2 : i32>>
+          %5 = amdaie.logicalobjectfifo.access(%4, Write) : !amdaie.logicalobjectfifo<memref<1024xi32, 2 : i32>> -> memref<1024xi32, 2 : i32>
+          %6 = memref.reinterpret_cast %5 to offset: [0], sizes: [32, 32], strides: [32, 1] : memref<1024xi32, 2 : i32> to memref<32x32xi32, 2 : i32>
+          %c = arith.index_cast %arg0 : index to i32
+          linalg.fill ins(%c : i32) outs(%6 : memref<32x32xi32, 2 : i32>)
+          amdaie.logicalobjectfifo.release(%2, Produce) {size = 1 : i32}
+        }
+        amdaie.end
+      }
+      amdaie.controlcode {
+        amdaie.end
+      }
+    }
+    return
+  }
+}
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/core_loop_unroll.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/core_loop_unroll.mlir
deleted file mode 100644
index 4bb7ccd22..000000000
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/core_loop_unroll.mlir
+++ /dev/null
@@ -1,181 +0,0 @@
-// RUN: iree-opt --pass-pipeline="builtin.module(func.func(iree-amdaie-core-loop-unroll,canonicalize))" --split-input-file %s | FileCheck %s
-
-// No change for depth 1.
-
-// CHECK-LABEL: @depth_1
-// CHECK-DAG:   %[[C0:.+]] = arith.constant 0 : index
-// CHECK-DAG:   %[[C1:.+]] = arith.constant 1 : index
-// CHECK-DAG:   %[[C4:.+]] = arith.constant 4 : index
-// CHECK-DAG:   amdaie.core
-// CHECK:         scf.for %{{.+}} = %[[C0]] to %[[C4]] step %[[C1]] {
-// CHECK:           amdaie.logicalobjectfifo.acquire
-// CHECK:           amdaie.logicalobjectfifo.access
-// CHECK:           memref.reinterpret_cast
-// CHECK:           linalg.fill
-// CHECK:         }
-#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}>
-module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} {
-  func.func @depth_1() {
-    amdaie.workgroup {
-      %c0_i32 = arith.constant 0 : i32
-      %c0 = arith.constant 0 : index
-      %c1 = arith.constant 1 : index
-      %c2 = arith.constant 2 : index
-      %c4 = arith.constant 4 : index
-      %tile_0_1 = amdaie.tile(%c0, %c1)
-      %tile_0_2 = amdaie.tile(%c0, %c2)
-      %alloc_1 = memref.alloc() : memref<32x32xi32, 1>
-      %alloc_2 = memref.alloc() : memref<4x8x4x8xi32, 2>
-      %obj0 = amdaie.logicalobjectfifo.from_memref %alloc_1, {%tile_0_1} : memref<32x32xi32, 1> -> !amdaie.logicalobjectfifo<memref<1024xi32, 1>>
-      %obj1 = amdaie.logicalobjectfifo.from_memref %alloc_2, {%tile_0_2} : memref<4x8x4x8xi32, 2> -> !amdaie.logicalobjectfifo<memref<1024xi32, 2>>
-      %dma0 = amdaie.circular_dma_cpy_nd(%obj0[] [] [], %obj1[] [] []) : (!amdaie.logicalobjectfifo<memref<1024xi32, 1>>, !amdaie.logicalobjectfifo<memref<1024xi32, 2>>)
-      %core_0_0 = amdaie.core(%tile_0_2, in : [], out : [%dma0]) {
-        scf.for %arg0 = %c0 to %c4 step %c1 {
-          %0 = amdaie.logicalobjectfifo.acquire(%dma0, Produce) {size = 1 : i32} -> !amdaie.logicalobjectfifo<memref<1024xi32, 2>>
-          %1 = amdaie.logicalobjectfifo.access(%0, Write) : !amdaie.logicalobjectfifo<memref<1024xi32, 2>> -> memref<1024xi32, 2>
-          %2 = memref.reinterpret_cast %1 to offset: [0], sizes: [32, 32], strides: [32, 1] : memref<1024xi32, 2> to memref<32x32xi32, 2>
-          linalg.fill ins(%c0_i32 : i32) outs(%2 : memref<32x32xi32, 2>)
-        }
-        amdaie.end
-      }
-      memref.dealloc %alloc_2 : memref<4x8x4x8xi32, 2>
-      memref.dealloc %alloc_1 : memref<32x32xi32, 1>
-      amdaie.controlcode {
-        amdaie.end
-      }
-    }
-    return
-  }
-}
-
-// -----
-
-// CHECK-LABEL: @depth_2
-// CHECK-DAG:   %[[C0:.+]] = arith.constant 0 : index
-// CHECK-DAG:   %[[C4:.+]] = arith.constant 4 : index
-// CHECK-DAG:   %[[C8:.+]] = arith.constant 8 : index
-// CHECK-DAG:   amdaie.core
-// CHECK:         scf.for %{{.+}} = %[[C0]] to %[[C8]] step %[[C4]] {
-// CHECK:           amdaie.logicalobjectfifo.acquire
-// CHECK:           amdaie.logicalobjectfifo.access
-// CHECK:           memref.reinterpret_cast
-// CHECK:           linalg.fill
-// CHECK:           amdaie.logicalobjectfifo.acquire
-// CHECK:           amdaie.logicalobjectfifo.access
-// CHECK:           memref.reinterpret_cast
-// CHECK:           linalg.fill
-// CHECK:         }
-#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}>
-module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} {
-  func.func @depth_2() {
-    amdaie.workgroup {
-      %c0_i32 = arith.constant 0 : i32
-      %c0 = arith.constant 0 : index
-      %c1 = arith.constant 1 : index
-      %c2 = arith.constant 2 : index
-      %c8 = arith.constant 8 : index
-      %tile_0_1 = amdaie.tile(%c0, %c1)
-      %tile_0_2 = amdaie.tile(%c0, %c2)
-      %alloc_1 = memref.alloc() : memref<32x32xi32, 1>
-      %alloc_2 = memref.alloc() : memref<4x8x4x8xi32, 2>
-      %obj0 = amdaie.logicalobjectfifo.from_memref %alloc_1, {%tile_0_1} : memref<32x32xi32, 1> -> !amdaie.logicalobjectfifo<memref<1024xi32, 1>, 2>
-      %obj1 = amdaie.logicalobjectfifo.from_memref %alloc_2, {%tile_0_2} : memref<4x8x4x8xi32, 2> -> !amdaie.logicalobjectfifo<memref<1024xi32, 2>, 2>
-      %dma0 = amdaie.circular_dma_cpy_nd(%obj0[] [] [], %obj1[] [] []) : (!amdaie.logicalobjectfifo<memref<1024xi32, 1>, 2>, !amdaie.logicalobjectfifo<memref<1024xi32, 2>, 2>)
-      %core_0_0 = amdaie.core(%tile_0_2, in : [], out : [%dma0]) {
-        scf.for %arg0 = %c0 to %c8 step %c2 {
-          %0 = amdaie.logicalobjectfifo.acquire(%dma0, Produce) {size = 1 : i32} -> !amdaie.logicalobjectfifo<memref<1024xi32, 2>, 2>
-          %1 = amdaie.logicalobjectfifo.access(%0, Write) : !amdaie.logicalobjectfifo<memref<1024xi32, 2>, 2> -> memref<1024xi32, 2>
-          %2 = memref.reinterpret_cast %1 to offset: [0], sizes: [32, 32], strides: [32, 1] : memref<1024xi32, 2> to memref<32x32xi32, 2>
-          linalg.fill ins(%c0_i32 : i32) outs(%2 : memref<32x32xi32, 2>)
-        }
-        amdaie.end
-      }
-      memref.dealloc %alloc_2 : memref<4x8x4x8xi32, 2>
-      memref.dealloc %alloc_1 : memref<32x32xi32, 1>
-      amdaie.controlcode {
-        amdaie.end
-      }
-    }
-    return
-  }
-}
-
-// -----
-
-// CHECK-LABEL: @depth_4
-// CHECK-DAG:   %[[C0:.+]] = arith.constant 0 : index
-// CHECK-DAG:   %[[C1:.+]] = arith.constant 1 : index
-// CHECK-DAG:   %[[C2:.+]] = arith.constant 2 : index
-// CHECK-DAG:   %[[C4:.+]] = arith.constant 4 : index
-// CHECK-DAG:   %[[C6:.+]] = arith.constant 6 : index
-// CHECK-DAG:   %[[C8:.+]] = arith.constant 8 : index
-// CHECK-DAG:   %[[C17:.+]] = arith.constant 17 : index
-// CHECK-DAG:   %[[C21:.+]] = arith.constant 21 : index
-// CHECK-DAG:   amdaie.core
-// CHECK:         scf.for %[[ARG0:.+]] = %[[C1]] to %[[C17]] step %[[C8]] {
-// CHECK:           amdaie.logicalobjectfifo.acquire
-// CHECK:           amdaie.logicalobjectfifo.access
-// CHECK:           memref.reinterpret_cast
-// CHECK:           index_cast
-// CHECK:           linalg.fill
-// CHECK:           arith.addi %[[ARG0]], %[[C2]] : index
-// CHECK:           amdaie.logicalobjectfifo.acquire
-// CHECK:           amdaie.logicalobjectfifo.access
-// CHECK:           memref.reinterpret_cast
-// CHECK:           index_cast
-// CHECK:           linalg.fill
-// CHECK:           arith.addi %[[ARG0]], %[[C4]] : index
-// CHECK:           amdaie.logicalobjectfifo.acquire
-// CHECK:           amdaie.logicalobjectfifo.access
-// CHECK:           memref.reinterpret_cast
-// CHECK:           index_cast
-// CHECK:           linalg.fill
-// CHECK:           arith.addi %[[ARG0]], %[[C6]] : index
-// CHECK:           amdaie.logicalobjectfifo.acquire
-// CHECK:           amdaie.logicalobjectfifo.access
-// CHECK:           memref.reinterpret_cast
-// CHECK:           index_cast
-// CHECK:           linalg.fill
-// CHECK:         }
-// CHECK:         scf.for %[[ARG1:.+]] = %[[C17]] to %[[C21]] step %[[C2]] {
-// CHECK:           amdaie.logicalobjectfifo.acquire
-// CHECK:           amdaie.logicalobjectfifo.access
-// CHECK:           memref.reinterpret_cast
-// CHECK:           index_cast %[[ARG1]]
-// CHECK:           linalg.fill
-// CHECK:         }
-#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}>
-module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} {
-  func.func @depth_4() {
-    amdaie.workgroup {
-      %c0 = arith.constant 0 : index
-      %c1 = arith.constant 1 : index
-      %c2 = arith.constant 2 : index
-      %c4 = arith.constant 4 : index
-      %c16 = arith.constant 21 : index
-      %tile_0_1 = amdaie.tile(%c0, %c1)
-      %tile_0_2 = amdaie.tile(%c0, %c2)
-      %alloc_1 = memref.alloc() : memref<32x32xi32, 1>
-      %alloc_2 = memref.alloc() : memref<4x8x4x8xi32, 2>
-      %obj0 = amdaie.logicalobjectfifo.from_memref %alloc_1, {%tile_0_1} : memref<32x32xi32, 1> -> !amdaie.logicalobjectfifo<memref<1024xi32, 1>, 4>
-      %obj1 = amdaie.logicalobjectfifo.from_memref %alloc_2, {%tile_0_2} : memref<4x8x4x8xi32, 2> -> !amdaie.logicalobjectfifo<memref<1024xi32, 2>, 4>
-      %dma0 = amdaie.circular_dma_cpy_nd(%obj0[] [] [], %obj1[] [] []) : (!amdaie.logicalobjectfifo<memref<1024xi32, 1>, 4>, !amdaie.logicalobjectfifo<memref<1024xi32, 2>, 4>)
-      %core_0_0 = amdaie.core(%tile_0_2, in : [], out : [%dma0]) {
-        scf.for %arg0 = %c1 to %c16 step %c2 {
-          %0 = amdaie.logicalobjectfifo.acquire(%dma0, Produce) {size = 1 : i32} -> !amdaie.logicalobjectfifo<memref<1024xi32, 2>, 4>
-          %1 = amdaie.logicalobjectfifo.access(%0, Write) : !amdaie.logicalobjectfifo<memref<1024xi32, 2>, 4> -> memref<1024xi32, 2>
-          %2 = memref.reinterpret_cast %1 to offset: [0], sizes: [32, 32], strides: [32, 1] : memref<1024xi32, 2> to memref<32x32xi32, 2>
-          %c = arith.index_cast %arg0 : index to i32
-          linalg.fill ins(%c : i32) outs(%2 : memref<32x32xi32, 2>)
-        }
-        amdaie.end
-      }
-      memref.dealloc %alloc_2 : memref<4x8x4x8xi32, 2>
-      memref.dealloc %alloc_1 : memref<32x32xi32, 1>
-      amdaie.controlcode {
-        amdaie.end
-      }
-    }
-    return
-  }
-}
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/lower_to_aie.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/lower_to_aie.mlir
index 45cbb3506..49e52c6e1 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/lower_to_aie.mlir
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/lower_to_aie.mlir
@@ -25,6 +25,10 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb}
 
 // -----
 
+//===----------------------------------------------------------------------===//
+// Workgroup tests
+//===----------------------------------------------------------------------===//
+
 // CHECK: module
 // CHECK: aie.device
 // CHECK: aiex.runtime_sequence @workgroup
@@ -68,23 +72,15 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb}
 
 // -----
 
-// NOTE: Due to an AIE check that verifies whether aie.objectfifo is linked correctly,
-// this test checks two `amdaie.connection` operations, so they can be linked
-// correctly.
-//
-// CHECK:       aie.device
-// CHECK-DAG:   %[[TILE_0_2:.+]] = aie.tile(0, 2)
-// CHECK-DAG:   %[[TILE_0_1:.+]] = aie.tile(0, 1)
-// CHECK-DAG:   %[[TILE_0_0:.+]] = aie.tile(0, 0)
-// CHECK:       aie.objectfifo @[[OBJ0:.+]](%[[TILE_0_2]], {%[[TILE_0_1]]}
-// CHECK-NEXT:  aie.objectfifo @[[OBJ1:.+]](%[[TILE_0_1]], {%[[TILE_0_0]]}
-// CHECK-NEXT:  aie.objectfifo.link
-// CHECK-SAME:  @[[OBJ0]]
-// CHECK-SAME:  @[[OBJ1]]
-// CHECK:       aiex.runtime_sequence @connections_and_link
+// CHECK:     module
+// CHECK:     aie.device
+// CHECK-DAG: aie.tile(0, 2)
+// CHECK-DAG: aie.tile(0, 1)
+// CHECK-DAG: aie.tile(0, 0)
+// CHECK:     aiex.runtime_sequence @tile
 #executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}>
 module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} {
-  func.func @connections_and_link() {
+  func.func @tile() {
     amdaie.workgroup {
       %c0 = arith.constant 0 : index
       %c1 = arith.constant 1 : index
@@ -92,21 +88,7 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb}
       %tile_0_0 = amdaie.tile(%c0, %c0)
       %tile_0_1 = amdaie.tile(%c0, %c1)
       %tile_0_2 = amdaie.tile(%c0, %c2)
-      %alloc_0 = memref.alloc() : memref<32x64xi32>
-      %alloc_1 = memref.alloc() : memref<32x32xi32, 1>
-      %alloc_2 = memref.alloc() : memref<4x8x4x8xi32, 2>
-      %obj0 = amdaie.logicalobjectfifo.from_memref %alloc_0, {%tile_0_0} : memref<32x64xi32> -> !amdaie.logicalobjectfifo<memref<2048xi32>>
-      %obj1 = amdaie.logicalobjectfifo.from_memref %alloc_1, {%tile_0_1} : memref<32x32xi32, 1> -> !amdaie.logicalobjectfifo<memref<1024xi32, 1>>
-      %obj2 = amdaie.logicalobjectfifo.from_memref %alloc_2, {%tile_0_2} : memref<4x8x4x8xi32, 2> -> !amdaie.logicalobjectfifo<memref<1024xi32, 2>>
-      %conn0 = amdaie.connection(%obj1, %obj2) : (!amdaie.logicalobjectfifo<memref<1024xi32, 1>>, !amdaie.logicalobjectfifo<memref<1024xi32, 2>>)
-      %conn1 = amdaie.connection(%obj0, %obj1) : (!amdaie.logicalobjectfifo<memref<2048xi32>>, !amdaie.logicalobjectfifo<memref<1024xi32, 1>>)
-      amdaie.logicalobjectfifo.link[%conn0] -> [%conn1] ()
-      memref.dealloc %alloc_2 : memref<4x8x4x8xi32, 2>
-      memref.dealloc %alloc_1 : memref<32x32xi32, 1>
-      memref.dealloc %alloc_0 : memref<32x64xi32>
       amdaie.controlcode {
-        %npu_dma_0 = amdaie.npu.circular_dma_cpy_nd %conn0([] [] [], [] [] [])
-        %npu_dma_1 = amdaie.npu.circular_dma_cpy_nd %conn1([] [] [], [] [] [])
         amdaie.end
       }
     }
@@ -116,49 +98,25 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb}
 
 // -----
 
-// NOTE: Due to an AIE check that verifies whether aie.objectfifo is linked correctly,
-// this test checks two `amdaie.connection` operations, so they can be linked
-// correctly.
-//
-// CHECK:       aie.device
-// CHECK-DAG:   %[[TILE_0_2:.+]] = aie.tile(0, 2)
-// CHECK-DAG:   %[[TILE_0_1:.+]] = aie.tile(0, 1)
-// CHECK-DAG:   %[[TILE_0_0:.+]] = aie.tile(0, 0)
-// CHECK:       aie.objectfifo @[[OBJ0:.+]](%[[TILE_0_2]] toStream [<size = 32, stride = 8>, <size = 4, stride = 256>, <size = 8, stride = 1>]
-// CHECK-NEXT:  aie.objectfifo @[[OBJ1:.+]](%[[TILE_0_1]], {%[[TILE_0_0]]}
-// CHECK-NEXT:  aie.objectfifo.link
-// CHECK-SAME:  @[[OBJ0]]
-// CHECK-SAME:  @[[OBJ1]]
-// CHECK:       aiex.runtime_sequence @circular_dma_cpy_sizes_and_strides
+// CHECK:     module
+// CHECK:     aie.device
+// CHECK-DAG: %[[TILE_0_2:.+]] = aie.tile(0, 2)
+// CHECK-DAG: %[[TILE_0_1:.+]] = aie.tile(0, 1)
+// CHECK-DAG: aie.buffer(%[[TILE_0_1]]) {sym_name = "buff_0"} : memref<4096xi32, 1 : i32>
+// CHECK-DAG: aie.buffer(%[[TILE_0_2]]) {sym_name = "buff_1"} : memref<4096xi32, 2 : i32>
+// CHECK:     aiex.runtime_sequence @buffer
 #executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}>
 module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} {
-  func.func @circular_dma_cpy_sizes_and_strides() {
+  func.func @buffer() {
     amdaie.workgroup {
       %c0 = arith.constant 0 : index
       %c1 = arith.constant 1 : index
       %c2 = arith.constant 2 : index
-      %c4 = arith.constant 4 : index
-      %c8 = arith.constant 8 : index
-      %c32 = arith.constant 32 : index
-      %c256 = arith.constant 256 : index
-      %tile_0_0 = amdaie.tile(%c0, %c0)
       %tile_0_1 = amdaie.tile(%c0, %c1)
       %tile_0_2 = amdaie.tile(%c0, %c2)
-      %alloc_0 = memref.alloc() : memref<32x64xi32>
-      %alloc_1 = memref.alloc() : memref<32x32xi32, 1>
-      %alloc_2 = memref.alloc() : memref<4x8x4x8xi32, 2>
-      %obj0 = amdaie.logicalobjectfifo.from_memref %alloc_0, {%tile_0_0} : memref<32x64xi32> -> !amdaie.logicalobjectfifo<memref<2048xi32>>
-      %obj1 = amdaie.logicalobjectfifo.from_memref %alloc_1, {%tile_0_1} : memref<32x32xi32, 1> -> !amdaie.logicalobjectfifo<memref<1024xi32, 1>>
-      %obj2 = amdaie.logicalobjectfifo.from_memref %alloc_2, {%tile_0_2} : memref<4x8x4x8xi32, 2> -> !amdaie.logicalobjectfifo<memref<1024xi32, 2>>
-      %conn0 = amdaie.connection(%obj1, %obj2) : (!amdaie.logicalobjectfifo<memref<1024xi32, 1>>, !amdaie.logicalobjectfifo<memref<1024xi32, 2>>)
-      %conn1 = amdaie.connection(%obj0, %obj1) : (!amdaie.logicalobjectfifo<memref<2048xi32>>, !amdaie.logicalobjectfifo<memref<1024xi32, 1>>)
-      amdaie.logicalobjectfifo.link[%conn0] -> [%conn1] ()
-      memref.dealloc %alloc_2 : memref<4x8x4x8xi32, 2>
-      memref.dealloc %alloc_1 : memref<32x32xi32, 1>
-      memref.dealloc %alloc_0 : memref<32x64xi32>
+      %buffer = amdaie.buffer(%tile_0_1) : memref<4096xi32, 1 : i32>
+      %buffer_1 = amdaie.buffer(%tile_0_2) : memref<4096xi32, 2 : i32>
       amdaie.controlcode {
-        %npu_dma_0 = amdaie.npu.circular_dma_cpy_nd %conn0([] [] [], [%c0, %c0, %c0] [%c32, %c4, %c8] [%c8, %c256, %c1])
-        %npu_dma_1 = amdaie.npu.circular_dma_cpy_nd %conn1([] [] [], [] [] [])
         amdaie.end
       }
     }
@@ -168,52 +126,25 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb}
 
 // -----
 
-// NOTE: Due to an AIE check that verifies whether AIE operations exist inside a
-// core, it's hard to create a very small minimal test.
-//
-// CHECK:       aie.device
-// CHECK-DAG:   %[[TILE_0_2:.+]] = aie.tile(0, 2)
-// CHECK-DAG:   %{{.+}} = aie.tile(0, 1)
-// CHECK-DAG:   %{{.+}} = aie.tile(0, 0)
-// CHECK:       aie.core(%[[TILE_0_2]])
-// CHECK:         %[[ACQUIRE:.+]] = aie.objectfifo.acquire
-// CHECK-SAME:    Produce
-// CHECK:         %[[ACCESS:.+]] = aie.objectfifo.subview.access %[[ACQUIRE]]
-// CHECK:         %[[REINTERPRET:.+]] = memref.reinterpret_cast %[[ACCESS]]
-// CHECK:         linalg.fill ins(%{{.+}} : i32) outs(%[[REINTERPRET]] : memref<32x32xi32, 1>)
-// CHECK:       aiex.runtime_sequence @tile_and_core_and_acquire
+// CHECK:     module
+// CHECK:     aie.device
+// CHECK-DAG: %[[TILE_0_2:.+]] = aie.tile(0, 2)
+// CHECK-DAG: %[[TILE_0_1:.+]] = aie.tile(0, 1)
+// CHECK-DAG: aie.lock(%[[TILE_0_1]], 4) {init = 8 : i8, sym_name = "lock_0"}
+// CHECK-DAG: aie.lock(%[[TILE_0_2]], 5) {init = 0 : i8, sym_name = "lock_1"}
+// CHECK:     aiex.runtime_sequence @lock
 #executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}>
 module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} {
-  func.func @tile_and_core_and_acquire() {
+  func.func @lock() {
     amdaie.workgroup {
-      %c0_i32 = arith.constant 0 : i32
       %c0 = arith.constant 0 : index
       %c1 = arith.constant 1 : index
       %c2 = arith.constant 2 : index
-      %tile_0_0 = amdaie.tile(%c0, %c0)
       %tile_0_1 = amdaie.tile(%c0, %c1)
       %tile_0_2 = amdaie.tile(%c0, %c2)
-      %alloc_0 = memref.alloc() : memref<32x64xi32>
-      %alloc_1 = memref.alloc() : memref<32x32xi32, 1>
-      %alloc_2 = memref.alloc() : memref<4x8x4x8xi32, 2>
-      %obj0 = amdaie.logicalobjectfifo.from_memref %alloc_0, {%tile_0_0} : memref<32x64xi32> -> !amdaie.logicalobjectfifo<memref<2048xi32>>
-      %obj1 = amdaie.logicalobjectfifo.from_memref %alloc_1, {%tile_0_1} : memref<32x32xi32, 1> -> !amdaie.logicalobjectfifo<memref<1024xi32, 1>>
-      %obj2 = amdaie.logicalobjectfifo.from_memref %alloc_2, {%tile_0_2} : memref<4x8x4x8xi32, 2> -> !amdaie.logicalobjectfifo<memref<1024xi32, 2>>
-      %conn0 = amdaie.connection(%obj1, %obj2) : (!amdaie.logicalobjectfifo<memref<1024xi32, 1>>, !amdaie.logicalobjectfifo<memref<1024xi32, 2>>)
-      %conn1 = amdaie.connection(%obj0, %obj1) : (!amdaie.logicalobjectfifo<memref<2048xi32>>, !amdaie.logicalobjectfifo<memref<1024xi32, 1>>)
-      %core_0_0 = amdaie.core(%tile_0_2, in : [], out : [%conn0]) {
-        %0 = amdaie.logicalobjectfifo.acquire(%conn0, Produce) {size = 1 : i32} -> !amdaie.logicalobjectfifo<memref<1024xi32, 1>>
-        %1 = amdaie.logicalobjectfifo.access(%0, Write) : !amdaie.logicalobjectfifo<memref<1024xi32, 1>> -> memref<1024xi32, 1>
-        %2 = memref.reinterpret_cast %1 to offset: [0], sizes: [32, 32], strides: [32, 1] : memref<1024xi32, 1> to memref<32x32xi32, 1>
-        linalg.fill ins(%c0_i32 : i32) outs(%2 : memref<32x32xi32, 1>)
-        amdaie.end
-      }
-      memref.dealloc %alloc_2 : memref<4x8x4x8xi32, 2>
-      memref.dealloc %alloc_1 : memref<32x32xi32, 1>
-      memref.dealloc %alloc_0 : memref<32x64xi32>
+      %lock = amdaie.lock(%tile_0_1(4), 8)
+      %lock_1 = amdaie.lock(%tile_0_2(5), 0)
       amdaie.controlcode {
-        %npu_dma_0 = amdaie.npu.circular_dma_cpy_nd %conn0([] [] [], [] [] [])
-        %npu_dma_1 = amdaie.npu.circular_dma_cpy_nd %conn1([] [] [], [] [] [])
         amdaie.end
       }
     }
@@ -223,75 +154,59 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb}
 
 // -----
 
-// CHECK:       aie.device
-// CHECK-DAG:   func.func private @ukernel_A(memref<i32, 2>, index) attributes {llvm.bareptr = true}
-// CHECK-DAG:   func.func private @ukernel_B(memref<i32, 2>, index, memref<f32, 2>, index) attributes {llvm.bareptr = true}
-// CHECK-DAG:   %[[TILE_0_2:.+]] = aie.tile(0, 2)
-// CHECK-DAG:   %[[C0:.*]] = arith.constant 0 : index
-// CHECK:       aie.core(%[[TILE_0_2]])
-// CHECK:         %[[ACQUIRE:.+]] = aie.objectfifo.acquire
-// CHECK-SAME:    Produce
-// CHECK:         %[[ACCESS:.+]] = aie.objectfifo.subview.access %[[ACQUIRE]]
-// CHECK:         %[[REINTERPRET:.+]] = memref.reinterpret_cast %[[ACCESS]]
-// CHECK:         %[[ACQUIRE0:.+]] = aie.objectfifo.acquire
-// CHECK-SAME:    Produce
-// CHECK:         %[[ACCESS0:.+]] = aie.objectfifo.subview.access %[[ACQUIRE0]]
-// CHECK:         %[[REINTERPRET0:.+]] = memref.reinterpret_cast %[[ACCESS0]]
-// CHECK:         linalg.fill ins(%{{.+}} : i32) outs(%[[REINTERPRET]] : memref<32x32xi32, 2>)
-// CHECK:         %[[BASE_BUFFER:.*]], %{{.+}}, %{{.+}}:2, %{{.+}}:2 = memref.extract_strided_metadata %[[REINTERPRET]] :
-// CHECK:         %[[BASE_BUFFER0:.*]], %{{.+}}, %{{.+}}:2, %{{.+}}:2 = memref.extract_strided_metadata %[[REINTERPRET0]] :
-// CHECK:         func.call @ukernel_A(%[[BASE_BUFFER]], %[[C0]]) : (memref<i32, 2>, index) -> ()
-// CHECK:         func.call @ukernel_B(%[[BASE_BUFFER]], %[[C0]], %[[BASE_BUFFER0]], %[[C0]]) : (memref<i32, 2>, index, memref<f32, 2>, index) -> ()
-// CHECK:         aie.end
-// CHECK:       } {link_with = "/path/to/ukernel.o"}
-// CHECK:       aiex.runtime_sequence @lower_to_aie_ukernel
+// CHECK:  aie.device
+// CHECK:    %[[TILE_0_2:.*]] = aie.tile(0, 2)
+// CHECK:    %[[TILE_0_1:.*]] = aie.tile(0, 1)
+// CHECK:    %[[BUFFER_0_1:.*]] = aie.buffer(%[[TILE_0_1]]) {sym_name = "buff_0"} : memref<4096xi32, 1 : i32>
+// CHECK:    %[[LOCK_0_1:.*]] = aie.lock(%[[TILE_0_1]], 0) {init = 1 : i8, sym_name = "lock_0"}
+// CHECK:    %[[LOCK_0_1_0:.*]] = aie.lock(%[[TILE_0_1]], 1) {init = 0 : i8, sym_name = "lock_1"}
+// CHECK:    %[[BUFFER_0_2:.*]] = aie.buffer(%[[TILE_0_2]]) {sym_name = "buff_1"} : memref<4096xi32, 2 : i32>
+// CHECK:    %[[LOCK_0_2:.*]] = aie.lock(%[[TILE_0_2]], 0) {init = 1 : i8, sym_name = "lock_2"}
+// CHECK:    %[[LOCK_0_2_1:.*]] = aie.lock(%[[TILE_0_2]], 1) {init = 0 : i8, sym_name = "lock_3"}
+// CHECK:    aie.flow(%[[TILE_0_1]], DMA : 0, %[[TILE_0_2]], DMA : 0)
+// CHECK:    %[[MEMTILE_DMA_0_1:.*]] = aie.memtile_dma(%[[TILE_0_1]]) {
+// CHECK:      %[[VAL_0:.*]] = aie.dma_start(MM2S, 0, ^bb1, ^bb2)
+// CHECK:    ^bb1:
+// CHECK:      aie.use_lock(%[[LOCK_0_1_0]], AcquireGreaterEqual, 1)
+// CHECK:      aie.dma_bd(%[[BUFFER_0_1]] : memref<4096xi32, 1 : i32>) {dimensions = #aie<bd_dim_layout_array[<size = 32, stride = 64>, <size = 32, stride = 1>]>, len = 1024 : i32, offset = 1024 : i32}
+// CHECK:      aie.use_lock(%[[LOCK_0_1]], Release, 1)
+// CHECK:      aie.next_bd ^bb1
+// CHECK:    ^bb2:
+// CHECK:      aie.end
+// CHECK:    }
+// CHECK:    %[[MEM_0_2:.*]] = aie.mem(%[[TILE_0_2]]) {
+// CHECK:      %[[VAL_1:.*]] = aie.dma_start(S2MM, 0, ^bb1, ^bb2)
+// CHECK:    ^bb1:
+// CHECK:      aie.use_lock(%[[LOCK_0_2]], AcquireGreaterEqual, 1)
+// CHECK:      aie.dma_bd(%[[BUFFER_0_2]] : memref<4096xi32, 2 : i32>) {len = 0 : i32}
+// CHECK:      aie.use_lock(%[[LOCK_0_2_1]], Release, 1)
+// CHECK:      aie.next_bd ^bb1
+// CHECK:    ^bb2:
+// CHECK:      aie.end
+// CHECK:    }
+// CHECK:    aiex.runtime_sequence @single_connection_single_buffer
 #executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}>
 module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} {
-  func.func private @ukernel_A(memref<i32, 2>, index) attributes {link_with = "/path/to/ukernel.o", llvm.bareptr = true}
-  func.func private @ukernel_B(memref<i32, 2>, index, memref<f32, 2>, index) attributes {link_with = "/path/to/ukernel.o", llvm.bareptr = true}
-  func.func @lower_to_aie_ukernel() {
+  func.func @single_connection_single_buffer() {
     amdaie.workgroup {
-      %c0_i32 = arith.constant 0 : i32
       %c0 = arith.constant 0 : index
       %c1 = arith.constant 1 : index
       %c2 = arith.constant 2 : index
-      %tile_0_0 = amdaie.tile(%c0, %c0)
       %tile_0_1 = amdaie.tile(%c0, %c1)
       %tile_0_2 = amdaie.tile(%c0, %c2)
-      %alloc_0 = memref.alloc() : memref<32x64xi32>
-      %alloc_1 = memref.alloc() : memref<32x32xi32, 2>
-      %alloc_2 = memref.alloc() : memref<4x8x4x8xi32, 1>
-      %alloc_3 = memref.alloc() : memref<32x32xf32, 2>
-      %alloc_4 = memref.alloc() : memref<4x8x4x8xf32, 1>
-      %obj0 = amdaie.logicalobjectfifo.from_memref %alloc_0, {%tile_0_0} : memref<32x64xi32> -> !amdaie.logicalobjectfifo<memref<2048xi32>>
-      %obj1 = amdaie.logicalobjectfifo.from_memref %alloc_1, {%tile_0_1} : memref<32x32xi32, 2> -> !amdaie.logicalobjectfifo<memref<1024xi32, 2>>
-      %obj2 = amdaie.logicalobjectfifo.from_memref %alloc_2, {%tile_0_2} : memref<4x8x4x8xi32, 1> -> !amdaie.logicalobjectfifo<memref<1024xi32, 1>>
-      %obj3 = amdaie.logicalobjectfifo.from_memref %alloc_3, {%tile_0_1} : memref<32x32xf32, 2> -> !amdaie.logicalobjectfifo<memref<1024xf32, 2>>
-      %obj4 = amdaie.logicalobjectfifo.from_memref %alloc_4, {%tile_0_2} : memref<4x8x4x8xf32, 1> -> !amdaie.logicalobjectfifo<memref<1024xf32, 1>>
-      %conn0 = amdaie.connection(%obj1, %obj2) : (!amdaie.logicalobjectfifo<memref<1024xi32, 2>>, !amdaie.logicalobjectfifo<memref<1024xi32, 1>>)
-      %conn1 = amdaie.connection(%obj3, %obj4) : (!amdaie.logicalobjectfifo<memref<1024xf32, 2>>, !amdaie.logicalobjectfifo<memref<1024xf32, 1>>)
-      %core_0_0 = amdaie.core(%tile_0_2, in : [], out : [%conn0, %conn1]) {
-        %0 = amdaie.logicalobjectfifo.acquire(%conn0, Produce) {size = 1 : i32} -> !amdaie.logicalobjectfifo<memref<1024xi32, 2>>
-        %1 = amdaie.logicalobjectfifo.access(%0, Write) : !amdaie.logicalobjectfifo<memref<1024xi32, 2>> -> memref<1024xi32, 2>
-        %reinterpret_0 = memref.reinterpret_cast %1 to offset: [0], sizes: [32, 32], strides: [32, 1] : memref<1024xi32, 2> to memref<32x32xi32, 2>
-        %2 = amdaie.logicalobjectfifo.acquire(%conn1, Produce) {size = 1 : i32} -> !amdaie.logicalobjectfifo<memref<1024xf32, 2>>
-        %3 = amdaie.logicalobjectfifo.access(%2, Write) : !amdaie.logicalobjectfifo<memref<1024xf32, 2>> -> memref<1024xf32, 2>
-        %reinterpret_1 = memref.reinterpret_cast %3 to offset: [0], sizes: [32, 32], strides: [32, 1] : memref<1024xf32, 2> to memref<32x32xf32, 2>
-        linalg.fill ins(%c0_i32 : i32) outs(%reinterpret_0 : memref<32x32xi32, 2>)
-        %base_buffer, %offset, %sizes:2, %strides:2 = memref.extract_strided_metadata %reinterpret_0 : memref<32x32xi32, 2> -> memref<i32, 2>, index, index, index, index, index
-        %base_buffer0, %offset0, %sizes0:2, %strides0:2 = memref.extract_strided_metadata %reinterpret_1 : memref<32x32xf32, 2> -> memref<f32, 2>, index, index, index, index, index
-        func.call @ukernel_A(%base_buffer, %c0) : (memref<i32, 2>, index) -> ()
-        func.call @ukernel_B(%base_buffer, %c0, %base_buffer0, %c0) : (memref<i32, 2>, index, memref<f32, 2>, index) -> ()
-        amdaie.end
-      } {link_with = "/path/to/ukernel.o"}
-      memref.dealloc %alloc_4 : memref<4x8x4x8xf32, 1>
-      memref.dealloc %alloc_3 : memref<32x32xf32, 2>
-      memref.dealloc %alloc_2 : memref<4x8x4x8xi32, 1>
-      memref.dealloc %alloc_1 : memref<32x32xi32, 2>
-      memref.dealloc %alloc_0 : memref<32x64xi32>
+      %buffer = amdaie.buffer(%tile_0_1) : memref<4096xi32, 1 : i32>
+      %lock = amdaie.lock(%tile_0_1(0), 1)
+      %lock_1 = amdaie.lock(%tile_0_1(1), 0)
+      %buffer_1 = amdaie.buffer(%tile_0_2) : memref<4096xi32, 2 : i32>
+      %lock_2 = amdaie.lock(%tile_0_2(0), 1)
+      %lock_3 = amdaie.lock(%tile_0_2(1), 0)
+      %0 = amdaie.logicalobjectfifo.from_buffers({%buffer}, {%lock}, {%lock_1}) : memref<4096xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<4096xi32, 1 : i32>, 1>
+      %1 = amdaie.logicalobjectfifo.from_buffers({%buffer_1}, {%lock_2}, {%lock_3}) : memref<4096xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<4096xi32, 2 : i32>, 1>
+      %channel = amdaie.channel(%tile_0_1, 0)
+      %channel_1 = amdaie.channel(%tile_0_2, 0)
+      %2 = amdaie.connection(%1 {%channel_1}, %0 {%channel}) : (!amdaie.logicalobjectfifo<memref<4096xi32, 2 : i32>, 1>, !amdaie.logicalobjectfifo<memref<4096xi32, 1 : i32>, 1>)
       amdaie.controlcode {
-        %npu_dma_0 = amdaie.npu.circular_dma_cpy_nd %conn0([] [] [], [] [] [])
-        %npu_dma_1 = amdaie.npu.circular_dma_cpy_nd %conn1([] [] [], [] [] [])
+        %3 = amdaie.npu.circular_dma_cpy_nd %2([] [] [], [0, 1024] [32, 32] [64, 1])
         amdaie.end
       }
     }
@@ -301,59 +216,322 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb}
 
 // -----
 
-
-// NOTE: Due to an AIE check that verifies whether AIE operations exist inside a
-// core, it's hard to create a very small minimal test.
-//
-// CHECK:       aie.device
-// CHECK-DAG:   %[[TILE_1_2:.+]] = aie.tile(1, 2)
-// CHECK-DAG:   %[[TILE_0_2:.+]] = aie.tile(0, 2)
-// CHECK-DAG:   %{{.+}} = aie.tile(0, 1)
-// CHECK-DAG:   %{{.+}} = aie.tile(0, 0)
-// CHECK:       aie.core(%[[TILE_0_2]])
-// CHECK:         %[[ACQUIRE_0:.+]] = aie.objectfifo.acquire
-// CHECK-SAME:    Consume
-// CHECK:         aie.objectfifo.subview.access
-// CHECK-SAME:    %[[ACQUIRE_0]]
-// CHECK:       aie.core(%[[TILE_1_2]])
-// CHECK:         %[[ACQUIRE_1:.+]] = aie.objectfifo.acquire
-// CHECK-SAME:    Consume
-// CHECK:         aie.objectfifo.subview.access
-// CHECK-SAME:    %[[ACQUIRE_1]]
-// CHECK:       aiex.runtime_sequence @tile_and_core_and_acquire_broadcast
+// CHECK:  aie.device(npu1_4col)
+// CHECK:    %[[TILE_0_2:.*]] = aie.tile(0, 2)
+// CHECK:    %[[TILE_0_1:.*]] = aie.tile(0, 1)
+// CHECK:    %[[C0:.*]] = arith.constant 0 : index
+// CHECK:    %[[C1:.*]] = arith.constant 1 : index
+// CHECK:    %[[C2:.*]] = arith.constant 2 : index
+// CHECK:    %[[BUFFER_0_1:.*]] = aie.buffer(%[[TILE_0_1]]) {sym_name = "buff_0"} : memref<4096xi32, 1 : i32>
+// CHECK:    %[[BUFFER_0_1_0:.*]] = aie.buffer(%[[TILE_0_1]]) {sym_name = "buff_1"} : memref<4096xi32, 1 : i32>
+// CHECK:    %[[LOCK_0_1:.*]] = aie.lock(%[[TILE_0_1]], 0) {init = 2 : i8, sym_name = "lock_0"}
+// CHECK:    %[[LOCK_0_1_1:.*]] = aie.lock(%[[TILE_0_1]], 1) {init = 0 : i8, sym_name = "lock_1"}
+// CHECK:    %[[BUFFER_0_2:.*]] = aie.buffer(%[[TILE_0_2]]) {sym_name = "buff_2"} : memref<4096xi32, 2 : i32>
+// CHECK:    %[[BUFFER_0_2_2:.*]] = aie.buffer(%[[TILE_0_2]]) {sym_name = "buff_3"} : memref<4096xi32, 2 : i32>
+// CHECK:    %[[LOCK_0_2:.*]] = aie.lock(%[[TILE_0_2]], 0) {init = 2 : i8, sym_name = "lock_2"}
+// CHECK:    %[[LOCK_0_2_3:.*]] = aie.lock(%[[TILE_0_2]], 1) {init = 0 : i8, sym_name = "lock_3"}
+// CHECK:    aie.flow(%[[TILE_0_1]], DMA : 0, %[[TILE_0_2]], DMA : 0)
+// CHECK:    %[[MEMTILE_DMA_0_1:.*]] = aie.memtile_dma(%[[TILE_0_1]]) {
+// CHECK:      %[[VAL_0:.*]] = aie.dma_start(MM2S, 0, ^bb1, ^bb3)
+// CHECK:    ^bb1:
+// CHECK:      aie.use_lock(%[[LOCK_0_1_1]], AcquireGreaterEqual, 1)
+// CHECK:      aie.dma_bd(%[[BUFFER_0_1]] : memref<4096xi32, 1 : i32>) {dimensions = #aie<bd_dim_layout_array[<size = 32, stride = 64>, <size = 32, stride = 1>]>, len = 1024 : i32, offset = 1024 : i32}
+// CHECK:      aie.use_lock(%[[LOCK_0_1]], Release, 1)
+// CHECK:      aie.next_bd ^bb2
+// CHECK:    ^bb2:
+// CHECK:      aie.use_lock(%[[LOCK_0_1_1]], AcquireGreaterEqual, 1)
+// CHECK:      aie.dma_bd(%[[BUFFER_0_1_0]] : memref<4096xi32, 1 : i32>) {dimensions = #aie<bd_dim_layout_array[<size = 32, stride = 64>, <size = 32, stride = 1>]>, len = 1024 : i32, offset = 1024 : i32}
+// CHECK:      aie.use_lock(%[[LOCK_0_1]], Release, 1)
+// CHECK:      aie.next_bd ^bb1
+// CHECK:    ^bb3:
+// CHECK:      aie.end
+// CHECK:    }
+// CHECK:    %[[MEM_0_2:.*]] = aie.mem(%[[TILE_0_2]]) {
+// CHECK:      %[[VAL_1:.*]] = aie.dma_start(S2MM, 0, ^bb1, ^bb3)
+// CHECK:    ^bb1:
+// CHECK:      aie.use_lock(%[[LOCK_0_2]], AcquireGreaterEqual, 1)
+// CHECK:      aie.dma_bd(%[[BUFFER_0_2]] : memref<4096xi32, 2 : i32>) {dimensions = #aie<bd_dim_layout_array[<size = 32, stride = 128>, <size = 32, stride = 1>]>, len = 1024 : i32}
+// CHECK:      aie.use_lock(%[[LOCK_0_2_3]], Release, 1)
+// CHECK:      aie.next_bd ^bb2
+// CHECK:    ^bb2:
+// CHECK:      aie.use_lock(%[[LOCK_0_2]], AcquireGreaterEqual, 1)
+// CHECK:      aie.dma_bd(%[[BUFFER_0_2_2]] : memref<4096xi32, 2 : i32>) {dimensions = #aie<bd_dim_layout_array[<size = 32, stride = 128>, <size = 32, stride = 1>]>, len = 1024 : i32}
+// CHECK:      aie.use_lock(%[[LOCK_0_2_3]], Release, 1)
+// CHECK:      aie.next_bd ^bb1
+// CHECK:    ^bb3:
+// CHECK:      aie.end
+// CHECK:    }
+// CHECK:    aiex.runtime_sequence @single_connection_multi_buffer
 #executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}>
 module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} {
-  func.func @tile_and_core_and_acquire_broadcast() {
+  func.func @single_connection_multi_buffer() {
     amdaie.workgroup {
       %c0 = arith.constant 0 : index
       %c1 = arith.constant 1 : index
       %c2 = arith.constant 2 : index
-      %tile_0_0 = amdaie.tile(%c0, %c0)
       %tile_0_1 = amdaie.tile(%c0, %c1)
       %tile_0_2 = amdaie.tile(%c0, %c2)
-      %tile_1_2 = amdaie.tile(%c1, %c2)
-      %alloc_0 = memref.alloc() : memref<32x64xi32>
-      %alloc_1 = memref.alloc() : memref<32x32xi32, 1>
-      %alloc_2 = memref.alloc() : memref<4x8x4x8xi32, 2>
-      %obj0 = amdaie.logicalobjectfifo.from_memref %alloc_0, {%tile_0_0} : memref<32x64xi32> -> !amdaie.logicalobjectfifo<memref<2048xi32>>
-      %obj1 = amdaie.logicalobjectfifo.from_memref %alloc_1, {%tile_0_1} : memref<32x32xi32, 1> -> !amdaie.logicalobjectfifo<memref<1024xi32, 1>>
-      %obj2 = amdaie.logicalobjectfifo.from_memref %alloc_2, {%tile_0_2, %tile_1_2} : memref<4x8x4x8xi32, 2> -> !amdaie.logicalobjectfifo<memref<1024xi32, 2>>
-      %conn0 = amdaie.connection(%obj1, %obj0) : (!amdaie.logicalobjectfifo<memref<1024xi32, 1>>, !amdaie.logicalobjectfifo<memref<2048xi32>>)
-      %conn1 = amdaie.connection(%obj2, %obj1) : (!amdaie.logicalobjectfifo<memref<1024xi32, 2>>, !amdaie.logicalobjectfifo<memref<1024xi32, 1>>)
-      %core_0_2 = amdaie.core(%tile_0_2, in : [%conn1], out : []) {
-        %0 = amdaie.logicalobjectfifo.acquire(%conn1, Consume) {size = 1 : i32} -> !amdaie.logicalobjectfifo<memref<1024xi32, 1>>
+      %buffer = amdaie.buffer(%tile_0_1) : memref<4096xi32, 1 : i32>
+      %buffer_1 = amdaie.buffer(%tile_0_1) : memref<4096xi32, 1 : i32>
+      %lock = amdaie.lock(%tile_0_1(0), 2)
+      %lock_1 = amdaie.lock(%tile_0_1(1), 0)
+      %buffer_2 = amdaie.buffer(%tile_0_2) : memref<4096xi32, 2 : i32>
+      %buffer_3 = amdaie.buffer(%tile_0_2) : memref<4096xi32, 2 : i32>
+      %lock_2 = amdaie.lock(%tile_0_2(0), 2)
+      %lock_3 = amdaie.lock(%tile_0_2(1), 0)
+      %0 = amdaie.logicalobjectfifo.from_buffers({%buffer, %buffer_1}, {%lock}, {%lock_1}) : memref<4096xi32, 1 : i32>, memref<4096xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<4096xi32, 1 : i32>, 2>
+      %1 = amdaie.logicalobjectfifo.from_buffers({%buffer_2, %buffer_3}, {%lock_2}, {%lock_3}) : memref<4096xi32, 2 : i32>, memref<4096xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<4096xi32, 2 : i32>, 2>
+      %channel = amdaie.channel(%tile_0_1, 0)
+      %channel_1 = amdaie.channel(%tile_0_2, 0)
+      %2 = amdaie.connection(%1 {%channel_1}, %0 {%channel}) : (!amdaie.logicalobjectfifo<memref<4096xi32, 2 : i32>, 2>, !amdaie.logicalobjectfifo<memref<4096xi32, 1 : i32>, 2>)
+      amdaie.controlcode {
+        %3 = amdaie.npu.circular_dma_cpy_nd %2([0, 0] [32, 32] [128, 1], [0, 1024] [32, 32] [64, 1])
         amdaie.end
       }
-      %core_1_2 = amdaie.core(%tile_1_2, in : [%conn1], out : []) {
-        %0 = amdaie.logicalobjectfifo.acquire(%conn1, Consume) {size = 1 : i32} -> !amdaie.logicalobjectfifo<memref<1024xi32, 1>>
+    }
+    return
+  }
+}
+
+// -----
+
+// CHECK:  aie.device(npu1_4col)
+// CHECK:    %[[TILE_0_2:.*]] = aie.tile(0, 2)
+// CHECK:    %[[TILE_0_1:.*]] = aie.tile(0, 1)
+// CHECK:    %[[BUFFER_0_1:.*]] = aie.buffer(%[[TILE_0_1]]) {sym_name = "buff_0"} : memref<4096xi32, 1 : i32>
+// CHECK:    %[[LOCK_0_1:.*]] = aie.lock(%[[TILE_0_1]], 0) {init = 1 : i8, sym_name = "lock_0"}
+// CHECK:    %[[LOCK_0_1_0:.*]] = aie.lock(%[[TILE_0_1]], 1) {init = 0 : i8, sym_name = "lock_1"}
+// CHECK:    %[[BUFFER_0_2:.*]] = aie.buffer(%[[TILE_0_2]]) {sym_name = "buff_1"} : memref<4096xi32, 2 : i32>
+// CHECK:    %[[LOCK_0_2:.*]] = aie.lock(%[[TILE_0_2]], 0) {init = 1 : i8, sym_name = "lock_2"}
+// CHECK:    %[[LOCK_0_2_1:.*]] = aie.lock(%[[TILE_0_2]], 1) {init = 0 : i8, sym_name = "lock_3"}
+// CHECK:    %[[BUFFER_0_1_2:.*]] = aie.buffer(%[[TILE_0_1]]) {sym_name = "buff_2"} : memref<2048xi32, 1 : i32>
+// CHECK:    %[[LOCK_0_1_3:.*]] = aie.lock(%[[TILE_0_1]], 0) {init = 1 : i8, sym_name = "lock_4"}
+// CHECK:    %[[LOCK_0_1_4:.*]] = aie.lock(%[[TILE_0_1]], 1) {init = 0 : i8, sym_name = "lock_5"}
+// CHECK:    %[[BUFFER_0_2_5:.*]] = aie.buffer(%[[TILE_0_2]]) {sym_name = "buff_3"} : memref<2048xi32, 2 : i32>
+// CHECK:    %[[LOCK_0_2_6:.*]] = aie.lock(%[[TILE_0_2]], 0) {init = 1 : i8, sym_name = "lock_6"}
+// CHECK:    %[[LOCK_0_2_7:.*]] = aie.lock(%[[TILE_0_2]], 1) {init = 0 : i8, sym_name = "lock_7"}
+// CHECK:    aie.flow(%[[TILE_0_1]], DMA : 0, %[[TILE_0_2]], DMA : 0)
+// CHECK:    aie.flow(%[[TILE_0_1]], DMA : 1, %[[TILE_0_2]], DMA : 1)
+// CHECK:    %[[MEMTILE_DMA_0_1:.*]] = aie.memtile_dma(%[[TILE_0_1]]) {
+// CHECK:      %[[VAL_0:.*]] = aie.dma_start(MM2S, 0, ^bb1, ^bb2)
+// CHECK:    ^bb1:
+// CHECK:      aie.use_lock(%[[LOCK_0_1_0]], AcquireGreaterEqual, 1)
+// CHECK:      aie.dma_bd(%[[BUFFER_0_1]] : memref<4096xi32, 1 : i32>) {dimensions = #aie<bd_dim_layout_array[<size = 32, stride = 64>, <size = 32, stride = 1>]>, len = 1024 : i32}
+// CHECK:      aie.use_lock(%[[LOCK_0_1]], Release, 1)
+// CHECK:      aie.next_bd ^bb1
+// CHECK:    ^bb2:
+// CHECK:      %[[VAL_1:.*]] = aie.dma_start(MM2S, 1, ^bb3, ^bb4)
+// CHECK:    ^bb3:
+// CHECK:      aie.use_lock(%[[LOCK_0_1_4]], AcquireGreaterEqual, 1)
+// CHECK:      aie.dma_bd(%[[BUFFER_0_1_2]] : memref<2048xi32, 1 : i32>) {dimensions = #aie<bd_dim_layout_array[<size = 32, stride = 64>, <size = 32, stride = 1>]>, len = 1024 : i32, offset = 1024 : i32}
+// CHECK:      aie.use_lock(%[[LOCK_0_1_3]], Release, 1)
+// CHECK:      aie.next_bd ^bb3
+// CHECK:    ^bb4:
+// CHECK:      aie.end
+// CHECK:    }
+// CHECK:    %[[MEM_0_2:.*]] = aie.mem(%[[TILE_0_2]]) {
+// CHECK:      %[[VAL_2:.*]] = aie.dma_start(S2MM, 0, ^bb1, ^bb2)
+// CHECK:    ^bb1:
+// CHECK:      aie.use_lock(%[[LOCK_0_2]], AcquireGreaterEqual, 1)
+// CHECK:      aie.dma_bd(%[[BUFFER_0_2]] : memref<4096xi32, 2 : i32>) {len = 0 : i32}
+// CHECK:      aie.use_lock(%[[LOCK_0_2_1]], Release, 1)
+// CHECK:      aie.next_bd ^bb1
+// CHECK:    ^bb2:
+// CHECK:      %[[VAL_3:.*]] = aie.dma_start(S2MM, 1, ^bb3, ^bb4)
+// CHECK:    ^bb3:
+// CHECK:      aie.use_lock(%[[LOCK_0_2_6]], AcquireGreaterEqual, 1)
+// CHECK:      aie.dma_bd(%[[BUFFER_0_2_5]] : memref<2048xi32, 2 : i32>) {len = 0 : i32}
+// CHECK:      aie.use_lock(%[[LOCK_0_2_7]], Release, 1)
+// CHECK:      aie.next_bd ^bb3
+// CHECK:    ^bb4:
+// CHECK:      aie.end
+// CHECK:    }
+// CHECK:    aiex.runtime_sequence @multi_connection_single_buffer
+#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}>
+module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} {
+  func.func @multi_connection_single_buffer() {
+    amdaie.workgroup {
+      %c0 = arith.constant 0 : index
+      %c1 = arith.constant 1 : index
+      %c2 = arith.constant 2 : index
+      %tile_0_1 = amdaie.tile(%c0, %c1)
+      %tile_0_2 = amdaie.tile(%c0, %c2)
+      %buffer = amdaie.buffer(%tile_0_1) : memref<4096xi32, 1 : i32>
+      %lock = amdaie.lock(%tile_0_1(0), 1)
+      %lock_1 = amdaie.lock(%tile_0_1(1), 0)
+      %buffer_1 = amdaie.buffer(%tile_0_2) : memref<4096xi32, 2 : i32>
+      %lock_2 = amdaie.lock(%tile_0_2(0), 1)
+      %lock_3 = amdaie.lock(%tile_0_2(1), 0)
+      %buffer_2 = amdaie.buffer(%tile_0_1) : memref<2048xi32, 1 : i32>
+      %lock_4 = amdaie.lock(%tile_0_1(0), 1)
+      %lock_5 = amdaie.lock(%tile_0_1(1), 0)
+      %buffer_3 = amdaie.buffer(%tile_0_2) : memref<2048xi32, 2 : i32>
+      %lock_6 = amdaie.lock(%tile_0_2(0), 1)
+      %lock_7 = amdaie.lock(%tile_0_2(1), 0) 
+      %0 = amdaie.logicalobjectfifo.from_buffers({%buffer}, {%lock}, {%lock_1}) : memref<4096xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<4096xi32, 1 : i32>, 1>
+      %1 = amdaie.logicalobjectfifo.from_buffers({%buffer_1}, {%lock_2}, {%lock_3}) : memref<4096xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<4096xi32, 2 : i32>, 1>
+      %channel = amdaie.channel(%tile_0_1, 0)
+      %channel_1 = amdaie.channel(%tile_0_2, 0)
+      %2 = amdaie.connection(%1 {%channel_1}, %0 {%channel}) : (!amdaie.logicalobjectfifo<memref<4096xi32, 2 : i32>, 1>, !amdaie.logicalobjectfifo<memref<4096xi32, 1 : i32>, 1>)
+      %3 = amdaie.logicalobjectfifo.from_buffers({%buffer_2}, {%lock_4}, {%lock_5}) : memref<2048xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2048xi32, 1 : i32>, 1>
+      %4 = amdaie.logicalobjectfifo.from_buffers({%buffer_3}, {%lock_6}, {%lock_7}) : memref<2048xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<2048xi32, 2 : i32>, 1>
+      %channel_2 = amdaie.channel(%tile_0_1, 1)
+      %channel_3 = amdaie.channel(%tile_0_2, 1)
+      %5 = amdaie.connection(%4 {%channel_3}, %3 {%channel_2}) : (!amdaie.logicalobjectfifo<memref<2048xi32, 2 : i32>, 1>, !amdaie.logicalobjectfifo<memref<2048xi32, 1 : i32>, 1>)
+      amdaie.controlcode {
+        %6 = amdaie.npu.circular_dma_cpy_nd %2([] [] [], [0, 0] [32, 32] [64, 1])
+        %7 = amdaie.npu.circular_dma_cpy_nd %5([] [] [], [0, 1024] [32, 32] [64, 1])
         amdaie.end
       }
-      memref.dealloc %alloc_2 : memref<4x8x4x8xi32, 2>
-      memref.dealloc %alloc_1 : memref<32x32xi32, 1>
-      memref.dealloc %alloc_0 : memref<32x64xi32>
+    }
+    return
+  }
+}
+
+// -----
+
+// CHECK:  aie.device(npu1_4col)
+// CHECK:    %[[TILE_0_2:.*]] = aie.tile(0, 2)
+// CHECK:    %[[TILE_0_1:.*]] = aie.tile(0, 1)
+// CHECK:    %[[BUFFER_0_1:.*]] = aie.buffer(%[[TILE_0_1]]) {sym_name = "buff_0"} : memref<4096xi32, 1 : i32>
+// CHECK:    %[[BUFFER_0_1_0:.*]] = aie.buffer(%[[TILE_0_1]]) {sym_name = "buff_1"} : memref<4096xi32, 1 : i32>
+// CHECK:    %[[LOCK_0_1:.*]] = aie.lock(%[[TILE_0_1]], 0) {init = 2 : i8, sym_name = "lock_0"}
+// CHECK:    %[[LOCK_0_1_1:.*]] = aie.lock(%[[TILE_0_1]], 1) {init = 0 : i8, sym_name = "lock_1"}
+// CHECK:    %[[BUFFER_0_2:.*]] = aie.buffer(%[[TILE_0_2]]) {sym_name = "buff_2"} : memref<4096xi32, 2 : i32>
+// CHECK:    %[[BUFFER_0_2_2:.*]] = aie.buffer(%[[TILE_0_2]]) {sym_name = "buff_3"} : memref<4096xi32, 2 : i32>
+// CHECK:    %[[LOCK_0_2:.*]] = aie.lock(%[[TILE_0_2]], 0) {init = 2 : i8, sym_name = "lock_2"}
+// CHECK:    %[[LOCK_0_2_3:.*]] = aie.lock(%[[TILE_0_2]], 1) {init = 0 : i8, sym_name = "lock_3"}
+// CHECK:    %[[BUFFER_0_1_4:.*]] = aie.buffer(%[[TILE_0_1]]) {sym_name = "buff_4"} : memref<2048xi32, 1 : i32>
+// CHECK:    %[[BUFFER_0_1_5:.*]] = aie.buffer(%[[TILE_0_1]]) {sym_name = "buff_5"} : memref<2048xi32, 1 : i32>
+// CHECK:    %[[BUFFER_0_1_6:.*]] = aie.buffer(%[[TILE_0_1]]) {sym_name = "buff_6"} : memref<2048xi32, 1 : i32>
+// CHECK:    %[[BUFFER_0_1_7:.*]] = aie.buffer(%[[TILE_0_1]]) {sym_name = "buff_7"} : memref<2048xi32, 1 : i32>
+// CHECK:    %[[LOCK_0_1_8:.*]] = aie.lock(%[[TILE_0_1]], 0) {init = 4 : i8, sym_name = "lock_4"}
+// CHECK:    %[[LOCK_0_1_9:.*]] = aie.lock(%[[TILE_0_1]], 1) {init = 0 : i8, sym_name = "lock_5"}
+// CHECK:    %[[BUFFER_0_2_10:.*]] = aie.buffer(%[[TILE_0_2]]) {sym_name = "buff_8"} : memref<2048xi32, 2 : i32>
+// CHECK:    %[[BUFFER_0_2_11:.*]] = aie.buffer(%[[TILE_0_2]]) {sym_name = "buff_9"} : memref<2048xi32, 2 : i32>
+// CHECK:    %[[BUFFER_0_2_12:.*]] = aie.buffer(%[[TILE_0_2]]) {sym_name = "buff_10"} : memref<2048xi32, 2 : i32>
+// CHECK:    %[[BUFFER_0_2_13:.*]] = aie.buffer(%[[TILE_0_2]]) {sym_name = "buff_11"} : memref<2048xi32, 2 : i32>
+// CHECK:    %[[LOCK_0_2_14:.*]] = aie.lock(%[[TILE_0_2]], 0) {init = 4 : i8, sym_name = "lock_6"}
+// CHECK:    %[[LOCK_0_2_15:.*]] = aie.lock(%[[TILE_0_2]], 1) {init = 0 : i8, sym_name = "lock_7"}
+// CHECK:    aie.flow(%[[TILE_0_1]], DMA : 0, %[[TILE_0_2]], DMA : 0)
+// CHECK:    aie.flow(%[[TILE_0_1]], DMA : 1, %[[TILE_0_2]], DMA : 1)
+// CHECK:    %[[MEMTILE_DMA_0_1:.*]] = aie.memtile_dma(%[[TILE_0_1]]) {
+// CHECK:      %[[VAL_0:.*]] = aie.dma_start(MM2S, 0, ^bb1, ^bb3)
+// CHECK:    ^bb1:
+// CHECK:      aie.use_lock(%[[LOCK_0_1_1]], AcquireGreaterEqual, 1)
+// CHECK:      aie.dma_bd(%[[BUFFER_0_1]] : memref<4096xi32, 1 : i32>) {dimensions = #aie<bd_dim_layout_array[<size = 32, stride = 64>, <size = 32, stride = 1>]>, len = 1024 : i32}
+// CHECK:      aie.use_lock(%[[LOCK_0_1]], Release, 1)
+// CHECK:      aie.next_bd ^bb2
+// CHECK:    ^bb2:
+// CHECK:      aie.use_lock(%[[LOCK_0_1_1]], AcquireGreaterEqual, 1)
+// CHECK:      aie.dma_bd(%[[BUFFER_0_1_0]] : memref<4096xi32, 1 : i32>) {dimensions = #aie<bd_dim_layout_array[<size = 32, stride = 64>, <size = 32, stride = 1>]>, len = 1024 : i32}
+// CHECK:      aie.use_lock(%[[LOCK_0_1]], Release, 1)
+// CHECK:      aie.next_bd ^bb1
+// CHECK:    ^bb3:
+// CHECK:      %[[VAL_1:.*]] = aie.dma_start(MM2S, 1, ^bb4, ^bb8)
+// CHECK:    ^bb4:
+// CHECK:      aie.use_lock(%[[LOCK_0_1_9]], AcquireGreaterEqual, 1)
+// CHECK:      aie.dma_bd(%[[BUFFER_0_1_4]] : memref<2048xi32, 1 : i32>) {dimensions = #aie<bd_dim_layout_array[<size = 32, stride = 64>, <size = 32, stride = 1>]>, len = 1024 : i32, offset = 1024 : i32}
+// CHECK:      aie.use_lock(%[[LOCK_0_1_8]], Release, 1)
+// CHECK:      aie.next_bd ^bb5
+// CHECK:    ^bb5:
+// CHECK:      aie.use_lock(%[[LOCK_0_1_9]], AcquireGreaterEqual, 1)
+// CHECK:      aie.dma_bd(%[[BUFFER_0_1_5]] : memref<2048xi32, 1 : i32>) {dimensions = #aie<bd_dim_layout_array[<size = 32, stride = 64>, <size = 32, stride = 1>]>, len = 1024 : i32, offset = 1024 : i32}
+// CHECK:      aie.use_lock(%[[LOCK_0_1_8]], Release, 1)
+// CHECK:      aie.next_bd ^bb6
+// CHECK:    ^bb6:
+// CHECK:      aie.use_lock(%[[LOCK_0_1_9]], AcquireGreaterEqual, 1)
+// CHECK:      aie.dma_bd(%[[BUFFER_0_1_6]] : memref<2048xi32, 1 : i32>) {dimensions = #aie<bd_dim_layout_array[<size = 32, stride = 64>, <size = 32, stride = 1>]>, len = 1024 : i32, offset = 1024 : i32}
+// CHECK:      aie.use_lock(%[[LOCK_0_1_8]], Release, 1)
+// CHECK:      aie.next_bd ^bb7
+// CHECK:    ^bb7:
+// CHECK:      aie.use_lock(%[[LOCK_0_1_9]], AcquireGreaterEqual, 1)
+// CHECK:      aie.dma_bd(%[[BUFFER_0_1_7]] : memref<2048xi32, 1 : i32>) {dimensions = #aie<bd_dim_layout_array[<size = 32, stride = 64>, <size = 32, stride = 1>]>, len = 1024 : i32, offset = 1024 : i32}
+// CHECK:      aie.use_lock(%[[LOCK_0_1_8]], Release, 1)
+// CHECK:      aie.next_bd ^bb4
+// CHECK:    ^bb8:
+// CHECK:      aie.end
+// CHECK:    }
+// CHECK:    %[[MEM_0_2:.*]] = aie.mem(%[[TILE_0_2]]) {
+// CHECK:      %[[VAL_2:.*]] = aie.dma_start(S2MM, 0, ^bb1, ^bb3)
+// CHECK:    ^bb1:
+// CHECK:      aie.use_lock(%[[LOCK_0_2]], AcquireGreaterEqual, 1)
+// CHECK:      aie.dma_bd(%[[BUFFER_0_2]] : memref<4096xi32, 2 : i32>) {len = 0 : i32}
+// CHECK:      aie.use_lock(%[[LOCK_0_2_3]], Release, 1)
+// CHECK:      aie.next_bd ^bb2
+// CHECK:    ^bb2:
+// CHECK:      aie.use_lock(%[[LOCK_0_2]], AcquireGreaterEqual, 1)
+// CHECK:      aie.dma_bd(%[[BUFFER_0_2_2]] : memref<4096xi32, 2 : i32>) {len = 0 : i32}
+// CHECK:      aie.use_lock(%[[LOCK_0_2_3]], Release, 1)
+// CHECK:      aie.next_bd ^bb1
+// CHECK:    ^bb3:
+// CHECK:      %[[VAL_3:.*]] = aie.dma_start(S2MM, 1, ^bb4, ^bb8)
+// CHECK:    ^bb4:
+// CHECK:      aie.use_lock(%[[LOCK_0_2_14]], AcquireGreaterEqual, 1)
+// CHECK:      aie.dma_bd(%[[BUFFER_0_2_10]] : memref<2048xi32, 2 : i32>) {len = 0 : i32}
+// CHECK:      aie.use_lock(%[[LOCK_0_2_15]], Release, 1)
+// CHECK:      aie.next_bd ^bb5
+// CHECK:    ^bb5:
+// CHECK:      aie.use_lock(%[[LOCK_0_2_14]], AcquireGreaterEqual, 1)
+// CHECK:      aie.dma_bd(%[[BUFFER_0_2_11]] : memref<2048xi32, 2 : i32>) {len = 0 : i32}
+// CHECK:      aie.use_lock(%[[LOCK_0_2_15]], Release, 1)
+// CHECK:      aie.next_bd ^bb6
+// CHECK:    ^bb6:
+// CHECK:      aie.use_lock(%[[LOCK_0_2_14]], AcquireGreaterEqual, 1)
+// CHECK:      aie.dma_bd(%[[BUFFER_0_2_12]] : memref<2048xi32, 2 : i32>) {len = 0 : i32}
+// CHECK:      aie.use_lock(%[[LOCK_0_2_15]], Release, 1)
+// CHECK:      aie.next_bd ^bb7
+// CHECK:    ^bb7:
+// CHECK:      aie.use_lock(%[[LOCK_0_2_14]], AcquireGreaterEqual, 1)
+// CHECK:      aie.dma_bd(%[[BUFFER_0_2_13]] : memref<2048xi32, 2 : i32>) {len = 0 : i32}
+// CHECK:      aie.use_lock(%[[LOCK_0_2_15]], Release, 1)
+// CHECK:      aie.next_bd ^bb4
+// CHECK:    ^bb8:
+// CHECK:      aie.end
+// CHECK:    }
+// CHECK:    aiex.runtime_sequence @multi_connection_multi_buffer
+#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}>
+module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} {
+  func.func @multi_connection_multi_buffer() {
+    amdaie.workgroup {
+      %c0 = arith.constant 0 : index
+      %c1 = arith.constant 1 : index
+      %c2 = arith.constant 2 : index
+      %tile_0_1 = amdaie.tile(%c0, %c1)
+      %tile_0_2 = amdaie.tile(%c0, %c2)
+      %buffer = amdaie.buffer(%tile_0_1) : memref<4096xi32, 1 : i32>
+      %buffer_1 = amdaie.buffer(%tile_0_1) : memref<4096xi32, 1 : i32>
+      %lock = amdaie.lock(%tile_0_1(0), 2)
+      %lock_1 = amdaie.lock(%tile_0_1(1), 0)
+      %buffer_2 = amdaie.buffer(%tile_0_2) : memref<4096xi32, 2 : i32>
+      %buffer_3 = amdaie.buffer(%tile_0_2) : memref<4096xi32, 2 : i32>
+      %lock_2 = amdaie.lock(%tile_0_2(0), 2)
+      %lock_3 = amdaie.lock(%tile_0_2(1), 0)
+      %buffer_4 = amdaie.buffer(%tile_0_1) : memref<2048xi32, 1 : i32>
+      %buffer_5 = amdaie.buffer(%tile_0_1) : memref<2048xi32, 1 : i32>
+      %buffer_6 = amdaie.buffer(%tile_0_1) : memref<2048xi32, 1 : i32>
+      %buffer_7 = amdaie.buffer(%tile_0_1) : memref<2048xi32, 1 : i32>
+      %lock_4 = amdaie.lock(%tile_0_1(0), 4)
+      %lock_5 = amdaie.lock(%tile_0_1(1), 0)
+      %buffer_8 = amdaie.buffer(%tile_0_2) : memref<2048xi32, 2 : i32>
+      %buffer_9 = amdaie.buffer(%tile_0_2) : memref<2048xi32, 2 : i32>
+      %buffer_10 = amdaie.buffer(%tile_0_2) : memref<2048xi32, 2 : i32>
+      %buffer_11 = amdaie.buffer(%tile_0_2) : memref<2048xi32, 2 : i32>
+      %lock_6 = amdaie.lock(%tile_0_2(0), 4)
+      %lock_7 = amdaie.lock(%tile_0_2(1), 0)
+      %0 = amdaie.logicalobjectfifo.from_buffers({%buffer, %buffer_1}, {%lock}, {%lock_1}) : memref<4096xi32, 1 : i32>, memref<4096xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<4096xi32, 1 : i32>, 2>
+      %1 = amdaie.logicalobjectfifo.from_buffers({%buffer_2, %buffer_3}, {%lock_2}, {%lock_3}) : memref<4096xi32, 2 : i32>, memref<4096xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<4096xi32, 2 : i32>, 2>
+      %channel = amdaie.channel(%tile_0_1, 0)
+      %channel_1 = amdaie.channel(%tile_0_2, 0)
+      %2 = amdaie.connection(%1 {%channel_1}, %0 {%channel}) : (!amdaie.logicalobjectfifo<memref<4096xi32, 2 : i32>, 2>, !amdaie.logicalobjectfifo<memref<4096xi32, 1 : i32>, 2>)
+      %3 = amdaie.logicalobjectfifo.from_buffers({%buffer_4, %buffer_5, %buffer_6, %buffer_7}, {%lock_4}, {%lock_5}) : memref<2048xi32, 1 : i32>, memref<2048xi32, 1 : i32>, memref<2048xi32, 1 : i32>, memref<2048xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2048xi32, 1 : i32>, 4>
+      %4 = amdaie.logicalobjectfifo.from_buffers({%buffer_8, %buffer_9, %buffer_10, %buffer_11}, {%lock_6}, {%lock_7}) : memref<2048xi32, 2 : i32>, memref<2048xi32, 2 : i32>, memref<2048xi32, 2 : i32>, memref<2048xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<2048xi32, 2 : i32>, 4>
+      %channel_2 = amdaie.channel(%tile_0_1, 1)
+      %channel_3 = amdaie.channel(%tile_0_2, 1)
+      %5 = amdaie.connection(%4 {%channel_3}, %3 {%channel_2}) : (!amdaie.logicalobjectfifo<memref<2048xi32, 2 : i32>, 4>, !amdaie.logicalobjectfifo<memref<2048xi32, 1 : i32>, 4>)
       amdaie.controlcode {
-        %npu_dma_0 = amdaie.npu.circular_dma_cpy_nd %conn0([] [] [], [] [] [])
-        %npu_dma_1 = amdaie.npu.circular_dma_cpy_nd %conn1([] [] [], [] [] [])
+        %6 = amdaie.npu.circular_dma_cpy_nd %2([] [] [], [0, 0] [32, 32] [64, 1])
+        %7 = amdaie.npu.circular_dma_cpy_nd %5([] [] [], [0, 1024] [32, 32] [64, 1])
         amdaie.end
       }
     }
@@ -363,19 +541,51 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb}
 
 // -----
 
-// NOTE: Due to an AIE check that verifies whether AIE operations exist inside a
-// core, it's hard to create a very small minimal test.
-//
-// CHECK:       aie.device
-// CHECK-DAG:   %[[TILE_0_2:.+]] = aie.tile(0, 2)
-// CHECK-DAG:   %{{.+}} = aie.tile(0, 1)
-// CHECK-DAG:   %{{.+}} = aie.tile(0, 0)
-// CHECK:       aie.core(%[[TILE_0_2]])
-// CHECK:         aie.objectfifo.release
-// CHECK:       aiex.runtime_sequence @tile_and_core_and_release
+// CHECK:   aie.device(npu1_4col) {
+// CHECK:     memref.global "public" @shim_0 : memref<4096xi32>
+// CHECK:     %[[TILE_0_2:.*]] = aie.tile(0, 2)
+// CHECK:     %[[TILE_0_1:.*]] = aie.tile(0, 1)
+// CHECK:     %[[TILE_0_0:.*]] = aie.tile(0, 0)
+// CHECK:     %[[BUFFER_0_1:.*]] = aie.buffer(%[[TILE_0_1]]) {sym_name = "buff_0"} : memref<4096xi32, 1 : i32>
+// CHECK:     %[[LOCK_0_1:.*]] = aie.lock(%[[TILE_0_1]], 0) {init = 1 : i8, sym_name = "lock_0"}
+// CHECK:     %[[LOCK_0_1_0:.*]] = aie.lock(%[[TILE_0_1]], 1) {init = 0 : i8, sym_name = "lock_1"}
+// CHECK:     %[[BUFFER_0_2:.*]] = aie.buffer(%[[TILE_0_2]]) {sym_name = "buff_1"} : memref<4096xi32, 2 : i32>
+// CHECK:     %[[LOCK_0_2:.*]] = aie.lock(%[[TILE_0_2]], 0) {init = 1 : i8, sym_name = "lock_2"}
+// CHECK:     %[[LOCK_0_2_1:.*]] = aie.lock(%[[TILE_0_2]], 1) {init = 0 : i8, sym_name = "lock_3"}
+// CHECK:     aie.flow(%[[TILE_0_0]], DMA : 0, %[[TILE_0_1]], DMA : 0)
+// CHECK:     aie.shim_dma_allocation @shim_0(MM2S, 0, 0)
+// CHECK:     aie.flow(%[[TILE_0_1]], DMA : 0, %[[TILE_0_2]], DMA : 0)
+// CHECK:     %[[MEMTILE_DMA_0_1:.*]] = aie.memtile_dma(%[[TILE_0_1]]) {
+// CHECK:       %[[VAL_0:.*]] = aie.dma_start(S2MM, 0, ^bb1, ^bb2)
+// CHECK:     ^bb1:
+// CHECK:       aie.use_lock(%[[LOCK_0_1]], AcquireGreaterEqual, 1)
+// CHECK:       aie.dma_bd(%[[BUFFER_0_1]] : memref<4096xi32, 1 : i32>) {len = 1024 : i32}
+// CHECK:       aie.use_lock(%[[LOCK_0_1_0]], Release, 1)
+// CHECK:       aie.next_bd ^bb1
+// CHECK:     ^bb2:
+// CHECK:       %[[VAL_1:.*]] = aie.dma_start(MM2S, 0, ^bb3, ^bb4)
+// CHECK:     ^bb3:
+// CHECK:       aie.use_lock(%[[LOCK_0_1_0]], AcquireGreaterEqual, 1)
+// CHECK:       aie.dma_bd(%[[BUFFER_0_1]] : memref<4096xi32, 1 : i32>) {dimensions = #aie<bd_dim_layout_array[<size = 32, stride = 64>, <size = 32, stride = 1>]>, len = 1024 : i32, offset = 1024 : i32}
+// CHECK:       aie.use_lock(%[[LOCK_0_1]], Release, 1)
+// CHECK:       aie.next_bd ^bb3
+// CHECK:     ^bb4:
+// CHECK:       aie.end
+// CHECK:     }
+// CHECK:     %[[MEM_0_2:.*]] = aie.mem(%[[TILE_0_2]]) {
+// CHECK:       %[[VAL_2:.*]] = aie.dma_start(S2MM, 0, ^bb1, ^bb2)
+// CHECK:     ^bb1:
+// CHECK:       aie.use_lock(%[[LOCK_0_2]], AcquireGreaterEqual, 1)
+// CHECK:       aie.dma_bd(%[[BUFFER_0_2]] : memref<4096xi32, 2 : i32>) {len = 0 : i32}
+// CHECK:       aie.use_lock(%[[LOCK_0_2_1]], Release, 1)
+// CHECK:       aie.next_bd ^bb1
+// CHECK:     ^bb2:
+// CHECK:       aie.end
+// CHECK:     }
+// CHECK:     aiex.runtime_sequence @single_connection_chain
 #executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}>
 module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} {
-  func.func @tile_and_core_and_release() {
+  func.func @single_connection_chain() {
     amdaie.workgroup {
       %c0 = arith.constant 0 : index
       %c1 = arith.constant 1 : index
@@ -383,24 +593,24 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb}
       %tile_0_0 = amdaie.tile(%c0, %c0)
       %tile_0_1 = amdaie.tile(%c0, %c1)
       %tile_0_2 = amdaie.tile(%c0, %c2)
-      %alloc_0 = memref.alloc() : memref<32x64xi32>
-      %alloc_1 = memref.alloc() : memref<32x32xi32, 1>
-      %alloc_2 = memref.alloc() : memref<4x8x4x8xi32, 2>
-      %obj0 = amdaie.logicalobjectfifo.from_memref %alloc_0, {%tile_0_0} : memref<32x64xi32> -> !amdaie.logicalobjectfifo<memref<2048xi32>>
-      %obj1 = amdaie.logicalobjectfifo.from_memref %alloc_1, {%tile_0_1} : memref<32x32xi32, 1> -> !amdaie.logicalobjectfifo<memref<1024xi32, 1>>
-      %obj2 = amdaie.logicalobjectfifo.from_memref %alloc_2, {%tile_0_2} : memref<4x8x4x8xi32, 2> -> !amdaie.logicalobjectfifo<memref<1024xi32, 2>>
-      %conn0 = amdaie.connection(%obj1, %obj2) : (!amdaie.logicalobjectfifo<memref<1024xi32, 1>>, !amdaie.logicalobjectfifo<memref<1024xi32, 2>>)
-      %conn1 = amdaie.connection(%obj0, %obj1) : (!amdaie.logicalobjectfifo<memref<2048xi32>>, !amdaie.logicalobjectfifo<memref<1024xi32, 1>>)
-      %core_0_0 = amdaie.core(%tile_0_2, in : [], out : [%conn0]) {
-        amdaie.logicalobjectfifo.release(%conn0, Produce) {size = 1 : i32}
-        amdaie.end
-      }
-      memref.dealloc %alloc_2 : memref<4x8x4x8xi32, 2>
-      memref.dealloc %alloc_1 : memref<32x32xi32, 1>
-      memref.dealloc %alloc_0 : memref<32x64xi32>
+      %buffer = amdaie.buffer(%tile_0_1) : memref<4096xi32, 1 : i32>
+      %lock = amdaie.lock(%tile_0_1(0), 1)
+      %lock_1 = amdaie.lock(%tile_0_1(1), 0)
+      %buffer_1 = amdaie.buffer(%tile_0_2) : memref<4096xi32, 2 : i32>
+      %lock_2 = amdaie.lock(%tile_0_2(0), 1)
+      %lock_3 = amdaie.lock(%tile_0_2(1), 0)
+      %0 = amdaie.logicalobjectfifo.placeholder{%tile_0_0} : !amdaie.logicalobjectfifo<memref<4096xi32>>
+      %1 = amdaie.logicalobjectfifo.from_buffers({%buffer}, {%lock}, {%lock_1}) : memref<4096xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<4096xi32, 1 : i32>, 1>
+      %2 = amdaie.logicalobjectfifo.from_buffers({%buffer_1}, {%lock_2}, {%lock_3}) : memref<4096xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<4096xi32, 2 : i32>, 1>
+      %channel = amdaie.channel(%tile_0_0, 0)
+      %channel_1 = amdaie.channel(%tile_0_1, 0)
+      %3 = amdaie.connection(%1 {%channel_1}, %0 {%channel}) : (!amdaie.logicalobjectfifo<memref<4096xi32, 1 : i32>, 1>, !amdaie.logicalobjectfifo<memref<4096xi32>, 1>)
+      %channel_2 = amdaie.channel(%tile_0_1, 0)
+      %channel_3 = amdaie.channel(%tile_0_2, 0)
+      %4 = amdaie.connection(%2 {%channel_3}, %1 {%channel_2}) : (!amdaie.logicalobjectfifo<memref<4096xi32, 2 : i32>, 1>, !amdaie.logicalobjectfifo<memref<4096xi32, 1 : i32>, 1>)
       amdaie.controlcode {
-        %npu_dma_0 = amdaie.npu.circular_dma_cpy_nd %conn0([] [] [], [] [] [])
-        %npu_dma_1 = amdaie.npu.circular_dma_cpy_nd %conn1([] [] [], [] [] [])
+        %5 = amdaie.npu.circular_dma_cpy_nd %3([0] [1024] [1], [0, 1024] [32, 32] [64, 1])
+        %6 = amdaie.npu.circular_dma_cpy_nd %4([] [] [], [0, 1024] [32, 32] [64, 1])
         amdaie.end
       }
     }
@@ -410,6 +620,10 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb}
 
 // -----
 
+//===----------------------------------------------------------------------===//
+// Controlcode tests
+//===----------------------------------------------------------------------===//
+
 #pipeline_layout = #hal.pipeline.layout<bindings = [<storage_buffer>]>
 #executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}>
 module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} {
@@ -418,31 +632,23 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb}
       %c0 = arith.constant 0 : index
       %c1 = arith.constant 1 : index
       %c2 = arith.constant 2 : index
-      %c32 = arith.constant 32 : index
-      %c64 = arith.constant 64 : index
-      %2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) : memref<32x64xi32>
-      memref.assume_alignment %2, 64 : memref<32x64xi32>
+      %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) : memref<4096xi32>
       %tile_0_0 = amdaie.tile(%c0, %c0)
       %tile_0_1 = amdaie.tile(%c0, %c1)
-      %tile_0_2 = amdaie.tile(%c0, %c2)
-      %alloc_1 = memref.alloc() : memref<32x32xi32, 1>
-      %alloc_2 = memref.alloc() : memref<4x8x4x8xi32, 2>
-      %placeholder = amdaie.logicalobjectfifo.placeholder{%tile_0_0} : !amdaie.logicalobjectfifo<memref<2048xi32>>
-      %obj1 = amdaie.logicalobjectfifo.from_memref %alloc_1, {%tile_0_1} : memref<32x32xi32, 1> -> !amdaie.logicalobjectfifo<memref<1024xi32, 1>>
-      %obj2 = amdaie.logicalobjectfifo.from_memref %alloc_2, {%tile_0_2} : memref<4x8x4x8xi32, 2> -> !amdaie.logicalobjectfifo<memref<1024xi32, 2>>
-      %conn0 = amdaie.connection(%obj1, %obj2) : (!amdaie.logicalobjectfifo<memref<1024xi32, 1>>, !amdaie.logicalobjectfifo<memref<1024xi32, 2>>)
-      %conn1 = amdaie.connection(%placeholder, %obj1) : (!amdaie.logicalobjectfifo<memref<2048xi32>>, !amdaie.logicalobjectfifo<memref<1024xi32, 1>>)
-      amdaie.logicalobjectfifo.link[%conn0] -> [%conn1] ()
-      memref.dealloc %alloc_2 : memref<4x8x4x8xi32, 2>
-      memref.dealloc %alloc_1 : memref<32x32xi32, 1>
+      %buffer = amdaie.buffer(%tile_0_1) : memref<4096xi32, 1 : i32>
+      %lock = amdaie.lock(%tile_0_1(0), 1)
+      %lock_1 = amdaie.lock(%tile_0_1(1), 0)
+      %1 = amdaie.logicalobjectfifo.placeholder{%tile_0_0} : !amdaie.logicalobjectfifo<memref<4096xi32>>
+      %2 = amdaie.logicalobjectfifo.from_buffers({%buffer}, {%lock}, {%lock_1}) : memref<4096xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<4096xi32, 1 : i32>, 1>
+      %channel = amdaie.channel(%tile_0_0, 0)
+      %channel_1 = amdaie.channel(%tile_0_1, 0)
+      %3 = amdaie.connection(%2 {%channel_1}, %1 {%channel}) : (!amdaie.logicalobjectfifo<memref<4096xi32, 1 : i32>, 1>, !amdaie.logicalobjectfifo<memref<4096xi32>, 1>)
       // expected-error @+1 {{could not convert to AIEDialect ops}}
       amdaie.controlcode {
-        %npu_dma_0 = amdaie.npu.circular_dma_cpy_nd %conn0([] [] [], [] [] [])
-        %npu_dma_1 = amdaie.npu.circular_dma_cpy_nd %conn1([] [] [], [] [] [])
-        %obj0 = amdaie.logicalobjectfifo.from_memref %2, {%tile_0_0} : memref<32x64xi32> -> !amdaie.logicalobjectfifo<memref<2048xi32>>
-        // expected-error @+1 {{'amdaie.npu.dma_cpy_nd' op must have a target BD ID op to lower to the AIE dialect}}
-        %npu_dma_2 = amdaie.npu.dma_cpy_nd %conn1(%obj0[%c0, %c32] [%c32, %c32] [%c64, %c1], [] [] []) : target_type = !amdaie.logicalobjectfifo<memref<2048xi32>>
-        amdaie.npu.dma_wait(%npu_dma_2, S2MM)
+        %4 = amdaie.npu.circular_dma_cpy_nd %3([0] [1024] [1], [] [] [])
+        %5 = amdaie.logicalobjectfifo.from_memref %0, {%tile_0_0} : memref<4096xi32> -> !amdaie.logicalobjectfifo<memref<4096xi32>>
+        // expected-error @+1 {{'amdaie.npu.dma_cpy_nd' op must have a source BD ID op to lower to the AIE dialect}}
+        %6 = amdaie.npu.dma_cpy_nd %3([] [] [], %5[0, 32] [32, 32] [64, 1]) : source_type = !amdaie.logicalobjectfifo<memref<4096xi32>>
         amdaie.end
       }
     }
@@ -453,8 +659,9 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb}
 // -----
 
 // CHECK:       aie.device
-// CHECK:       aiex.runtime_sequence @npu_dma_cpy_nd_with_repeat_already_on_outer_dim(%[[ARG0:.+]]: memref<32x64xi32>
-// CHECK:       aiex.npu.dma_memcpy_nd(0, 0, %[[ARG0]][0, 0, 0, 32][2, 1, 2, 32][2, 0, 16, 1])
+// CHECK:       aiex.runtime_sequence @npu_dma_cpy_nd_with_repeat_already_on_outer_dim(%[[ARG0:.+]]: memref<4096xi32>
+// CHECK:       aiex.npu.dma_memcpy_nd(0, 0, %[[ARG0]][0, 0, 0, 32][2, 1, 2, 32][2, 0, 16, 1]) {
+// CHECK-SAME:  id = 0 : i64
 #pipeline_layout = #hal.pipeline.layout<bindings = [<storage_buffer>]>
 #executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}>
 module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} {
@@ -463,27 +670,22 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb}
       %c0 = arith.constant 0 : index
       %c1 = arith.constant 1 : index
       %c2 = arith.constant 2 : index
-      %2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) : memref<32x64xi32>
-      memref.assume_alignment %2, 64 : memref<32x64xi32>
+      %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) : memref<4096xi32>
       %tile_0_0 = amdaie.tile(%c0, %c0)
       %tile_0_1 = amdaie.tile(%c0, %c1)
-      %tile_0_2 = amdaie.tile(%c0, %c2)
       %bd_id_0 = amdaie.bd_id(%tile_0_0, 0)
-      %alloc_1 = memref.alloc() : memref<32x32xi32, 1>
-      %alloc_2 = memref.alloc() : memref<4x8x4x8xi32, 2>
-      %placeholder = amdaie.logicalobjectfifo.placeholder{%tile_0_0} : !amdaie.logicalobjectfifo<memref<2048xi32>>
-      %obj1 = amdaie.logicalobjectfifo.from_memref %alloc_1, {%tile_0_1} : memref<32x32xi32, 1> -> !amdaie.logicalobjectfifo<memref<1024xi32, 1>>
-      %obj2 = amdaie.logicalobjectfifo.from_memref %alloc_2, {%tile_0_2} : memref<4x8x4x8xi32, 2> -> !amdaie.logicalobjectfifo<memref<1024xi32, 2>>
-      %conn0 = amdaie.connection(%obj1, %obj2) : (!amdaie.logicalobjectfifo<memref<1024xi32, 1>>, !amdaie.logicalobjectfifo<memref<1024xi32, 2>>)
-      %conn1 = amdaie.connection(%placeholder, %obj1) : (!amdaie.logicalobjectfifo<memref<2048xi32>>, !amdaie.logicalobjectfifo<memref<1024xi32, 1>>)
-      amdaie.logicalobjectfifo.link[%conn0] -> [%conn1] ()
-      memref.dealloc %alloc_2 : memref<4x8x4x8xi32, 2>
-      memref.dealloc %alloc_1 : memref<32x32xi32, 1>
+      %buffer = amdaie.buffer(%tile_0_1) : memref<4096xi32, 1 : i32>
+      %lock = amdaie.lock(%tile_0_1(0), 1)
+      %lock_1 = amdaie.lock(%tile_0_1(1), 0)
+      %1 = amdaie.logicalobjectfifo.placeholder{%tile_0_0} : !amdaie.logicalobjectfifo<memref<4096xi32>>
+      %2 = amdaie.logicalobjectfifo.from_buffers({%buffer}, {%lock}, {%lock_1}) : memref<4096xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<4096xi32, 1 : i32>, 1>
+      %channel = amdaie.channel(%tile_0_0, 0)
+      %channel_1 = amdaie.channel(%tile_0_1, 0)
+      %3 = amdaie.connection(%2 {%channel_1}, %1 {%channel}) : (!amdaie.logicalobjectfifo<memref<4096xi32, 1 : i32>, 1>, !amdaie.logicalobjectfifo<memref<4096xi32>, 1>)
       amdaie.controlcode {
-        %npu_dma_0 = amdaie.npu.circular_dma_cpy_nd %conn0([] [] [], [] [] [])
-        %npu_dma_1 = amdaie.npu.circular_dma_cpy_nd %conn1([] [] [], [] [] [])
-        %obj0 = amdaie.logicalobjectfifo.from_memref %2, {%tile_0_0} : memref<32x64xi32> -> !amdaie.logicalobjectfifo<memref<2048xi32>>
-        %npu_dma_2 = amdaie.npu.dma_cpy_nd %conn1(%obj0[0, 0, 0, 32] [2, 1, 2, 32] [2, 0, 16, 1] bd_id = %bd_id_0, [] [] []) : target_type = !amdaie.logicalobjectfifo<memref<2048xi32>>
+        %4 = amdaie.npu.circular_dma_cpy_nd %3([0] [1024] [1], [] [] [])
+        %5 = amdaie.logicalobjectfifo.from_memref %0, {%tile_0_0} : memref<4096xi32> -> !amdaie.logicalobjectfifo<memref<4096xi32>>
+        %6 = amdaie.npu.dma_cpy_nd %3([] [] [], %5[0, 0, 0, 32] [2, 1, 2, 32] [2, 0, 16, 1] bd_id = %bd_id_0) : source_type = !amdaie.logicalobjectfifo<memref<4096xi32>>
         amdaie.end
       }
     }
@@ -495,73 +697,64 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb}
 
 // Test to show mix of implicit/explicit source/target addressing in amdaie.npu.dma_cpy_nd.
 
-// CHECK:       aie.device
-// CHECK-DAG:   %[[TILE_0_2:.+]] = aie.tile(0, 2)
-// CHECK-DAG:   %[[TILE_0_1:.+]] = aie.tile(0, 1)
-// CHECK-DAG:   %[[TILE_0_0:.+]] = aie.tile(0, 0)
-// CHECK:       aie.objectfifo @[[OBJ0:.+]](%[[TILE_0_2]], {%[[TILE_0_1]]}
-// CHECK-NEXT:  aie.objectfifo @[[OBJ1:.+]](%[[TILE_0_1]], {%[[TILE_0_0]]}
-// CHECK-NEXT:  aie.objectfifo @[[OBJ2:.+]](%[[TILE_0_0]], {%[[TILE_0_1]]}
-// CHECK:       aie.objectfifo.link [@[[OBJ0]]] -> [@[[OBJ1]]]
-// CHECK:       aiex.runtime_sequence @controlcode(%[[ARG0:.+]]: memref<32x64xi32>
-// CHECK:         aiex.npu.dma_memcpy_nd
-// CHECK-SAME:            %[[ARG0]][0, 0, 0, 32][1, 1, 32, 32][0, 0, 64, 1]
-// CHECK-SAME:            issue_token = true
-// CHECK-SAME:            metadata = @[[OBJ1]]
-// CHECK-NEXT:    aiex.npu.dma_wait {symbol = @[[OBJ1]]}
-// CHECK:         aiex.npu.dma_memcpy_nd
-// CHECK-SAME:            %[[ARG0]][0, 0, 0, 0][1, 1, 1, 1024][0, 0, 0, 1]
-// CHECK-SAME:            issue_token = true
-// CHECK-SAME:            metadata = @[[OBJ1]]
-// CHECK-NEXT:    aiex.npu.dma_wait {symbol = @[[OBJ1]]}
-// CHECK:         aiex.npu.dma_memcpy_nd
-// CHECK-SAME:            %[[ARG0]][0, 0, 0, 32][1, 1, 32, 32][0, 0, 64, 1]
-// CHECK-SAME:            issue_token = true
-// CHECK-SAME:            metadata = @[[OBJ2]]
-// CHECK-NEXT:    aiex.npu.dma_wait {symbol = @[[OBJ2]]}
-// CHECK:         aiex.npu.dma_memcpy_nd
-// CHECK-SAME:            %[[ARG0]][0, 0, 0, 0][1, 1, 1, 2048][0, 0, 0, 1]
-// CHECK-SAME:            issue_token = true
-// CHECK-SAME:            metadata = @[[OBJ2]]
-// CHECK-NEXT:    aiex.npu.dma_wait {symbol = @[[OBJ2]]}
-#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}>
+// CHECK:   aie.device
+// CHECK:   memref.global "public" @[[SHIM_1:.+]] : memref<2048xi32>
+// CHECK:   memref.global "public" @[[SHIM_0:.+]] : memref<4096xi32>
+// CHECK:   aiex.runtime_sequence @controlcode(%[[ARG0:.+]]: memref<4096xi32>, %[[ARG1:.+]]: memref<2048xi32>)
+// CHECK:   aiex.npu.dma_memcpy_nd(0, 0, %[[ARG0]][0, 0, 0, 32][1, 1, 32, 32][0, 0, 64, 1]) {id = 0 : i64, issue_token = true, metadata = @[[SHIM_0]]} : memref<4096xi32>
+// CHECK:   aiex.npu.dma_wait {symbol = @[[SHIM_0]]}
+// CHECK:   aiex.npu.dma_memcpy_nd(0, 0, %[[ARG0]][0, 0, 0, 0][1, 1, 1, 2048][0, 0, 0, 1]) {id = 0 : i64, issue_token = true, metadata = @[[SHIM_0]]} : memref<4096xi32>
+// CHECK:   aiex.npu.dma_wait {symbol = @[[SHIM_0]]}
+// CHECK:   scf.forall
+// CHECK:     aiex.npu.dma_memcpy_nd(0, 0, %[[ARG1]][0, 0, 0, 32][1, 1, 32, 32][0, 0, 64, 1]) {id = 0 : i64, issue_token = true, metadata = @[[SHIM_1]]} : memref<2048xi32>
+// CHECK:     aiex.npu.dma_wait {symbol = @[[SHIM_1]]}
+// CHECK:     aiex.npu.dma_memcpy_nd(0, 0, %[[ARG1]][0, 0, 0, 0][1, 1, 1, 1024][0, 0, 0, 1]) {id = 0 : i64, issue_token = true, metadata = @[[SHIM_1]]} : memref<2048xi32>
+// CHECK:     aiex.npu.dma_wait {symbol = @[[SHIM_1]]}
+// CHECK:   }
 #pipeline_layout = #hal.pipeline.layout<bindings = [<storage_buffer>]>
+#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}>
 module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} {
   func.func @controlcode() {
-    %c2 = arith.constant 2 : index
-    %c1 = arith.constant 1 : index
-    %c0 = arith.constant 0 : index
     amdaie.workgroup {
-      %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) : memref<32x64xi32>
-      memref.assume_alignment %0, 64 : memref<32x64xi32>
-      %tile = amdaie.tile(%c0, %c0)
-      %tile_0 = amdaie.tile(%c0, %c1)
-      %tile_1 = amdaie.tile(%c0, %c2)
-      %bd_id = amdaie.bd_id(%tile, 0)
-      %alloc = memref.alloc() : memref<32x32xi32, 1>
-      %alloc_2 = memref.alloc() : memref<4x8x4x8xi32, 2>
-      %1 = amdaie.logicalobjectfifo.placeholder{%tile} : !amdaie.logicalobjectfifo<memref<2048xi32>>
-      %2 = amdaie.logicalobjectfifo.from_memref %alloc, {%tile_0} : memref<32x32xi32, 1> -> !amdaie.logicalobjectfifo<memref<1024xi32, 1>>
-      %3 = amdaie.logicalobjectfifo.from_memref %alloc_2, {%tile_1} : memref<4x8x4x8xi32, 2> -> !amdaie.logicalobjectfifo<memref<1024xi32, 2>>
-      %4 = amdaie.connection(%2, %3) : (!amdaie.logicalobjectfifo<memref<1024xi32, 1>>, !amdaie.logicalobjectfifo<memref<1024xi32, 2>>)
-      %5 = amdaie.connection(%1, %2) : (!amdaie.logicalobjectfifo<memref<2048xi32>>, !amdaie.logicalobjectfifo<memref<1024xi32, 1>>)
-      %6 = amdaie.connection(%2, %1) : (!amdaie.logicalobjectfifo<memref<1024xi32, 1>>, !amdaie.logicalobjectfifo<memref<2048xi32>>)
-      amdaie.logicalobjectfifo.link[%4] -> [%5] ()
-      memref.dealloc %alloc_2 : memref<4x8x4x8xi32, 2>
-      memref.dealloc %alloc : memref<32x32xi32, 1>
+      %c0 = arith.constant 0 : index
+      %c1 = arith.constant 1 : index
+      %c2 = arith.constant 2 : index
+      %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) : memref<4096xi32>
+      %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) : memref<2048xi32>
+      %tile_0_0 = amdaie.tile(%c0, %c0)
+      %tile_0_1 = amdaie.tile(%c0, %c1)
+      %bd_id_0 = amdaie.bd_id(%tile_0_0, 0)
+      %buffer = amdaie.buffer(%tile_0_1) : memref<4096xi32, 1 : i32>
+      %lock = amdaie.lock(%tile_0_1(0), 1)
+      %lock_1 = amdaie.lock(%tile_0_1(1), 0)
+      %buffer_1 = amdaie.buffer(%tile_0_1) : memref<2048xi32, 1 : i32>
+      %lock_2 = amdaie.lock(%tile_0_1(0), 1)
+      %lock_3 = amdaie.lock(%tile_0_1(1), 0)
+      %2 = amdaie.logicalobjectfifo.placeholder{%tile_0_0} : !amdaie.logicalobjectfifo<memref<4096xi32>>
+      %3 = amdaie.logicalobjectfifo.from_buffers({%buffer}, {%lock}, {%lock_1}) : memref<4096xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<4096xi32, 1 : i32>, 1>
+      %channel = amdaie.channel(%tile_0_0, 0)
+      %channel_1 = amdaie.channel(%tile_0_1, 0)
+      %4 = amdaie.connection(%3 {%channel_1}, %2 {%channel}) : (!amdaie.logicalobjectfifo<memref<4096xi32, 1 : i32>, 1>, !amdaie.logicalobjectfifo<memref<4096xi32>, 1>)
+      %5 = amdaie.logicalobjectfifo.placeholder{%tile_0_1} : !amdaie.logicalobjectfifo<memref<2048xi32>>
+      %6 = amdaie.logicalobjectfifo.from_buffers({%buffer_1}, {%lock_2}, {%lock_3}) : memref<2048xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2048xi32, 1 : i32>, 1>
+      %channel_2 = amdaie.channel(%tile_0_0, 1)
+      %channel_3 = amdaie.channel(%tile_0_1, 1)
+      %7 = amdaie.connection(%5 {%channel_2}, %6 {%channel_3}) : (!amdaie.logicalobjectfifo<memref<2048xi32>, 1>, !amdaie.logicalobjectfifo<memref<2048xi32, 1 : i32>, 1>)
       amdaie.controlcode {
-        %7 = amdaie.npu.circular_dma_cpy_nd %4([] [] [], [] [] [])
-        %8 = amdaie.npu.circular_dma_cpy_nd %5([] [] [], [] [] [])
-        %9 = amdaie.npu.circular_dma_cpy_nd %6([] [] [], [] [] [])
-        %10 = amdaie.logicalobjectfifo.from_memref %0, {%tile} : memref<32x64xi32> -> !amdaie.logicalobjectfifo<memref<2048xi32>>
-        %11 = amdaie.npu.dma_cpy_nd %5(%10[0, 0, 0, 32] [1, 1, 32, 32] [0, 0, 64, 1] bd_id = %bd_id, [] [] []) : target_type = !amdaie.logicalobjectfifo<memref<2048xi32>>
-        amdaie.npu.dma_wait(%11, S2MM)
-        %12 = amdaie.npu.dma_cpy_nd %5(%10[0, 0, 0, 0] [1, 1, 1, 1024] [0, 0, 0, 1] bd_id = %bd_id, [] [] []) : target_type = !amdaie.logicalobjectfifo<memref<2048xi32>>
-        amdaie.npu.dma_wait(%12, S2MM)
-        %13 = amdaie.npu.dma_cpy_nd %6([] [] [], %10[0, 0, 0, 32] [1, 1, 32, 32] [0, 0, 64, 1] bd_id = %bd_id) : source_type = !amdaie.logicalobjectfifo<memref<2048xi32>>
+        %8 = amdaie.npu.circular_dma_cpy_nd %4([0] [4096] [1], [] [] [])
+        %9 = amdaie.npu.circular_dma_cpy_nd %7([] [] [], [0] [2048] [1])
+        %10 = amdaie.logicalobjectfifo.from_memref %0, {%tile_0_0} : memref<4096xi32> -> !amdaie.logicalobjectfifo<memref<4096xi32>>
+        %11 = amdaie.logicalobjectfifo.from_memref %1, {%tile_0_0} : memref<2048xi32> -> !amdaie.logicalobjectfifo<memref<2048xi32>>
+        %12 = amdaie.npu.dma_cpy_nd %4([] [] [], %10[0, 0, 0, 32] [1, 1, 32, 32] [0, 0, 64, 1] bd_id = %bd_id_0) : source_type = !amdaie.logicalobjectfifo<memref<4096xi32>>
+        amdaie.npu.dma_wait(%12, MM2S)
+        %13 = amdaie.npu.dma_cpy_nd %4([] [] [], %10[0, 0, 0, 0] [1, 1, 1, 2048] [0, 0, 0, 1] bd_id = %bd_id_0) : source_type = !amdaie.logicalobjectfifo<memref<4096xi32>>
         amdaie.npu.dma_wait(%13, MM2S)
-        %14 = amdaie.npu.dma_cpy_nd %6([] [] [], %10[0, 0, 0, 0] [1, 1, 1, 2048] [0, 0, 0, 1] bd_id = %bd_id) : source_type = !amdaie.logicalobjectfifo<memref<2048xi32>>
-        amdaie.npu.dma_wait(%14, MM2S)
+        scf.forall (%arg0, %arg1) in (2, 1) {
+          %14 = amdaie.npu.dma_cpy_nd %7(%11[0, 0, 0, 32] [1, 1, 32, 32] [0, 0, 64, 1] bd_id = %bd_id_0, [] [] []) : target_type = !amdaie.logicalobjectfifo<memref<2048xi32>>
+          amdaie.npu.dma_wait(%14, S2MM)
+          %15 = amdaie.npu.dma_cpy_nd %7(%11[0, 0, 0, 0] [1, 1, 1, 1024] [0, 0, 0, 1] bd_id = %bd_id_0, [] [] []) : target_type = !amdaie.logicalobjectfifo<memref<2048xi32>>
+          amdaie.npu.dma_wait(%15, S2MM)
+        }
         amdaie.end
       }
     }
@@ -571,71 +764,89 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb}
 
 // -----
 
-// CHECK:       aie.device(npu1_4col) {
-// CHECK-DAG:     %[[TILE_0_0:.*]] = aie.tile(0, 0)
-// CHECK-DAG:     %[[TILE_0_1:.*]] = aie.tile(0, 1)
-// CHECK-DAG:     %[[TILE_1_0:.*]] = aie.tile(1, 0)
-// CHECK:         aie.objectfifo @[[OBJ0:.*]](%[[TILE_0_0]], {%[[TILE_0_1]]}, 2 : i32) : !aie.objectfifo<memref<1024xbf16, 1 : i32>>
-// CHECK:         aie.objectfifo @[[OBJ1:.*]](%[[TILE_1_0]], {%[[TILE_0_1]]}, 2 : i32) : !aie.objectfifo<memref<1024xbf16, 1 : i32>>
-// CHECK:         aie.objectfifo @[[OBJ2:.*]](%[[TILE_0_1]]
-// CHECK-SAME:    {%[[TILE_1_0]]}, 2 : i32) : !aie.objectfifo<memref<1024xf32>>
-// CHECK:         aiex.runtime_sequence @bf16_f32_lit_test
-// CHECK-SAME:         (%[[LHS:.*]]: memref<32x32xbf16>, %[[RHS:.*]]: memref<32x32xbf16>, %[[OUT:.*]]: memref<32x32xf32>) {
-// CHECK:           aiex.npu.dma_memcpy_nd
-// CHECK-SAME:          %[[OUT]][0, 0, 0, 0][1, 1, 1, 1024][0, 0, 0, 1]
-// CHECK-SAME:          issue_token = true
-// CHECK-SAME:          metadata = @[[OBJ2]]
-// CHECK-SAME:          memref<32x32xf32>
-// CHECK:           aiex.npu.dma_memcpy_nd
-// CHECK-SAME:          %[[RHS]][0, 0, 1, 2][1, 2, 32, 16][0, 16, 32, 1]
-// CHECK-SAME:          metadata = @[[OBJ1]]
-// CHECK-SAME:          memref<32x32xbf16>
-// CHECK:           aiex.npu.dma_memcpy_nd
-// CHECK-SAME:          %[[LHS]][0, 0, 0, 0][1, 1, 1, 1024][0, 0, 0, 1]
-// CHECK-SAME:          metadata = @[[OBJ0]]
-// CHECK-SAME:          memref<32x32xbf16>
+// CHECK:   aie.device
+// CHECK:   memref.global "public" @[[SHIM_1:.+]] : memref<2048xf32>
+// CHECK:   memref.global "public" @[[SHIM_0:.+]] : memref<4096xbf16>
+// CHECK:   aiex.runtime_sequence @controlcode_bf16_f32(%[[ARG0:.+]]: memref<4096xbf16>, %[[ARG1:.+]]: memref<2048xf32>)
+// CHECK:   aiex.npu.dma_memcpy_nd(0, 0, %[[ARG0]][0, 0, 1, 2][1, 2, 32, 16][0, 16, 32, 1]) {id = 0 : i64, issue_token = true, metadata = @[[SHIM_0]]} : memref<4096xbf16>
+// CHECK:   aiex.npu.dma_wait {symbol = @[[SHIM_0]]}
+// CHECK:   aiex.npu.dma_memcpy_nd(0, 0, %[[ARG0]][0, 0, 0, 0][1, 1, 1, 1024][0, 0, 0, 1]) {id = 0 : i64, issue_token = true, metadata = @[[SHIM_0]]} : memref<4096xbf16>
+// CHECK:   aiex.npu.dma_wait {symbol = @[[SHIM_0]]}
+// CHECK:   aiex.npu.dma_memcpy_nd(0, 0, %[[ARG1]][0, 0, 0, 32][1, 1, 32, 32][0, 0, 64, 1]) {id = 0 : i64, issue_token = true, metadata = @[[SHIM_1]]} : memref<2048xf32>
+// CHECK:   aiex.npu.dma_wait {symbol = @[[SHIM_1]]}
+// CHECK:   aiex.npu.dma_memcpy_nd(0, 0, %[[ARG1]][0, 0, 0, 0][1, 1, 1, 1024][0, 0, 0, 1]) {id = 0 : i64, issue_token = true, metadata = @[[SHIM_1]]} : memref<2048xf32>
+// CHECK:   aiex.npu.dma_wait {symbol = @[[SHIM_1]]}
+#pipeline_layout = #hal.pipeline.layout<bindings = [<storage_buffer>]>
 #executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}>
-#pipeline_layout = #hal.pipeline.layout<bindings = [<storage_buffer>, <storage_buffer>, <storage_buffer>]>
 module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} {
-  func.func @bf16_f32_lit_test() {
-    %c1 = arith.constant 1 : index
-    %c0 = arith.constant 0 : index
+  func.func @controlcode_bf16_f32() {
+    amdaie.workgroup {
+      %c0 = arith.constant 0 : index
+      %c1 = arith.constant 1 : index
+      %c2 = arith.constant 2 : index
+      %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) : memref<4096xbf16>
+      %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) : memref<2048xf32>
+      %tile_0_0 = amdaie.tile(%c0, %c0)
+      %tile_0_1 = amdaie.tile(%c0, %c1)
+      %bd_id_0 = amdaie.bd_id(%tile_0_0, 0)
+      %buffer = amdaie.buffer(%tile_0_1) : memref<4096xbf16, 1 : i32>
+      %lock = amdaie.lock(%tile_0_1(0), 1)
+      %lock_1 = amdaie.lock(%tile_0_1(1), 0)
+      %buffer_1 = amdaie.buffer(%tile_0_1) : memref<2048xf32, 1 : i32>
+      %lock_2 = amdaie.lock(%tile_0_1(0), 1)
+      %lock_3 = amdaie.lock(%tile_0_1(1), 0)
+      %2 = amdaie.logicalobjectfifo.placeholder{%tile_0_0} : !amdaie.logicalobjectfifo<memref<4096xbf16>>
+      %3 = amdaie.logicalobjectfifo.from_buffers({%buffer}, {%lock}, {%lock_1}) : memref<4096xbf16, 1 : i32> -> !amdaie.logicalobjectfifo<memref<4096xbf16, 1 : i32>, 1>
+      %channel = amdaie.channel(%tile_0_0, 0)
+      %channel_1 = amdaie.channel(%tile_0_1, 0)
+      %4 = amdaie.connection(%3 {%channel_1}, %2 {%channel}) : (!amdaie.logicalobjectfifo<memref<4096xbf16, 1 : i32>, 1>, !amdaie.logicalobjectfifo<memref<4096xbf16>, 1>)
+      %5 = amdaie.logicalobjectfifo.placeholder{%tile_0_1} : !amdaie.logicalobjectfifo<memref<2048xf32>>
+      %6 = amdaie.logicalobjectfifo.from_buffers({%buffer_1}, {%lock_2}, {%lock_3}) : memref<2048xf32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2048xf32, 1 : i32>, 1>
+      %channel_2 = amdaie.channel(%tile_0_0, 1)
+      %channel_3 = amdaie.channel(%tile_0_1, 1)
+      %7 = amdaie.connection(%5 {%channel_2}, %6 {%channel_3}) : (!amdaie.logicalobjectfifo<memref<2048xf32>, 1>, !amdaie.logicalobjectfifo<memref<2048xf32, 1 : i32>, 1>)
+      amdaie.controlcode {
+        %8 = amdaie.npu.circular_dma_cpy_nd %4([0] [4096] [1], [] [] [])
+        %9 = amdaie.npu.circular_dma_cpy_nd %7([] [] [], [0] [2048] [1])
+        %10 = amdaie.logicalobjectfifo.from_memref %0, {%tile_0_0} : memref<4096xbf16> -> !amdaie.logicalobjectfifo<memref<4096xbf16>>
+        %11 = amdaie.logicalobjectfifo.from_memref %1, {%tile_0_0} : memref<2048xf32> -> !amdaie.logicalobjectfifo<memref<2048xf32>>
+        %12 = amdaie.npu.dma_cpy_nd %4([] [] [], %10[0, 0, 1, 2] [1, 2, 32, 16] [0, 16, 32, 1] bd_id = %bd_id_0) : source_type = !amdaie.logicalobjectfifo<memref<4096xbf16>>
+        amdaie.npu.dma_wait(%12, MM2S)
+        %13 = amdaie.npu.dma_cpy_nd %4([] [] [], %10[0, 0, 0, 0] [1, 1, 1, 1024] [0, 0, 0, 1] bd_id = %bd_id_0) : source_type = !amdaie.logicalobjectfifo<memref<4096xbf16>>
+        amdaie.npu.dma_wait(%13, MM2S)
+        %14 = amdaie.npu.dma_cpy_nd %7(%11[0, 0, 0, 32] [1, 1, 32, 32] [0, 0, 64, 1] bd_id = %bd_id_0, [] [] []) : target_type = !amdaie.logicalobjectfifo<memref<2048xf32>>
+        amdaie.npu.dma_wait(%14, S2MM)
+        %15 = amdaie.npu.dma_cpy_nd %7(%11[0, 0, 0, 0] [1, 1, 1, 1024] [0, 0, 0, 1] bd_id = %bd_id_0, [] [] []) : target_type = !amdaie.logicalobjectfifo<memref<2048xf32>>
+        amdaie.npu.dma_wait(%15, S2MM)
+        amdaie.end
+      }
+    }
+    return
+  }
+}
+
+// -----
+
+//===----------------------------------------------------------------------===//
+// CoreOp tests
+//===----------------------------------------------------------------------===//
+
+// CHECK:   aie.device
+// CHECK:     %[[TILE_0_2:.+]] = aie.tile(0, 2)
+// CHECK:     aie.core(%[[TILE_0_2]]) {
+// CHECK:       aie.end
+// CHECK:     }
+#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}>
+module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} {
+  func.func @core() {
     amdaie.workgroup {
-      %alloc = memref.alloc() : memref<2x2x16x16xf32, 1 : i32>
-      %alloc_0 = memref.alloc() : memref<1x2x32x16xbf16, 1 : i32>
-      %tile = amdaie.tile(%c0, %c1)
-      %0 = amdaie.logicalobjectfifo.from_memref %alloc, {%tile} : memref<2x2x16x16xf32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x2x16x16xf32, 1 : i32>, 2>
-      %1 = amdaie.logicalobjectfifo.from_memref %alloc_0, {%tile} : memref<1x2x32x16xbf16, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x2x32x16xbf16, 1 : i32>, 2>
-      %2 = amdaie.logicalobjectfifo.from_memref %alloc_0, {%tile} : memref<1x2x32x16xbf16, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x1x16x32xbf16, 1 : i32>, 2>
-      %3 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : memref<32x32xbf16>
-      %tile_1 = amdaie.tile(%c0, %c0)
-      %tile_2 = amdaie.tile(%c1, %c0)
-      %bd_id = amdaie.bd_id(%tile_1, 2)
-      %bd_id_3 = amdaie.bd_id(%tile_1, 1)
-      %bd_id_4 = amdaie.bd_id(%tile_1, 0)
-      %4 = amdaie.logicalobjectfifo.placeholder{%tile_1} : !amdaie.logicalobjectfifo<memref<32x32xbf16>>
-      memref.assume_alignment %3, 64 : memref<32x32xbf16>
-      %5 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : memref<32x32xbf16>
-      %6 = amdaie.logicalobjectfifo.placeholder{%tile_2} : !amdaie.logicalobjectfifo<memref<32x32xbf16>>
-      memref.assume_alignment %5, 64 : memref<32x32xbf16>
-      %7 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%c0) : memref<32x32xf32>
-      %8 = amdaie.logicalobjectfifo.placeholder{%tile_2} : !amdaie.logicalobjectfifo<memref<1024xf32>>
-      %9 = amdaie.connection(%2, %4) : (!amdaie.logicalobjectfifo<memref<2x1x16x32xbf16, 1 : i32>, 2>, !amdaie.logicalobjectfifo<memref<32x32xbf16>>)
-      %10 = amdaie.connection(%1, %6) : (!amdaie.logicalobjectfifo<memref<1x2x32x16xbf16, 1 : i32>, 2>, !amdaie.logicalobjectfifo<memref<32x32xbf16>>)
-      %11 = amdaie.connection(%8, %0) : (!amdaie.logicalobjectfifo<memref<1024xf32>>, !amdaie.logicalobjectfifo<memref<2x2x16x16xf32, 1 : i32>, 2>)
+      %c0 = arith.constant 0 : index
+      %c2 = arith.constant 2 : index
+      %tile_0_2 = amdaie.tile(%c0, %c2)
+      %core_0_0 = amdaie.core(%tile_0_2, in : [], out : []) {
+        amdaie.end
+      }
       amdaie.controlcode {
-        %12 = amdaie.npu.circular_dma_cpy_nd %9([] [] [], [] [] [])
-        %13 = amdaie.npu.circular_dma_cpy_nd %10([] [] [], [] [] [])
-        %14 = amdaie.npu.circular_dma_cpy_nd %11([] [] [], [0, 0, 0, 0] [2, 16, 2, 16] [512, 16, 256, 1])
-        %15 = amdaie.logicalobjectfifo.from_memref %3, {%tile_1} : memref<32x32xbf16> -> !amdaie.logicalobjectfifo<memref<32x32xbf16>>
-        %16 = amdaie.logicalobjectfifo.from_memref %5, {%tile_1} : memref<32x32xbf16> -> !amdaie.logicalobjectfifo<memref<32x32xbf16>>
-        %17 = amdaie.logicalobjectfifo.from_memref %7, {%tile_1} : memref<32x32xf32> -> !amdaie.logicalobjectfifo<memref<1024xf32>>
-        %18 = amdaie.npu.dma_cpy_nd %11(%17[0, 0, 0, 0] [1, 1, 1, 1024] [0, 0, 0, 1] bd_id = %bd_id_4, [] [] []) : target_type = !amdaie.logicalobjectfifo<memref<1024xf32>>
-        %19 = amdaie.npu.dma_cpy_nd %10([] [] [], %16[0, 0, 1, 2] [1, 2, 32, 16] [0, 16, 32, 1] bd_id = %bd_id_3) : source_type = !amdaie.logicalobjectfifo<memref<32x32xbf16>>
-        %20 = amdaie.npu.dma_cpy_nd %9([] [] [], %15[0, 0, 0, 0] [1, 1, 1, 1024] [0, 0, 0, 1] bd_id = %bd_id) : source_type = !amdaie.logicalobjectfifo<memref<32x32xbf16>>
-        amdaie.npu.dma_wait(%18, S2MM)
-        amdaie.npu.dma_wait(%19, MM2S)
-        amdaie.npu.dma_wait(%20, MM2S)
         amdaie.end
       }
     }
@@ -645,102 +856,285 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb}
 
 // -----
 
-// CHECK:       aie.device
-// CHECK-DAG:   %[[TILE_1_2:.+]] = aie.tile(1, 2)
-// CHECK-DAG:   %[[TILE_0_2:.+]] = aie.tile(0, 2)
-// CHECK-DAG:   %[[TILE_0_1:.+]] = aie.tile(0, 1)
-// CHECK-DAG:   %[[TILE_0_0:.+]] = aie.tile(0, 0)
-// CHECK-DAG:   %[[C0:.+]] = arith.constant 0 : index
-// CHECK-DAG:   %[[C1:.+]] = arith.constant 1 : index
-// CHECK-DAG:   %[[C8:.+]] = arith.constant 8 : index
-// CHECK:       aie.objectfifo @[[OBJ0:.+]](%[[TILE_0_0]], {%[[TILE_0_1]]}
-// CHECK-NEXT:  aie.objectfifo @[[OBJ1:.+]](%[[TILE_0_1]], {%[[TILE_0_2]], %[[TILE_1_2]]}
-// CHECK-NEXT:  aie.objectfifo.link
-// CHECK-SAME:  @[[OBJ0]]
-// CHECK-SAME:  @[[OBJ1]]
-// CHECK:       aie.core(%[[TILE_0_2]])
-// CHECK:         %[[ACQUIRE_0:.+]] = aie.objectfifo.acquire @[[OBJ1]](Consume, 1)
-// CHECK:         %[[ACCESS_0:.+]] = aie.objectfifo.subview.access %[[ACQUIRE_0]]
-// CHECK:         %[[REINTERPRET_0:.+]] = memref.reinterpret_cast %[[ACCESS_0]]
-// CHECK:         scf.for %{{.+}} = %[[C0]] to %[[C8]] step %[[C1]]
-// CHECK:           linalg.fill
-// CHECK-SAME:      %[[REINTERPRET_0]]
-// CHECK:         }
-// CHECK:         aie.objectfifo.release
-// CHECK-SAME:    @[[OBJ1]]
-// CHECK:       aie.core(%[[TILE_1_2]])
-// CHECK:         %[[ACQUIRE_1:.+]] = aie.objectfifo.acquire @[[OBJ1]](Consume, 1)
-// CHECK:         %[[ACCESS_1:.+]] = aie.objectfifo.subview.access %[[ACQUIRE_1]]
-// CHECK:         %[[REINTERPRET_1:.+]] = memref.reinterpret_cast %[[ACCESS_1]]
-// CHECK:         scf.for %{{.+}} = %[[C0]] to %[[C8]] step %[[C1]]
-// CHECK:           linalg.fill
-// CHECK-SAME:      %[[REINTERPRET_1]]
-// CHECK:         }
-// CHECK:         aie.objectfifo.release
-// CHECK-SAME:    @[[OBJ1]]
-// CHECK:       aiex.runtime_sequence @large_example
-// CHECK-SAME:  %[[ARG0:.+]]: memref<32x64xi32>
-// CHECK:         aiex.npu.dma_memcpy_nd
-// CHECK-SAME:    %[[ARG0]]
-// CHECK-SAME:    [0, 0, 0, 32]
-// CHECK-SAME:    [1, 1, 32, 32]
-// CHECK-SAME:    [0, 0, 64, 1]
-// CHECK-SAME:    issue_token = true
-// CHECK-SAME:    @[[OBJ0]]
-// CHECK-NEXT:    aiex.npu.dma_wait
-// CHECK-SAME:    @[[OBJ0]]
+// CHECK:   aie.device
+// CHECK:     %[[TILE_0_2:.*]] = aie.tile(0, 2)
+// CHECK:     %[[C0_I32:.*]] = arith.constant 0 : i32
+// CHECK:     %[[BUFFER_0_2:.*]] = aie.buffer(%[[TILE_0_2]]) {sym_name = "buff_0"} : memref<4096xi32, 2 : i32>
+// CHECK:     %[[LOCK_0_2:.*]] = aie.lock(%[[TILE_0_2]], 0) {init = 1 : i8, sym_name = "lock_0"}
+// CHECK:     %[[LOCK_0_2_0:.*]] = aie.lock(%[[TILE_0_2]], 1) {init = 0 : i8, sym_name = "lock_1"}
+// CHECK:     %[[CORE_0_2:.*]] = aie.core(%[[TILE_0_2]]) {
+// CHECK:       aie.use_lock(%[[LOCK_0_2_0]], AcquireGreaterEqual, 1)
+// CHECK:       %[[REINTERPRET_CAST:.*]] = memref.reinterpret_cast %[[BUFFER_0_2]] to offset: [0], sizes: [64, 64], strides: [64, 1] : memref<4096xi32, 2 : i32> to memref<64x64xi32, 2 : i32>
+// CHECK:       linalg.fill ins(%[[C0_I32]] : i32) outs(%[[REINTERPRET_CAST]] : memref<64x64xi32, 2 : i32>)
+// CHECK:       aie.use_lock(%[[LOCK_0_2]], Release, 1)
+// CHECK:       aie.end
+// CHECK:     }
+// CHECK:     aiex.runtime_sequence @core_acquire_release
+#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}>
+module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} {
+  func.func @core_acquire_release() {  // test case: buffer, lock and use_lock ops used inside a single core
+    amdaie.workgroup {
+      %c0_i32 = arith.constant 0 : i32
+      %c0 = arith.constant 0 : index
+      %c2 = arith.constant 2 : index
+      %tile_0_2 = amdaie.tile(%c0, %c2)  // expected output: aie.tile(0, 2)
+      %buffer = amdaie.buffer(%tile_0_2) : memref<4096xi32, 2 : i32>  // expected output: aie.buffer with sym_name "buff_0"
+      %lock = amdaie.lock(%tile_0_2(0), 1)  // expected output: aie.lock(tile, 0) with init = 1, sym_name "lock_0"
+      %lock_1 = amdaie.lock(%tile_0_2(1), 0)  // expected output: aie.lock(tile, 1) with init = 0, sym_name "lock_1"
+      %core_0_0 = amdaie.core(%tile_0_2, in : [], out : []) {
+        amdaie.use_lock(%lock_1, AcquireGreaterOrEqual(1))  // expected output: aie.use_lock(..., AcquireGreaterEqual, 1)
+        %3 = memref.reinterpret_cast %buffer to offset: [0], sizes: [64, 64], strides: [64, 1] : memref<4096xi32, 2 : i32> to memref<64x64xi32, 2 : i32>
+        linalg.fill ins(%c0_i32 : i32) outs(%3 : memref<64x64xi32, 2 : i32>)
+        amdaie.use_lock(%lock, Release(1))  // expected output: aie.use_lock(..., Release, 1) on the other lock
+        amdaie.end
+      }
+      amdaie.controlcode {  // empty control code; the expected output still contains aiex.runtime_sequence @core_acquire_release
+        amdaie.end
+      }
+    }
+    return
+  }
+}
+
+// -----
+
+// CHECK:   aie.device
+// CHECK:     func.func private @ukernel_B(memref<i32, 2 : i32>, index, memref<f32, 2 : i32>, index) attributes {llvm.bareptr = true}
+// CHECK:     func.func private @ukernel_A(memref<i32, 2 : i32>, index) attributes {llvm.bareptr = true}
+// CHECK:     %[[TILE_0_2:.*]] = aie.tile(0, 2)
+// CHECK:     %[[BUFFER_0_2:.*]] = aie.buffer(%[[TILE_0_2]]) {sym_name = "buff_0"} : memref<4096xi32, 2 : i32>
+// CHECK:     %[[LOCK_0_2:.*]] = aie.lock(%[[TILE_0_2]], 0) {init = 1 : i8, sym_name = "lock_0"}
+// CHECK:     %[[LOCK_0_2_0:.*]] = aie.lock(%[[TILE_0_2]], 1) {init = 0 : i8, sym_name = "lock_1"}
+// CHECK:     %[[BUFFER_0_2_1:.*]] = aie.buffer(%[[TILE_0_2]]) {sym_name = "buff_1"} : memref<4096xf32, 2 : i32>
+// CHECK:     %[[LOCK_0_2_2:.*]] = aie.lock(%[[TILE_0_2]], 2) {init = 1 : i8, sym_name = "lock_2"}
+// CHECK:     %[[LOCK_0_2_3:.*]] = aie.lock(%[[TILE_0_2]], 3) {init = 0 : i8, sym_name = "lock_3"}
+// CHECK:     %[[CORE_0_2:.*]] = aie.core(%[[TILE_0_2]]) {
+// CHECK:       aie.use_lock(%[[LOCK_0_2_0]], AcquireGreaterEqual, 1)
+// CHECK:       %[[REINTERPRET_CAST:.*]] = memref.reinterpret_cast %[[BUFFER_0_2]] to offset: [0], sizes: [64, 64], strides: [64, 1] : memref<4096xi32, 2 : i32> to memref<64x64xi32, 2 : i32>
+// CHECK:       aie.use_lock(%[[LOCK_0_2_3]], AcquireGreaterEqual, 1)
+// CHECK:       %[[REINTERPRET_CAST_4:.*]] = memref.reinterpret_cast %[[BUFFER_0_2_1]] to offset: [0], sizes: [64, 64], strides: [64, 1] : memref<4096xf32, 2 : i32> to memref<64x64xf32, 2 : i32>
+// CHECK:       %[[BASE_BUFFER:.*]], %[[OFFSET:.*]], %[[SIZES:.*]]:2, %[[STRIDES:.*]]:2 = memref.extract_strided_metadata %[[REINTERPRET_CAST]] : memref<64x64xi32, 2 : i32> -> memref<i32, 2 : i32>, index, index, index, index, index
+// CHECK:       %[[BASE_BUFFER_5:.*]], %[[OFFSET_6:.*]], %[[SIZES_7:.*]]:2, %[[STRIDES_8:.*]]:2 = memref.extract_strided_metadata %[[REINTERPRET_CAST_4]] : memref<64x64xf32, 2 : i32> -> memref<f32, 2 : i32>, index, index, index, index, index
+// CHECK:       func.call @ukernel_A(%[[BASE_BUFFER]], %[[C0]]) : (memref<i32, 2 : i32>, index) -> ()
+// CHECK:       func.call @ukernel_B(%[[BASE_BUFFER]], %[[C0]], %[[BASE_BUFFER_5]], %[[C0]]) : (memref<i32, 2 : i32>, index, memref<f32, 2 : i32>, index) -> ()
+// CHECK:       aie.use_lock(%[[LOCK_0_2]], Release, 1)
+// CHECK:       aie.use_lock(%[[LOCK_0_2_2]], Release, 1)
+// CHECK:       aie.end
+// CHECK:     } {link_with = "/path/to/ukernel.o"}
+// CHECK:     aiex.runtime_sequence @core_ukernel
+#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}>
+module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} {
+  func.func private @ukernel_A(memref<i32, 2 : i32>, index) attributes {link_with = "/path/to/ukernel.o", llvm.bareptr = true}  // expected output keeps llvm.bareptr on this declaration but not link_with
+  func.func private @ukernel_B(memref<i32, 2 : i32>, index, memref<f32, 2 : i32>, index) attributes {link_with = "/path/to/ukernel.o", llvm.bareptr = true}  // NOTE(review): the expected output lists @ukernel_B before @ukernel_A, i.e. the reverse of this order
+  func.func @core_ukernel() {  // test case: a core that calls external ukernel functions
+    amdaie.workgroup {
+      %c0 = arith.constant 0 : index  // NOTE(review): also the index argument of both calls; the expectations use a C0 capture that is not bound in this test case's visible expectation lines, presumably bound earlier in the file, confirm
+      %c2 = arith.constant 2 : index
+      %tile_0_2 = amdaie.tile(%c0, %c2)  // expected output: aie.tile(0, 2)
+      %buffer = amdaie.buffer(%tile_0_2) : memref<4096xi32, 2 : i32>  // expected output: "buff_0"
+      %lock = amdaie.lock(%tile_0_2(0), 1)  // lock id 0, init 1
+      %lock_1 = amdaie.lock(%tile_0_2(1), 0)  // lock id 1, init 0
+      %buffer_1 = amdaie.buffer(%tile_0_2) : memref<4096xf32, 2 : i32>  // expected output: "buff_1"
+      %lock_2 = amdaie.lock(%tile_0_2(2), 1)  // lock id 2, init 1
+      %lock_3 = amdaie.lock(%tile_0_2(3), 0)  // lock id 3, init 0
+      %core_0_0 = amdaie.core(%tile_0_2, in : [], out : []) {
+        amdaie.use_lock(%lock_1, AcquireGreaterOrEqual(1))
+        %3 = memref.reinterpret_cast %buffer to offset: [0], sizes: [64, 64], strides: [64, 1] : memref<4096xi32, 2 : i32> to memref<64x64xi32, 2 : i32>
+        amdaie.use_lock(%lock_3, AcquireGreaterOrEqual(1))
+        %4 = memref.reinterpret_cast %buffer_1 to offset: [0], sizes: [64, 64], strides: [64, 1] : memref<4096xf32, 2 : i32> to memref<64x64xf32, 2 : i32>
+        %base_buffer, %offset, %sizes:2, %strides:2 = memref.extract_strided_metadata %3 : memref<64x64xi32, 2 : i32> -> memref<i32, 2 : i32>, index, index, index, index, index
+        %base_buffer0, %offset0, %sizes0:2, %strides0:2 = memref.extract_strided_metadata %4 : memref<64x64xf32, 2 : i32> -> memref<f32, 2 : i32>, index, index, index, index, index
+        func.call @ukernel_A(%base_buffer, %c0) : (memref<i32, 2 : i32>, index) -> ()  // only the i32 base buffer is passed
+        func.call @ukernel_B(%base_buffer, %c0, %base_buffer0, %c0) : (memref<i32, 2 : i32>, index, memref<f32, 2 : i32>, index) -> ()  // both base buffers are passed
+        amdaie.use_lock(%lock, Release(1))
+        amdaie.use_lock(%lock_2, Release(1))
+        amdaie.end
+      } {link_with = "/path/to/ukernel.o"}  // the expected aie.core carries the same link_with attribute
+      amdaie.controlcode {  // empty control code; the expected output still contains aiex.runtime_sequence @core_ukernel
+        amdaie.end
+      }
+    }
+    return
+  }
+}
+
+// -----
+
+//===----------------------------------------------------------------------===//
+// Larger tests
+//===----------------------------------------------------------------------===//
+
+// CHECK:   aie.device(npu1_4col) {
+// CHECK:     memref.global "public" @[[SHIM_0:.+]] : memref<4096xi32>
+// CHECK:     %[[TILE_1_2:.*]] = aie.tile(1, 2)
+// CHECK:     %[[TILE_0_2:.*]] = aie.tile(0, 2)
+// CHECK:     %[[TILE_0_1:.*]] = aie.tile(0, 1)
+// CHECK:     %[[TILE_0_0:.*]] = aie.tile(0, 0)
+// CHECK:     %[[C0_I32:.*]] = arith.constant 0 : i32
+// CHECK:     %[[BUFFER_0_1:.*]] = aie.buffer(%[[TILE_0_1]]) {sym_name = "buff_0"} : memref<4096xi32, 1 : i32>
+// CHECK:     %[[BUFFER_0_1_0:.*]] = aie.buffer(%[[TILE_0_1]]) {sym_name = "buff_1"} : memref<4096xi32, 1 : i32>
+// CHECK:     %[[LOCK_0_1:.*]] = aie.lock(%[[TILE_0_1]], 0) {init = 2 : i8, sym_name = "lock_0"}
+// CHECK:     %[[LOCK_0_1_1:.*]] = aie.lock(%[[TILE_0_1]], 1) {init = 0 : i8, sym_name = "lock_1"}
+// CHECK:     %[[BUFFER_0_2:.*]] = aie.buffer(%[[TILE_0_2]]) {sym_name = "buff_2"} : memref<4096xi32, 2 : i32>
+// CHECK:     %[[BUFFER_0_2_2:.*]] = aie.buffer(%[[TILE_0_2]]) {sym_name = "buff_3"} : memref<4096xi32, 2 : i32>
+// CHECK:     %[[LOCK_0_2:.*]] = aie.lock(%[[TILE_0_2]], 0) {init = 2 : i8, sym_name = "lock_2"}
+// CHECK:     %[[LOCK_0_2_3:.*]] = aie.lock(%[[TILE_0_2]], 1) {init = 0 : i8, sym_name = "lock_3"}
+// CHECK:     %[[BUFFER_1_2:.*]] = aie.buffer(%[[TILE_1_2]]) {sym_name = "buff_4"} : memref<4096xi32, 2 : i32>
+// CHECK:     %[[BUFFER_1_2_4:.*]] = aie.buffer(%[[TILE_1_2]]) {sym_name = "buff_5"} : memref<4096xi32, 2 : i32>
+// CHECK:     %[[LOCK_1_2:.*]] = aie.lock(%[[TILE_1_2]], 0) {init = 2 : i8, sym_name = "lock_4"}
+// CHECK:     %[[LOCK_1_2_5:.*]] = aie.lock(%[[TILE_1_2]], 1) {init = 0 : i8, sym_name = "lock_5"}
+// CHECK:     aie.flow(%[[TILE_0_0]], DMA : 0, %[[TILE_0_1]], DMA : 0)
+// CHECK:     aie.shim_dma_allocation @[[SHIM_0]](MM2S, 0, 0)
+// CHECK:     aie.flow(%[[TILE_0_1]], DMA : 1, %[[TILE_0_2]], DMA : 0)
+// CHECK:     aie.flow(%[[TILE_0_1]], DMA : 1, %[[TILE_1_2]], DMA : 0)
+// CHECK:     %[[MEMTILE_DMA_0_1:.*]] = aie.memtile_dma(%[[TILE_0_1]]) {
+// CHECK:       %[[VAL_0:.*]] = aie.dma_start(S2MM, 0, ^bb1, ^bb3)
+// CHECK:     ^bb1:
+// CHECK:       aie.use_lock(%[[LOCK_0_1]], AcquireGreaterEqual, 1)
+// CHECK:       aie.dma_bd(%[[BUFFER_0_1]] : memref<4096xi32, 1 : i32>) {dimensions = #aie<bd_dim_layout_array[<size = 64, stride = 32>, <size = 64, stride = 1>]>, len = 4096 : i32}
+// CHECK:       aie.use_lock(%[[LOCK_0_1_1]], Release, 1)
+// CHECK:       aie.next_bd ^bb2
+// CHECK:     ^bb2:
+// CHECK:       aie.use_lock(%[[LOCK_0_1]], AcquireGreaterEqual, 1)
+// CHECK:       aie.dma_bd(%[[BUFFER_0_1_0]] : memref<4096xi32, 1 : i32>) {dimensions = #aie<bd_dim_layout_array[<size = 64, stride = 32>, <size = 64, stride = 1>]>, len = 4096 : i32}
+// CHECK:       aie.use_lock(%[[LOCK_0_1_1]], Release, 1)
+// CHECK:       aie.next_bd ^bb1
+// CHECK:     ^bb3:
+// CHECK:       %[[VAL_1:.*]] = aie.dma_start(MM2S, 1, ^bb4, ^bb6)
+// CHECK:     ^bb4:
+// CHECK:       aie.use_lock(%[[LOCK_0_1_1]], AcquireGreaterEqual, 1)
+// CHECK:       aie.dma_bd(%[[BUFFER_0_1]] : memref<4096xi32, 1 : i32>) {dimensions = #aie<bd_dim_layout_array[<size = 64, stride = 32>, <size = 64, stride = 1>]>, len = 4096 : i32, offset = 1024 : i32}
+// CHECK:       aie.use_lock(%[[LOCK_0_1]], Release, 1)
+// CHECK:       aie.next_bd ^bb5
+// CHECK:     ^bb5:
+// CHECK:       aie.use_lock(%[[LOCK_0_1_1]], AcquireGreaterEqual, 1)
+// CHECK:       aie.dma_bd(%[[BUFFER_0_1_0]] : memref<4096xi32, 1 : i32>) {dimensions = #aie<bd_dim_layout_array[<size = 64, stride = 32>, <size = 64, stride = 1>]>, len = 4096 : i32, offset = 1024 : i32}
+// CHECK:       aie.use_lock(%[[LOCK_0_1]], Release, 1)
+// CHECK:       aie.next_bd ^bb4
+// CHECK:     ^bb6:
+// CHECK:       aie.end
+// CHECK:     }
+// CHECK:     %[[MEM_0_2:.*]] = aie.mem(%[[TILE_0_2]]) {
+// CHECK:       %[[VAL_2:.*]] = aie.dma_start(S2MM, 0, ^bb1, ^bb3)
+// CHECK:     ^bb1:
+// CHECK:       aie.use_lock(%[[LOCK_0_2]], AcquireGreaterEqual, 1)
+// CHECK:       aie.dma_bd(%[[BUFFER_0_2]] : memref<4096xi32, 2 : i32>) {len = 0 : i32}
+// CHECK:       aie.use_lock(%[[LOCK_0_2_3]], Release, 1)
+// CHECK:       aie.next_bd ^bb2
+// CHECK:     ^bb2:
+// CHECK:       aie.use_lock(%[[LOCK_0_2]], AcquireGreaterEqual, 1)
+// CHECK:       aie.dma_bd(%[[BUFFER_0_2_2]] : memref<4096xi32, 2 : i32>) {len = 0 : i32}
+// CHECK:       aie.use_lock(%[[LOCK_0_2_3]], Release, 1)
+// CHECK:       aie.next_bd ^bb1
+// CHECK:     ^bb3:
+// CHECK:       aie.end
+// CHECK:     }
+// CHECK:     %[[MEM_1_2:.*]] = aie.mem(%[[TILE_1_2]]) {
+// CHECK:       %[[VAL_3:.*]] = aie.dma_start(S2MM, 0, ^bb1, ^bb3)
+// CHECK:     ^bb1:
+// CHECK:       aie.use_lock(%[[LOCK_1_2]], AcquireGreaterEqual, 1)
+// CHECK:       aie.dma_bd(%[[BUFFER_1_2]] : memref<4096xi32, 2 : i32>) {len = 0 : i32}
+// CHECK:       aie.use_lock(%[[LOCK_1_2_5]], Release, 1)
+// CHECK:       aie.next_bd ^bb2
+// CHECK:     ^bb2:
+// CHECK:       aie.use_lock(%[[LOCK_1_2]], AcquireGreaterEqual, 1)
+// CHECK:       aie.dma_bd(%[[BUFFER_1_2_4]] : memref<4096xi32, 2 : i32>) {len = 0 : i32}
+// CHECK:       aie.use_lock(%[[LOCK_1_2_5]], Release, 1)
+// CHECK:       aie.next_bd ^bb1
+// CHECK:     ^bb3:
+// CHECK:       aie.end
+// CHECK:     }
+// CHECK:     %[[CORE_0_2:.*]] = aie.core(%[[TILE_0_2]]) {
+// CHECK:       aie.use_lock(%[[LOCK_0_2_3]], AcquireGreaterEqual, 1)
+// CHECK:       %[[REINTERPRET_CAST:.*]] = memref.reinterpret_cast %[[BUFFER_0_2]] to offset: [0], sizes: [64, 64], strides: [64, 1] : memref<4096xi32, 2 : i32> to memref<64x64xi32, 2 : i32>
+// CHECK:       linalg.fill ins(%[[C0_I32]] : i32) outs(%[[REINTERPRET_CAST]] : memref<64x64xi32, 2 : i32>)
+// CHECK:       aie.use_lock(%[[LOCK_0_2]], AcquireGreaterEqual, 1)
+// CHECK:       aie.use_lock(%[[LOCK_0_2_3]], AcquireGreaterEqual, 1)
+// CHECK:       %[[REINTERPRET_CAST_6:.*]] = memref.reinterpret_cast %[[BUFFER_0_2_2]] to offset: [0], sizes: [64, 64], strides: [64, 1] : memref<4096xi32, 2 : i32> to memref<64x64xi32, 2 : i32>
+// CHECK:       linalg.fill ins(%[[C0_I32]] : i32) outs(%[[REINTERPRET_CAST_6]] : memref<64x64xi32, 2 : i32>)
+// CHECK:       aie.use_lock(%[[LOCK_0_2]], AcquireGreaterEqual, 1)
+// CHECK:       aie.end
+// CHECK:     }
+// CHECK:     %[[CORE_1_2:.*]] = aie.core(%[[TILE_1_2]]) {
+// CHECK:       aie.use_lock(%[[LOCK_1_2_5]], AcquireGreaterEqual, 1)
+// CHECK:       %[[REINTERPRET_CAST:.*]] = memref.reinterpret_cast %[[BUFFER_1_2]] to offset: [0], sizes: [64, 64], strides: [64, 1] : memref<4096xi32, 2 : i32> to memref<64x64xi32, 2 : i32>
+// CHECK:       linalg.fill ins(%[[C0_I32]] : i32) outs(%[[REINTERPRET_CAST]] : memref<64x64xi32, 2 : i32>)
+// CHECK:       aie.use_lock(%[[LOCK_1_2]], AcquireGreaterEqual, 1)
+// CHECK:       aie.use_lock(%[[LOCK_1_2_5]], AcquireGreaterEqual, 1)
+// CHECK:       %[[REINTERPRET_CAST_6:.*]] = memref.reinterpret_cast %[[BUFFER_1_2_4]] to offset: [0], sizes: [64, 64], strides: [64, 1] : memref<4096xi32, 2 : i32> to memref<64x64xi32, 2 : i32>
+// CHECK:       linalg.fill ins(%[[C0_I32]] : i32) outs(%[[REINTERPRET_CAST_6]] : memref<64x64xi32, 2 : i32>)
+// CHECK:       aie.use_lock(%[[LOCK_1_2]], AcquireGreaterEqual, 1)
+// CHECK:       aie.end
+// CHECK:     }
+// CHECK:     aiex.runtime_sequence @large_example(%[[ARG0:.*]]: memref<4096xi32>) {
+// CHECK:       aiex.npu.dma_memcpy_nd(0, 0, %[[ARG0]][0, 0, 0, 32][1, 1, 32, 32][0, 0, 64, 1]) {id = 0 : i64, issue_token = true, metadata = @[[SHIM_0]]} : memref<4096xi32>
+// CHECK:       aiex.npu.dma_wait {symbol = @[[SHIM_0]]}
+// CHECK:     }
+// CHECK:   }
+#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer>]>
 #executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}>
-#pipeline_layout = #hal.pipeline.layout<bindings = [<storage_buffer>]>
 module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} {
   func.func @large_example() {
-    %c8 = arith.constant 8 : index
-    %c2 = arith.constant 2 : index
-    %c1 = arith.constant 1 : index
-    %c0_i32 = arith.constant 0 : i32
-    %c0 = arith.constant 0 : index
     amdaie.workgroup {
+      %c0_i32 = arith.constant 0 : i32
+      %c0 = arith.constant 0 : index
+      %c1 = arith.constant 1 : index
+      %c2 = arith.constant 2 : index
+      %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) : memref<4096xi32>  // consumed by from_memref in the control code below
       %tile = amdaie.tile(%c0, %c0)
       %tile_0 = amdaie.tile(%c0, %c1)
       %tile_1 = amdaie.tile(%c0, %c2)
       %tile_2 = amdaie.tile(%c1, %c2)
       %bd_id = amdaie.bd_id(%tile, 0)
-      %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) : memref<32x64xi32>
-      memref.assume_alignment %0, 64 : memref<32x64xi32>
-      %alloc = memref.alloc() : memref<32x32xi32, 1>
-      %alloc_3 = memref.alloc() : memref<4x8x4x8xi32, 2>
-      %1 = amdaie.logicalobjectfifo.placeholder{%tile} : !amdaie.logicalobjectfifo<memref<2048xi32>>
-      %2 = amdaie.logicalobjectfifo.from_memref %alloc, {%tile_0} : memref<32x32xi32, 1> -> !amdaie.logicalobjectfifo<memref<1024xi32, 1>>
-      %3 = amdaie.logicalobjectfifo.from_memref %alloc_3, {%tile_1, %tile_2} : memref<4x8x4x8xi32, 2> -> !amdaie.logicalobjectfifo<memref<1024xi32, 2>>
-      %4 = amdaie.connection(%2, %1) : (!amdaie.logicalobjectfifo<memref<1024xi32, 1>>, !amdaie.logicalobjectfifo<memref<2048xi32>>)
-      %5 = amdaie.connection(%3, %2) : (!amdaie.logicalobjectfifo<memref<1024xi32, 2>>, !amdaie.logicalobjectfifo<memref<1024xi32, 1>>)
-      amdaie.logicalobjectfifo.link[%4] -> [%5] ()
+      %buffer = amdaie.buffer(%tile_0) : memref<4096xi32, 1 : i32>  // tile (0, 1): expected output "buff_0"
+      %buffer_3 = amdaie.buffer(%tile_0) : memref<4096xi32, 1 : i32>  // tile (0, 1): expected output "buff_1"
+      %lock = amdaie.lock(%tile_0(0), 2)  // lock id 0, init 2
+      %lock_4 = amdaie.lock(%tile_0(1), 0)  // lock id 1, init 0
+      %buffer_5 = amdaie.buffer(%tile_1) : memref<4096xi32, 2 : i32>  // tile (0, 2): expected output "buff_2"
+      %buffer_6 = amdaie.buffer(%tile_1) : memref<4096xi32, 2 : i32>  // tile (0, 2): expected output "buff_3"
+      %lock_7 = amdaie.lock(%tile_1(0), 2)  // lock id 0, init 2
+      %lock_8 = amdaie.lock(%tile_1(1), 0)  // lock id 1, init 0
+      %buffer_9 = amdaie.buffer(%tile_2) : memref<4096xi32, 2 : i32>  // tile (1, 2): expected output "buff_4"
+      %buffer_10 = amdaie.buffer(%tile_2) : memref<4096xi32, 2 : i32>  // tile (1, 2): expected output "buff_5"
+      %lock_11 = amdaie.lock(%tile_2(0), 2)  // lock id 0, init 2
+      %lock_12 = amdaie.lock(%tile_2(1), 0)  // lock id 1, init 0
+      %1 = amdaie.logicalobjectfifo.placeholder{%tile} : !amdaie.logicalobjectfifo<memref<4096xi32>>
+      %2 = amdaie.logicalobjectfifo.from_buffers({%buffer, %buffer_3}, {%lock}, {%lock_4}) : memref<4096xi32, 1 : i32>, memref<4096xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<4096xi32, 1 : i32>, 2>
+      %3 = amdaie.logicalobjectfifo.from_buffers({%buffer_5, %buffer_6, %buffer_9, %buffer_10}, {%lock_7, %lock_11}, {%lock_8, %lock_12}) : memref<4096xi32, 2 : i32>, memref<4096xi32, 2 : i32>, memref<4096xi32, 2 : i32>, memref<4096xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<4096xi32, 2 : i32>, 2>
+      %channel = amdaie.channel(%tile, 0)
+      %channel_13 = amdaie.channel(%tile_0, 0)
+      %4 = amdaie.connection(%2 {%channel_13}, %1 {%channel}) : (!amdaie.logicalobjectfifo<memref<4096xi32, 1 : i32>, 2>, !amdaie.logicalobjectfifo<memref<4096xi32>>)
+      %channel_14 = amdaie.channel(%tile_0, 1)
+      %channel_15 = amdaie.channel(%tile_1, 0)
+      %channel_16 = amdaie.channel(%tile_2, 0)
+      %5 = amdaie.connection(%3 {%channel_15, %channel_16}, %2 {%channel_14}) : (!amdaie.logicalobjectfifo<memref<4096xi32, 2 : i32>, 2>, !amdaie.logicalobjectfifo<memref<4096xi32, 1 : i32>, 2>)
       %6 = amdaie.core(%tile_1, in : [%5], out : []) {
-        %8 = amdaie.logicalobjectfifo.acquire(%5, Consume) {size = 1 : i32} -> !amdaie.logicalobjectfifo<memref<1024xi32, 2>>
-        %9 = amdaie.logicalobjectfifo.access(%8, Read) : !amdaie.logicalobjectfifo<memref<1024xi32, 2>> -> memref<1024xi32, 2>
-        %reinterpret_cast = memref.reinterpret_cast %9 to offset: [0], sizes: [4, 8, 4, 8], strides: [256, 32, 8, 1] : memref<1024xi32, 2> to memref<4x8x4x8xi32, 2>
-        scf.for %arg0 = %c0 to %c8 step %c1 {
-          linalg.fill ins(%c0_i32 : i32) outs(%reinterpret_cast : memref<4x8x4x8xi32, 2>)
-        }
-        amdaie.logicalobjectfifo.release(%5, Consume) {size = 1 : i32}
+        amdaie.use_lock(%lock_8, AcquireGreaterOrEqual(1))
+        %reinterpret_cast = memref.reinterpret_cast %buffer_5 to offset: [0], sizes: [64, 64], strides: [64, 1] : memref<4096xi32, 2 : i32> to memref<64x64xi32, 2 : i32>
+        linalg.fill ins(%c0_i32 : i32) outs(%reinterpret_cast : memref<64x64xi32, 2 : i32>)
+        amdaie.use_lock(%lock_7, AcquireGreaterOrEqual(1))  // NOTE(review): AcquireGreaterOrEqual after the fill, where the core_acquire_release test case uses Release; the expected output mirrors it, confirm this is intended
+        amdaie.use_lock(%lock_8, AcquireGreaterOrEqual(1))
+        %reinterpret_cast_17 = memref.reinterpret_cast %buffer_6 to offset: [0], sizes: [64, 64], strides: [64, 1] : memref<4096xi32, 2 : i32> to memref<64x64xi32, 2 : i32>
+        linalg.fill ins(%c0_i32 : i32) outs(%reinterpret_cast_17 : memref<64x64xi32, 2 : i32>)
+        amdaie.use_lock(%lock_7, AcquireGreaterOrEqual(1))
         amdaie.end
       }
       %7 = amdaie.core(%tile_2, in : [%5], out : []) {
-        %8 = amdaie.logicalobjectfifo.acquire(%5, Consume) {size = 1 : i32} -> !amdaie.logicalobjectfifo<memref<1024xi32, 2>>
-        %9 = amdaie.logicalobjectfifo.access(%8, Read) : !amdaie.logicalobjectfifo<memref<1024xi32, 2>> -> memref<1024xi32, 2>
-        %reinterpret_cast = memref.reinterpret_cast %9 to offset: [0], sizes: [4, 8, 4, 8], strides: [256, 32, 8, 1] : memref<1024xi32, 2> to memref<4x8x4x8xi32, 2>
-        scf.for %arg0 = %c0 to %c8 step %c1 {
-          linalg.fill ins(%c0_i32 : i32) outs(%reinterpret_cast : memref<4x8x4x8xi32, 2>)
-        }
-        amdaie.logicalobjectfifo.release(%5, Consume) {size = 1 : i32}
+        amdaie.use_lock(%lock_12, AcquireGreaterOrEqual(1))
+        %reinterpret_cast = memref.reinterpret_cast %buffer_9 to offset: [0], sizes: [64, 64], strides: [64, 1] : memref<4096xi32, 2 : i32> to memref<64x64xi32, 2 : i32>
+        linalg.fill ins(%c0_i32 : i32) outs(%reinterpret_cast : memref<64x64xi32, 2 : i32>)
+        amdaie.use_lock(%lock_11, AcquireGreaterOrEqual(1))  // same lock sequence as the core on tile (0, 2), using this tile's locks
+        amdaie.use_lock(%lock_12, AcquireGreaterOrEqual(1))
+        %reinterpret_cast_17 = memref.reinterpret_cast %buffer_10 to offset: [0], sizes: [64, 64], strides: [64, 1] : memref<4096xi32, 2 : i32> to memref<64x64xi32, 2 : i32>
+        linalg.fill ins(%c0_i32 : i32) outs(%reinterpret_cast_17 : memref<64x64xi32, 2 : i32>)
+        amdaie.use_lock(%lock_11, AcquireGreaterOrEqual(1))
         amdaie.end
       }
-      memref.dealloc %alloc_3 : memref<4x8x4x8xi32, 2>
-      memref.dealloc %alloc : memref<32x32xi32, 1>
       amdaie.controlcode {
-        %8 = amdaie.npu.circular_dma_cpy_nd %4([] [] [], [] [] [])
-        %9 = amdaie.npu.circular_dma_cpy_nd %5([] [] [], [] [] [])
-        %10 = amdaie.logicalobjectfifo.from_memref %0, {%tile} : memref<32x64xi32> -> !amdaie.logicalobjectfifo<memref<2048xi32>>
-        %11 = amdaie.npu.dma_cpy_nd %4([] [] [], %10[0, 0, 0, 32] [1, 1, 32, 32] [0, 0, 64, 1] bd_id = %bd_id) : source_type = !amdaie.logicalobjectfifo<memref<2048xi32>>
+        %8 = amdaie.npu.circular_dma_cpy_nd %4([0, 0] [64, 64] [32, 1], [] [] [])  // expected output: S2MM dma_bd ops on tile (0, 1) with dimensions <size = 64, stride = 32>, <size = 64, stride = 1>
+        %9 = amdaie.npu.circular_dma_cpy_nd %5([] [] [], [0, 1024] [64, 64] [32, 1])  // presumably the source of offset = 1024 on the expected MM2S dma_bd ops, confirm
+        %10 = amdaie.logicalobjectfifo.from_memref %0, {%tile} : memref<4096xi32> -> !amdaie.logicalobjectfifo<memref<4096xi32>>
+        %11 = amdaie.npu.dma_cpy_nd %4([] [] [], %10[0, 0, 0, 32] [1, 1, 32, 32] [0, 0, 64, 1] bd_id = %bd_id) : source_type = !amdaie.logicalobjectfifo<memref<4096xi32>>
         amdaie.npu.dma_wait(%11, MM2S)
         amdaie.end
       }
@@ -748,4 +1142,3 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb}
     return
   }
 }
-
diff --git a/runtime/src/iree-amd-aie/aie_runtime/Utils/CMakeLists.txt b/runtime/src/iree-amd-aie/aie_runtime/Utils/CMakeLists.txt
index 4a66d5863..4a909e30a 100644
--- a/runtime/src/iree-amd-aie/aie_runtime/Utils/CMakeLists.txt
+++ b/runtime/src/iree-amd-aie/aie_runtime/Utils/CMakeLists.txt
@@ -24,5 +24,6 @@ iree_cc_library(
     MLIRIR
     MLIRParser
     MLIRSupport
+    iree-amd-aie::aie_runtime::iree_aie_runtime_static
   PUBLIC
 )