From 7d4f4180dbaa916ce1844203b86f6c04638ad25b Mon Sep 17 00:00:00 2001
From: Jorn Tuyls <jorn.tuyls@gmail.com>
Date: Wed, 7 Aug 2024 17:08:17 -0700
Subject: [PATCH] Replace logical objectfifo produce/consume with core in/out
 DMA operands

---
 .../AMD-AIE/iree-amd-aie/IR/AMDAIEOps.cpp     |  15 +-
 .../AMD-AIE/iree-amd-aie/IR/AMDAIEOps.td      |  96 +-------
 .../AMD-AIE/iree-amd-aie/IR/test/invalid.mlir |   2 +-
 .../iree-amd-aie/IR/test/roundtrip.mlir       |  30 +--
 .../Transforms/AMDAIECreateAIEWorkgroup.cpp   |  37 ++-
 .../Transforms/AMDAIECreateAIEWorkgroup.h     |   7 +-
 .../AMDAIEDistributeCoresAndObjectFifos.cpp   |  35 +--
 .../Transforms/AMDAIEInsertCores.cpp          |  54 +++--
 .../Transforms/AMDAIELowerToAIE.cpp           |  10 -
 .../test/access_to_acquire_release.mlir       |  28 +--
 .../test/canonicalize_doubly_strided_op.mlir  |  36 +--
 .../Transforms/test/create_aie_workgroup.mlir |  72 +++---
 .../distribute_cores_and_objectfifos.mlir     | 211 +++++++-----------
 .../test/flatten_logical_objectfifo.mlir      |   2 +-
 .../Transforms/test/insert_cores.mlir         |  17 +-
 .../Transforms/test/lower_to_aie.mlir         |  14 +-
 .../test/none_access_to_temporary_buffer.mlir |  12 +-
 17 files changed, 260 insertions(+), 418 deletions(-)

diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.cpp
index 1c6e4866e..309e610fb 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.cpp
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.cpp
@@ -52,16 +52,27 @@ LogicalResult ControlCodeOp::verify() {
 // AMDAIE_CoreOp
 //===----------------------------------------------------------------------===//
 
+/// Build a core on `tileOp` with the given DMA operands and no `link_with`.
+void CoreOp::build(OpBuilder &b, OperationState &result, AMDAIE::TileOp tileOp,
+                   ValueRange inputDmas, ValueRange outputDmas) {
+  build(b, result, b.getIndexType(), tileOp, inputDmas, outputDmas, nullptr);
+}
+
 /// Hardcoded row_offset == 2 -> AIE core rows start from 2
 /// TODO(jornt): avoid hardcoding here. Add a device model/identifier to loop up
 /// core offset. This will be handled in a follow-up.
 void CoreOp::build(OpBuilder &b, OperationState &result, Value coreCol,
-                   Value coreRow) {
+                   Value coreRow, ValueRange inputDmas, ValueRange outputDmas) {
   auto rowOffset = b.create<arith::ConstantIndexOp>(b.getUnknownLoc(), 2);
   auto row =
       b.createOrFold<arith::AddIOp>(b.getUnknownLoc(), rowOffset, coreRow);
   auto tileOp = b.create<AMDAIE::TileOp>(b.getUnknownLoc(), coreCol, row);
-  build(b, result, b.getIndexType(), tileOp, nullptr);
+  build(b, result, tileOp, inputDmas, outputDmas, nullptr);
+}
+
+void CoreOp::build(OpBuilder &b, OperationState &result, Value coreCol,
+                   Value coreRow) {
+  build(b, result, coreCol, coreRow, {}, {});
 }
 
 LogicalResult CoreOp::verify() {
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.td b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.td
index 2d5a729b1..5c897859a 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.td
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.td
@@ -47,7 +47,7 @@ def AMDAIE_ControlCodeOp : AMDAIE_Op<"controlcode", [HasParent<"WorkgroupOp">,
   let hasVerifier = 1;
 }
 
-def AMDAIE_CoreOp: AMDAIE_Op<"core", [SingleBlock]>, Results<(outs Index)> {
+def AMDAIE_CoreOp: AMDAIE_Op<"core", [SingleBlock, AttrSizedOperandSegments]>, Results<(outs Index)> {
   let summary = "The AIE core operator";
   let description = [{
     This operation represents an AIE core op, containing a sequence of operations
@@ -62,15 +62,20 @@ def AMDAIE_CoreOp: AMDAIE_Op<"core", [SingleBlock]>, Results<(outs Index)> {
 
   let arguments = (
     ins Index:$tile,
-    OptionalAttr<StrAttr>:$link_with
+        Variadic<Index>:$input_dmas,
+        Variadic<Index>:$output_dmas,
+        OptionalAttr<StrAttr>:$link_with
   );
 
   let regions = (region SizedRegion<1>:$region);
   
-  let assemblyFormat = [{ `(` $tile `)` regions attr-dict }];
+  let assemblyFormat = [{ `(` $tile `,` `in` `:` `[` $input_dmas `]` `,` `out` `:` `[` $output_dmas `]` `)` regions attr-dict }];
   
   let builders = [
     OpBuilder<(ins "mlir::Value":$coreCol, "mlir::Value":$coreRow)>,
+    OpBuilder<(ins "mlir::Value":$coreCol, "mlir::Value":$coreRow,
+      "ValueRange":$input_dmas, "ValueRange":$output_dmas)>,
+    OpBuilder<(ins "TileOp":$tile, "ValueRange":$input_dmas, "ValueRange":$output_dmas)>
   ];
 
   let extraClassDeclaration = [{
@@ -438,7 +443,7 @@ def AMDAIE_LogicalObjectFifoAccessOp : AMDAIE_Op<"logicalobjectfifo.access"> {
       %alloc = memref.alloc() : memref<8x16xi32, 2>
       %0 = amdaie.logicalobjectfifo.from_memref %alloc, {%tile} : memref<8x16xi32, 2>
         -> !amdaie.logicalobjectfifo<memref<8x16xi32, 2>>
-      %core = amdaie.core(%tile) {
+      %core = amdaie.core(%tile, in : [], out : []) {
         %1 = amdaie.logicalobjectfifo.access(%0, Read) : 
           !amdaie.logicalobjectfifo<memref<8x16xi32, 2>> ->  memref<8x16xi32, 2>
     ```
@@ -511,47 +516,6 @@ def AMDAIE_LogicalObjectFifoAcquire:
   ];
 }
 
-def AMDAIE_LogicalObjectFifoConsume: AMDAIE_Op<"logicalobjectfifo.consume", []> {
-  let summary = "Consume a DMA logical objectFifo result.";
-  let description = [{
-    Consumes the result of a DMA operation. This is a blocking operation,
-    waiting for the DMA to produce data. Typically, this operation will reside
-    inside a `CoreOp` to synchronize with external DMA operations producing data
-    into the respective core's local memory.
-
-    Example:
-    ```mlir
-    %2 = amdaie.dma_cpy_nd(
-      %1[%c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c16] [%c128, %c128, %c16, %c1],
-      %0[%c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c16] [%c128, %c16, %c16, %c1])
-      : (!amdaie.logicalobjectfifo<memref<1x1x8x16xi32, 1>>,
-      !amdaie.logicalobjectfifo<memref<8x16xi32, 1>>)
-    %3 = amdaie.logicalobjectfifo.consume(%2)
-    ```
-  }];
-
-  let arguments = (
-    ins Index:$dma
-  );
-
-  let assemblyFormat = [{
-    `(` $dma `)`  attr-dict
-  }];
-
-  let extraClassDeclaration = [{
-    DmaCpyNdOp getDmaCpyNdOp() {
-      return dyn_cast<DmaCpyNdOp>(getDma().getDefiningOp());
-    }
-    Value getLogicalObjectfifo() {
-      return dyn_cast<DmaCpyNdOp>(getDma().getDefiningOp()).getTarget();
-    }
-    // Return the port of this operation.
-    LogicalObjectFifoPort getPort() {
-      return LogicalObjectFifoPort::Consume;
-    }
-  }];
-}
-
 def AMDAIE_LogicalObjectFifoFromMemrefOp
     : AMDAIE_Op<"logicalobjectfifo.from_memref", [Pure]> {
   let summary = "Create a logical objectFifo from a memref";
@@ -654,48 +618,6 @@ def AMDAIE_LogicalObjectFifoLink
   }];
 }
 
-def AMDAIE_LogicalObjectFifoProduce: AMDAIE_Op<"logicalobjectfifo.produce", []> {
-  let summary = "Produce a DMA logicalobjectfifo input.";
-  let description = [{
-    Produces the input of a DMA operation. This is a release-type operation,
-    where the DMA will be waiting for the data to be produced. Typically, this
-    operation will reside inside a `CoreOp` to synchronize with external DMA
-    operations waiting for data from the respective core's local memory to be
-    released.
-
-    Example:
-    ```mlir
-    %2 = amdaie.dma_cpy_nd(
-      %1[%c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c16] [%c128, %c128, %c16, %c1],
-      %0[%c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c16] [%c128, %c16, %c16, %c1])
-      : (!amdaie.logicalobjectfifo<memref<1x1x8x16xi32, 1>>,
-      !amdaie.logicalobjectfifo<memref<8x16xi32, 1>>)
-    %3 = amdaie.logicalobjectfifo.produce(%2)
-    ```
-  }];
-
-  let arguments = (
-    ins Index:$dma
-  );
-  
-  let assemblyFormat = [{
-    `(` $dma `)`  attr-dict
-  }];
-  
-  let extraClassDeclaration = [{
-    DmaCpyNdOp getDmaCpyNdOp() {
-      return dyn_cast<DmaCpyNdOp>(getDma().getDefiningOp());
-    }
-    Value getLogicalObjectfifo() {
-      return dyn_cast<DmaCpyNdOp>(getDma().getDefiningOp()).getSource();
-    }
-    // Return the port of this operation.
-    LogicalObjectFifoPort getPort() {
-      return LogicalObjectFifoPort::Produce;
-    }
-  }];
-}
-
 def AMDAIE_LogicalObjectFifoRelease: 
     AMDAIE_Op<"logicalobjectfifo.release", []> {
   let summary = "Semaphore operation to release objects from a logical"
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/test/invalid.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/test/invalid.mlir
index 195f90adc..589f103bc 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/test/invalid.mlir
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/test/invalid.mlir
@@ -6,7 +6,7 @@ func.func @core_invalid_terminator() {
   %tile = amdaie.tile(%c0, %c0)
   // expected-note @+2 {{in custom textual format, the absence of terminator implies 'amdaie.end'}}
   // expected-error @+1 {{'amdaie.core' op expects regions to end with 'amdaie.end', found 'arith.constant'}}
-  %core = amdaie.core(%tile) {
+  %core = amdaie.core(%tile, in : [], out : []) {
     %c1 = arith.constant 0 : index
   }
   return
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/test/roundtrip.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/test/roundtrip.mlir
index 82b20c05c..d5d85ede0 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/test/roundtrip.mlir
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/test/roundtrip.mlir
@@ -16,12 +16,12 @@ func.func @bd_id() {
 // CHECK-LABEL: func.func @core
 // CHECK: %[[C0:.*]] = arith.constant 0 : index
 // CHECK: %[[TILE_0:.*]] = amdaie.tile(%[[C0]], %[[C0]])
-// CHECK: %[[CORE_0:.*]] = amdaie.core(%[[TILE_0]])
+// CHECK: %[[CORE_0:.*]] = amdaie.core(%[[TILE_0]], in : [], out : [])
 // CHECK: amdaie.end
 func.func @core() {
   %c0 = arith.constant 0 : index
   %tile = amdaie.tile(%c0, %c0)
-  %core = amdaie.core(%tile) {
+  %core = amdaie.core(%tile, in : [], out : []) {
     amdaie.end
   }
   return
@@ -150,17 +150,6 @@ func.func @logicalobjectfifo_acquire(%arg0: !amdaie.logicalobjectfifo<memref<1x1
 
 // -----
 
-// CHECK-LABEL: func.func @logicalobjectfifo_consume
-// CHECK: amdaie.dma_cpy_nd
-// CHECK: amdaie.logicalobjectfifo.consume
-func.func @logicalobjectfifo_consume(%arg0: !amdaie.logicalobjectfifo<memref<1x1x8x16xi32, 1>>, %arg1: !amdaie.logicalobjectfifo<memref<8x16xi32, 1>>) {
-  %0 = amdaie.dma_cpy_nd(%arg0[0, 0, 0, 0] [1, 1, 8, 16] [128, 128, 16, 1], %arg1[0, 0, 0, 0] [1, 1, 8, 16] [128, 16, 16, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x16xi32, 1>>, !amdaie.logicalobjectfifo<memref<8x16xi32, 1>>)
-  amdaie.logicalobjectfifo.consume(%0)
-  return
-}
-
-// -----
-
 // CHECK-LABEL: func.func @logicalobjectfifo_link
 // CHECK:       %[[DMA0:.+]] = amdaie.circular_dma_cpy_nd
 // CHECK:       %[[DMA1:.+]] = amdaie.circular_dma_cpy_nd
@@ -176,17 +165,6 @@ func.func @logicalobjectfifo_link(%arg0: !amdaie.logicalobjectfifo<memref<32x102
 
 // -----
 
-// CHECK-LABEL: func.func @logicalobjectfifo_produce
-// CHECK: amdaie.dma_cpy_nd
-// CHECK: amdaie.logicalobjectfifo.produce
-func.func @logicalobjectfifo_produce(%arg0: !amdaie.logicalobjectfifo<memref<1x1x8x16xi32, 1>>, %arg1: !amdaie.logicalobjectfifo<memref<8x16xi32, 1>>) {
-  %0 = amdaie.dma_cpy_nd(%arg0[0, 0, 0, 0] [1, 1, 8, 16] [128, 128, 16, 1], %arg1[0, 0, 0, 0] [1, 1, 8, 16] [128, 16, 16, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x16xi32, 1>>, !amdaie.logicalobjectfifo<memref<8x16xi32, 1>>)
-  amdaie.logicalobjectfifo.produce(%0)
-  return
-}
-
-// -----
-
 // CHECK-LABEL: func.func @logicalobjectfifo_release
 // CHECK:       %[[DMA:.+]] = amdaie.dma_cpy_nd
 // CHECK:       amdaie.logicalobjectfifo.release
@@ -300,11 +278,11 @@ func.func @workgroup() {
   %c1 = arith.constant 1 : index
   amdaie.workgroup {
     %tile_0_0 = amdaie.tile(%c0, %c0)
-    %core_0 = amdaie.core(%tile_0_0) {
+    %core_0 = amdaie.core(%tile_0_0, in : [], out : []) {
       amdaie.end
     }
     %tile_0_1 = amdaie.tile(%c0, %c1)
-    %core_1 = amdaie.core(%tile_0_1) {
+    %core_1 = amdaie.core(%tile_0_1, in : [], out : []) {
       amdaie.end
     }
     amdaie.controlcode {
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIECreateAIEWorkgroup.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIECreateAIEWorkgroup.cpp
index 0b0649eae..cc1888370 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIECreateAIEWorkgroup.cpp
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIECreateAIEWorkgroup.cpp
@@ -18,16 +18,35 @@
 
 namespace mlir::iree_compiler::AMDAIE {
 
-/// Merge the 'source' core operations in the end of the 'dest' core operation.
-void CoreContext::mergeCoreOps(AMDAIE::CoreOp source, AMDAIE::CoreOp dest) {
+/// Merge 'source' and 'dest' into a new `amdaie.core` on the tile of 'dest':
+/// deduplicated union of their DMA operands, 'dest' body before 'source' body.
+AMDAIE::CoreOp CoreContext::mergeCoreOps(AMDAIE::CoreOp source,
+                                         AMDAIE::CoreOp dest) {
   OpBuilder::InsertionGuard guard(rewriter);
-  Block::iterator insertIt = dest.getBody()->getTerminator()->getIterator();
-  Block::iterator sourceBegin = source.getBody()->begin();
-  Block::iterator sourceEnd = source.getBody()->getTerminator()->getIterator();
-  dest.getBody()->getOperations().splice(
-      insertIt, source.getBody()->getOperations(), sourceBegin, sourceEnd);
-  rewriter.moveOpBefore(dest, source);
-  rewriter.replaceOp(source, dest);
+  AMDAIE::TileOp tile = dest.getTileOp();
+  SmallVector<Value> sourceInputDmas = source.getInputDmas();
+  SmallVector<Value> destInputDmas = dest.getInputDmas();
+  llvm::SmallSetVector<Value, 4> inputDmas(destInputDmas.begin(),
+                                           destInputDmas.end());
+  inputDmas.insert(sourceInputDmas.begin(), sourceInputDmas.end());
+  SmallVector<Value> sourceOutputDmas = source.getOutputDmas();
+  SmallVector<Value> destOutputDmas = dest.getOutputDmas();
+  llvm::SmallSetVector<Value, 4> outputDmas(destOutputDmas.begin(),
+                                            destOutputDmas.end());
+  outputDmas.insert(sourceOutputDmas.begin(), sourceOutputDmas.end());
+  rewriter.setInsertionPoint(source);
+  auto newCoreOp = rewriter.create<AMDAIE::CoreOp>(rewriter.getUnknownLoc(),
+                                                   tile, inputDmas.takeVector(),
+                                                   outputDmas.takeVector());
+  Block *newBlock = rewriter.createBlock(&newCoreOp.getRegion());
+  // Drop the 'dest' terminator so the terminator of 'source' ends the new body.
+  rewriter.eraseOp(dest.getBody()->getTerminator());
+  rewriter.mergeBlocks(dest.getBody(), newBlock);
+  rewriter.mergeBlocks(source.getBody(), newBlock);
+  // Redirect any uses of the old core results to the merged core.
+  rewriter.replaceOp(dest, newCoreOp);
+  rewriter.replaceOp(source, newCoreOp);
+  return newCoreOp;
 }
 
 /// Clone CoreOp and add to or merge with coreContext.
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIECreateAIEWorkgroup.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIECreateAIEWorkgroup.h
index 445592647..5fddc61b6 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIECreateAIEWorkgroup.h
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIECreateAIEWorkgroup.h
@@ -168,7 +168,7 @@ class CoreContext {
     if (!existingCoreOp) {
       coreMap[coordinate] = coreOp;
     } else {
-      mergeCoreOps(coreOp, existingCoreOp);
+      coreMap[coordinate] = mergeCoreOps(coreOp, existingCoreOp);
     }
   }
 
@@ -179,9 +179,8 @@ class CoreContext {
   }
 
  private:
-  /// Merge the 'source' core operations in the end of the 'dest' core
-  /// operation.
-  void mergeCoreOps(AMDAIE::CoreOp source, AMDAIE::CoreOp dest);
+  /// Merge the 'source' and 'dest' core operations into a new one.
+  AMDAIE::CoreOp mergeCoreOps(AMDAIE::CoreOp source, AMDAIE::CoreOp dest);
 
   /// The rewriter to be used.
   IRRewriterAndMapper &rewriter;
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDistributeCoresAndObjectFifos.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDistributeCoresAndObjectFifos.cpp
index bfb24ecba..77370de67 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDistributeCoresAndObjectFifos.cpp
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDistributeCoresAndObjectFifos.cpp
@@ -527,25 +527,26 @@ LogicalResult insertLogicalObjectFifoAccess(ModuleOp moduleOp) {
     DenseMap<Value, std::tuple<AMDAIE::LogicalObjectFifoFromMemrefOp,
                                AMDAIE::MemoryAccess>>
         memrefToLogicalObjectFifo;
-    // First walk to collect consume/produce DMA accesses and map respective
-    // memrefs to logical objectifos.
-    coreOp->walk([&](Operation *op) {
-      // TODO(jornt): can we avoid produce/consume?
-      if (auto consumeOp = dyn_cast<AMDAIE::LogicalObjectFifoConsume>(op)) {
-        Value targetMemref =
-            consumeOp.getDmaCpyNdOp().getTargetObjectFifo().getMemref();
-        memrefToLogicalObjectFifo[targetMemref] =
-            std::make_pair(consumeOp.getDmaCpyNdOp().getTargetObjectFifo(),
-                           AMDAIE::MemoryAccess::Read);
-      } else if (auto produceOp =
-                     dyn_cast<AMDAIE::LogicalObjectFifoProduce>(op)) {
-        Value sourceMemref =
-            produceOp.getDmaCpyNdOp().getSourceObjectFifo().getMemref();
+
+    SmallVector<AMDAIE::DmaCpyNdOp> inputDmaOps =
+        llvm::map_to_vector(coreOp.getInputDmas(), [](Value inputDma) {
+          return cast<AMDAIE::DmaCpyNdOp>(inputDma.getDefiningOp());
+        });
+    for (AMDAIE::DmaCpyNdOp inputDmaOp : inputDmaOps) {
+      Value targetMemref = inputDmaOp.getTargetObjectFifo().getMemref();
+      memrefToLogicalObjectFifo[targetMemref] = std::make_pair(
+          inputDmaOp.getTargetObjectFifo(), AMDAIE::MemoryAccess::Read);
+    }
+    SmallVector<AMDAIE::DmaCpyNdOp> outputDmaOps =
+        llvm::map_to_vector(coreOp.getOutputDmas(), [](Value outputDma) {
+          return cast<AMDAIE::DmaCpyNdOp>(outputDma.getDefiningOp());
+        });
+    for (AMDAIE::DmaCpyNdOp outputDmaOp : outputDmaOps) {
+      Value sourceMemref = outputDmaOp.getSourceObjectFifo().getMemref();
         memrefToLogicalObjectFifo[sourceMemref] =
-            std::make_pair(produceOp.getDmaCpyNdOp().getSourceObjectFifo(),
+            std::make_pair(outputDmaOp.getSourceObjectFifo(),
                            AMDAIE::MemoryAccess::Write);
-      }
-    });
+    }
 
     // We maintain a map from AllocOp to LogicalObjectFifoAccessOp in order to
     // avoid creating a new LogicalObjectFifoAccessOp for the same AllocOp being
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEInsertCores.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEInsertCores.cpp
index 33d1ffbf1..25cc85c2a 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEInsertCores.cpp
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEInsertCores.cpp
@@ -71,45 +71,43 @@ LogicalResult insertCoreOps(mlir::ModuleOp moduleOp) {
     }
     Value threadX = attrMapping[gpu::threadX(forallOp->getContext())];
     Value threadY = attrMapping[gpu::threadY(forallOp->getContext())];
+
+    // Find input and output DMAs that need to be added to the core.
+    SmallVector<Value> inputDmas;
+    SmallVector<Value> outputDmas;
+    WalkResult dmaRes = forallOp->walk([&](AMDAIE::DmaCpyNdOp dmaOp) {
+      uint8_t sourceMemspace =
+          dmaOp.getSourceObjectFifo().getMemorySpaceAsUInt();
+      uint8_t targetMemspace =
+          dmaOp.getTargetObjectFifo().getMemorySpaceAsUInt();
+      if (sourceMemspace == 2 && targetMemspace == 2) {
+        dmaOp->emitOpError()
+            << "dma op with both source and target on L1 is not supported";
+        return WalkResult::interrupt();
+      } else if (sourceMemspace == 2) {
+        outputDmas.push_back(dmaOp);
+      } else if (targetMemspace == 2) {
+        inputDmas.push_back(dmaOp);
+      }
+      return WalkResult::advance();
+    });
+    if (dmaRes.wasInterrupted()) return WalkResult::interrupt();
+
     // Create CoreOp at the end of the innermost forall
     rewriter.setInsertionPoint(forallOp.getBody()->getTerminator());
-    auto coreOp = rewriter.create<AMDAIE::CoreOp>(rewriter.getUnknownLoc(),
-                                                  threadX, threadY);
+    auto coreOp = rewriter.create<AMDAIE::CoreOp>(
+        rewriter.getUnknownLoc(), threadX, threadY, inputDmas, outputDmas);
     Region &region = coreOp.getRegion();
     Block *newBlock = rewriter.createBlock(&region);
     rewriter.setInsertionPointToStart(newBlock);
     auto endOp = rewriter.create<AMDAIE::EndOp>(rewriter.getUnknownLoc());
 
     // Walk all operations in the workgroup and fill in the CoreOp with
-    // computational ops (linalg) and synchronization ops to synchronize
-    // with the workgroup DMA ops.
+    // computational ops.
     WalkResult forallRes = forallOp->walk([&](Operation *op) {
       // Skip operations already inside core ops
       if (op->getParentOfType<AMDAIE::CoreOp>()) return WalkResult::advance();
-      if (auto dmaOp = dyn_cast<AMDAIE::DmaCpyNdOp>(op)) {
-        auto sourceMemspace = dmaOp.getSourceObjectFifo().getMemorySpace();
-        auto targetMemspace = dmaOp.getTargetObjectFifo().getMemorySpace();
-        if (sourceMemspace &&
-            dyn_cast<IntegerAttr>(sourceMemspace).getInt() == 2 &&
-            targetMemspace &&
-            dyn_cast<IntegerAttr>(targetMemspace).getInt() == 2) {
-          dmaOp->emitOpError()
-              << "dma op with both source and target on L1 is not supported";
-          return WalkResult::interrupt();
-        } else if (sourceMemspace &&
-                   dyn_cast<IntegerAttr>(sourceMemspace).getInt() == 2) {
-          // From L1, so insert a logical objectFifo produce op
-          rewriter.setInsertionPoint(endOp);
-          rewriter.create<AMDAIE::LogicalObjectFifoProduce>(
-              rewriter.getUnknownLoc(), SmallVector<Type, 1>{}, dmaOp);
-        } else if (targetMemspace &&
-                   dyn_cast<IntegerAttr>(targetMemspace).getInt() == 2) {
-          // To L1, so insert a logical objectFifo consume op
-          rewriter.setInsertionPoint(endOp);
-          rewriter.create<AMDAIE::LogicalObjectFifoConsume>(
-              rewriter.getUnknownLoc(), SmallVector<Type, 1>{}, dmaOp);
-        }
-      } else if (auto linalgOp = dyn_cast<linalg::LinalgOp>(op)) {
+      if (auto linalgOp = dyn_cast<linalg::LinalgOp>(op)) {
         rewriter.setInsertionPoint(endOp);
         rewriter.moveOpBefore(linalgOp, endOp);
       } else if (isa<vector::ContractionOp>(op)) {
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELowerToAIE.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELowerToAIE.cpp
index 8c1f21e48..da63935e3 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELowerToAIE.cpp
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELowerToAIE.cpp
@@ -363,16 +363,6 @@ LogicalResult coreToAIE(IRRewriter &rewriter, AMDAIE::CoreOp coreOp,
             .Case<AMDAIE::LogicalObjectFifoAcquire>([&](auto acquireOp) {
               return acquireOpToAIE(rewriter, acquireOp, mapper, toBeErased);
             })
-            .Case<AMDAIE::LogicalObjectFifoConsume>([&](auto consumeOp) {
-              // TODO(jornt): get rid of LogicalObjectFifoConsume before this
-              rewriter.eraseOp(consumeOp);
-              return success();
-            })
-            .Case<AMDAIE::LogicalObjectFifoProduce>([&](auto produceOp) {
-              // TODO(jornt): get rid of LogicalObjectFifoProduce before this
-              rewriter.eraseOp(produceOp);
-              return success();
-            })
             .Case<AMDAIE::LogicalObjectFifoRelease>([&](auto releaseOp) {
               return coreReleaseOpToAIE(rewriter, releaseOp, mapper,
                                         toBeErased);
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/access_to_acquire_release.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/access_to_acquire_release.mlir
index afdd2fb4e..3dce5d5bb 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/access_to_acquire_release.mlir
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/access_to_acquire_release.mlir
@@ -12,9 +12,8 @@ func.func @read_access(%arg0: !amdaie.logicalobjectfifo<memref<1x1x8x16xi32, 2>>
   %c0 = arith.constant 0 : index
   %tile = amdaie.tile(%c0, %c0)
   %2 = amdaie.circular_dma_cpy_nd(%arg0[] [] [], %arg1[] [] []) : (!amdaie.logicalobjectfifo<memref<1x1x8x16xi32, 2>>, !amdaie.logicalobjectfifo<memref<8x16xi32, 1>>)
-  %core = amdaie.core(%tile) {
+  %core = amdaie.core(%tile, in : [%2], out : []) {
     %3 = amdaie.logicalobjectfifo.access(%arg0, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x16xi32, 2>> -> memref<1x1x8x16xi32, 2>
-    amdaie.logicalobjectfifo.consume(%2)
     linalg.fill ins(%c0_i32 : i32) outs(%3 : memref<1x1x8x16xi32, 2>)
     amdaie.end
   }
@@ -35,10 +34,9 @@ func.func @write_access(%arg0: !amdaie.logicalobjectfifo<memref<1x1x8x16xi32, 2>
   %c0_i32 = arith.constant 0 : i32
   %tile = amdaie.tile(%c0, %c0)
   %2 = amdaie.circular_dma_cpy_nd(%arg1[] [] [], %arg0[] [] []) : (!amdaie.logicalobjectfifo<memref<8x16xi32, 1>>, !amdaie.logicalobjectfifo<memref<1x1x8x16xi32, 2>>)
-  %core = amdaie.core(%tile) {
+  %core = amdaie.core(%tile, in : [], out : [%2]) {
     %3 = amdaie.logicalobjectfifo.access(%arg0, Write) : !amdaie.logicalobjectfifo<memref<1x1x8x16xi32, 2>> -> memref<1x1x8x16xi32, 2>
     linalg.fill ins(%c0_i32 : i32) outs(%3 : memref<1x1x8x16xi32, 2>)
-    amdaie.logicalobjectfifo.produce(%2)
     amdaie.end
   }
   return
@@ -55,7 +53,7 @@ func.func @none_access(%arg0: !amdaie.logicalobjectfifo<memref<1x1x8x16xi32, 2>>
   %c0_i32 = arith.constant 0 : i32
   %c0 = arith.constant 0 : index
   %tile = amdaie.tile(%c0, %c0)
-  %core = amdaie.core(%tile) {
+  %core = amdaie.core(%tile, in : [], out : []) {
     %3 = amdaie.logicalobjectfifo.access(%arg0, None) : !amdaie.logicalobjectfifo<memref<1x1x8x16xi32, 2>> -> memref<1x1x8x16xi32, 2>
     linalg.fill ins(%c0_i32 : i32) outs(%3 : memref<1x1x8x16xi32, 2>)
     amdaie.end
@@ -74,7 +72,7 @@ func.func @any_access(%arg0: !amdaie.logicalobjectfifo<memref<1x1x8x16xi32, 2>>)
   %c0_i32 = arith.constant 0 : i32
   %c0 = arith.constant 0 : index
   %tile = amdaie.tile(%c0, %c0)
-  %core = amdaie.core(%tile) {
+  %core = amdaie.core(%tile, in : [], out : []) {
     %3 = amdaie.logicalobjectfifo.access(%arg0, Any) : !amdaie.logicalobjectfifo<memref<1x1x8x16xi32, 2>> -> memref<1x1x8x16xi32, 2>
     linalg.fill ins(%c0_i32 : i32) outs(%3 : memref<1x1x8x16xi32, 2>)
     amdaie.end
@@ -102,13 +100,11 @@ func.func @read_and_write(%arg0: !amdaie.logicalobjectfifo<memref<1x1x8x16xi32,
   %tile = amdaie.tile(%c0, %c0)
   %2 = amdaie.circular_dma_cpy_nd(%arg0[] [] [], %arg1[] [] []) : (!amdaie.logicalobjectfifo<memref<1x1x8x16xi32, 2>>, !amdaie.logicalobjectfifo<memref<8x16xi32, 1>>)
   %3 = amdaie.circular_dma_cpy_nd(%arg3[] [] [], %arg2[] [] []) : (!amdaie.logicalobjectfifo<memref<8x16xi32, 1>>, !amdaie.logicalobjectfifo<memref<1x1x8x16xi32, 2>>)
-  %core = amdaie.core(%tile) {
+  %core = amdaie.core(%tile, in : [%2], out : [%3]) {
     %4 = amdaie.logicalobjectfifo.access(%arg0, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x16xi32, 2>> -> memref<1x1x8x16xi32, 2>
     %5 = amdaie.logicalobjectfifo.access(%arg2, Write) : !amdaie.logicalobjectfifo<memref<1x1x8x16xi32, 2>> -> memref<1x1x8x16xi32, 2>
-    amdaie.logicalobjectfifo.consume(%2)
     linalg.fill ins(%c0_i32 : i32) outs(%4 : memref<1x1x8x16xi32, 2>)
     linalg.fill ins(%c0_i32 : i32) outs(%5 : memref<1x1x8x16xi32, 2>)
-    amdaie.logicalobjectfifo.produce(%3)
     amdaie.end
   }
   return
@@ -146,21 +142,17 @@ func.func @read_write_multiple_blocks(%arg0: !amdaie.logicalobjectfifo<memref<1x
   %tile = amdaie.tile(%c0, %c0)
   %2 = amdaie.circular_dma_cpy_nd(%arg0[] [] [], %arg1[] [] []) : (!amdaie.logicalobjectfifo<memref<1x1x8x16xi32, 2>>, !amdaie.logicalobjectfifo<memref<8x16xi32, 1>>)
   %3 = amdaie.circular_dma_cpy_nd(%arg3[] [] [], %arg2[] [] []) : (!amdaie.logicalobjectfifo<memref<8x16xi32, 1>>, !amdaie.logicalobjectfifo<memref<1x1x8x16xi32, 2>>)
-  %core = amdaie.core(%tile) {
+  %core = amdaie.core(%tile, in : [%2], out : [%3]) {
     %4 = amdaie.logicalobjectfifo.access(%arg0, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x16xi32, 2>> -> memref<1x1x8x16xi32, 2>
-    amdaie.logicalobjectfifo.consume(%2)
     linalg.fill ins(%c0_i32 : i32) outs(%4 : memref<1x1x8x16xi32, 2>)
     scf.for %arg = %c0 to %c8 step %c1  {
       %5 = amdaie.logicalobjectfifo.access(%arg0, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x16xi32, 2>> -> memref<1x1x8x16xi32, 2>
-      amdaie.logicalobjectfifo.consume(%2)
       linalg.fill ins(%c0_i32 : i32) outs(%5 : memref<1x1x8x16xi32, 2>)
     }
     %6 = amdaie.logicalobjectfifo.access(%arg0, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x16xi32, 2>> -> memref<1x1x8x16xi32, 2>
     %7 = amdaie.logicalobjectfifo.access(%arg2, Write) : !amdaie.logicalobjectfifo<memref<1x1x8x16xi32, 2>> -> memref<1x1x8x16xi32, 2>
-    amdaie.logicalobjectfifo.consume(%2)    
     linalg.fill ins(%c0_i32 : i32) outs(%6 : memref<1x1x8x16xi32, 2>)
     linalg.fill ins(%c0_i32 : i32) outs(%7 : memref<1x1x8x16xi32, 2>)
-    amdaie.logicalobjectfifo.produce(%3)
     amdaie.end
   }
   return
@@ -187,11 +179,9 @@ func.func @multiple_reads_deterministic_order(%arg0: !amdaie.logicalobjectfifo<m
   %tile = amdaie.tile(%c0, %c0)
   %2 = amdaie.circular_dma_cpy_nd(%arg0[] [] [], %arg1[] [] []) : (!amdaie.logicalobjectfifo<memref<1x1x8x16xi32, 2>>, !amdaie.logicalobjectfifo<memref<8x16xi32, 1>>)
   %3 = amdaie.circular_dma_cpy_nd(%arg2[] [] [], %arg3[] [] []) : (!amdaie.logicalobjectfifo<memref<1x1x8x16xi32, 2>>, !amdaie.logicalobjectfifo<memref<8x16xi32, 1>>)
-  %core = amdaie.core(%tile) {
+  %core = amdaie.core(%tile, in : [%2, %3], out : []) {
     %4 = amdaie.logicalobjectfifo.access(%arg0, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x16xi32, 2>> -> memref<1x1x8x16xi32, 2>
     %5 = amdaie.logicalobjectfifo.access(%arg2, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x16xi32, 2>> -> memref<1x1x8x16xi32, 2>
-    amdaie.logicalobjectfifo.consume(%2)
-    amdaie.logicalobjectfifo.consume(%3)
     linalg.fill ins(%c0_i32 : i32) outs(%4 : memref<1x1x8x16xi32, 2>)
     linalg.fill ins(%c0_i32 : i32) outs(%5 : memref<1x1x8x16xi32, 2>)
     amdaie.end
@@ -220,13 +210,11 @@ func.func @multiple_writes_deterministic_order(%arg0: !amdaie.logicalobjectfifo<
   %tile = amdaie.tile(%c0, %c0)
   %2 = amdaie.circular_dma_cpy_nd(%arg1[] [] [], %arg0[] [] []) : (!amdaie.logicalobjectfifo<memref<8x16xi32, 1>>, !amdaie.logicalobjectfifo<memref<1x1x8x16xi32, 2>>)
   %3 = amdaie.circular_dma_cpy_nd(%arg3[] [] [], %arg2[] [] []) : (!amdaie.logicalobjectfifo<memref<8x16xi32, 1>>, !amdaie.logicalobjectfifo<memref<1x1x8x16xi32, 2>>)
-  %core = amdaie.core(%tile) {
+  %core = amdaie.core(%tile, in : [], out : [%2, %3]) {
     %4 = amdaie.logicalobjectfifo.access(%arg0, Write) : !amdaie.logicalobjectfifo<memref<1x1x8x16xi32, 2>> -> memref<1x1x8x16xi32, 2>
     %5 = amdaie.logicalobjectfifo.access(%arg2, Write) : !amdaie.logicalobjectfifo<memref<1x1x8x16xi32, 2>> -> memref<1x1x8x16xi32, 2>
     linalg.fill ins(%c0_i32 : i32) outs(%4 : memref<1x1x8x16xi32, 2>)
     linalg.fill ins(%c0_i32 : i32) outs(%5 : memref<1x1x8x16xi32, 2>)
-    amdaie.logicalobjectfifo.produce(%2)
-    amdaie.logicalobjectfifo.produce(%3)
     amdaie.end
   }
   return
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/canonicalize_doubly_strided_op.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/canonicalize_doubly_strided_op.mlir
index 77950fd7b..951f9f6d8 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/canonicalize_doubly_strided_op.mlir
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/canonicalize_doubly_strided_op.mlir
@@ -1,5 +1,5 @@
-// RUN: iree-opt --pass-pipeline="builtin.module(func.func(iree-amdaie-canonicalize-doubly-strided-op,canonicalize))" %s | FileCheck %s
-// RUN: iree-opt --pass-pipeline="builtin.module(func.func(iree-amdaie-canonicalize-doubly-strided-op{fold-single-dims=true},canonicalize))" %s | FileCheck %s --check-prefix=FOLD-SINGLE-DIMS
+// RUN: iree-opt --pass-pipeline="builtin.module(func.func(iree-amdaie-canonicalize-doubly-strided-op,canonicalize))" -allow-unregistered-dialect %s | FileCheck %s
+// RUN: iree-opt --pass-pipeline="builtin.module(func.func(iree-amdaie-canonicalize-doubly-strided-op{fold-single-dims=true},canonicalize))" -allow-unregistered-dialect %s | FileCheck %s --check-prefix=FOLD-SINGLE-DIMS
 
 // Verify that source and target of `amdaie.circular_dma_cpy_nd` is still correct after canonicalization.
 //
@@ -18,7 +18,7 @@
 // FOLD-SINGLE-DIMS:        amdaie.circular_dma_cpy_nd(%[[ARG0]][] [] [], %[[ARG1]][] [] [])
 func.func @circular_dma_cpy_nd_source_target(%arg0: !amdaie.logicalobjectfifo<memref<1x1x8x16xi32, 1>>, %arg1: !amdaie.logicalobjectfifo<memref<8x16xi32, 1>>) {
   %0 = amdaie.circular_dma_cpy_nd(%arg0[0, 0, 0, 0] [1, 1, 8, 16] [128, 128, 16, 1], %arg1[0, 0, 0, 0] [1, 4, 2, 8] [64, 16, 8, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x16xi32, 1>>, !amdaie.logicalobjectfifo<memref<8x16xi32, 1>>)
-  amdaie.logicalobjectfifo.consume(%0)
+  "iree.keep"(%0) : (index) -> ()
   return
 }
 
@@ -33,7 +33,7 @@ func.func @circular_dma_cpy_nd_source_target(%arg0: !amdaie.logicalobjectfifo<me
 // FOLD-SINGLE-DIMS:  amdaie.circular_dma_cpy_nd(%{{.+}}[] [] [], %{{.+}}[] [] [])
 func.func @circular_dma_cpy_nd_linear_implicit(%arg0: !amdaie.logicalobjectfifo<memref<1x1x8x16xi32, 1>>, %arg1: !amdaie.logicalobjectfifo<memref<8x16xi32, 1>>) {
   %0 = amdaie.circular_dma_cpy_nd(%arg0[0, 0, 0, 0] [1, 1, 8, 16] [128, 128, 16, 1], %arg1[0, 0, 0, 0] [1, 4, 2, 8] [64, 16, 8, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x16xi32, 1>>, !amdaie.logicalobjectfifo<memref<8x16xi32, 1>>)
-  amdaie.logicalobjectfifo.consume(%0)
+  "iree.keep"(%0) : (index) -> ()
   return
 }
 
@@ -57,7 +57,7 @@ func.func @circular_dma_cpy_nd_linear_implicit(%arg0: !amdaie.logicalobjectfifo<
 func.func @circular_dma_cpy_nd_linear(%arg0: !amdaie.logicalobjectfifo<memref<1x1x8x16xi32, 1>>, %arg1: !amdaie.logicalobjectfifo<memref<8x16xi32, 1>>) {
   %c16 = arith.constant 16 : index
   %0 = amdaie.circular_dma_cpy_nd(%arg0[0, 0, 0, 0] [1, 2, 8, 8] [256, 128, %c16, 1], %arg1[0, 0, 0, 0] [64, 16, 8, %c16] [128, %c16, %c16, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x16xi32, 1>>, !amdaie.logicalobjectfifo<memref<8x16xi32, 1>>)
-  amdaie.logicalobjectfifo.consume(%0)
+  "iree.keep"(%0) : (index) -> ()
   return
 }
 
@@ -68,7 +68,7 @@ func.func @circular_dma_cpy_nd_linear(%arg0: !amdaie.logicalobjectfifo<memref<1x
 // FOLD-SINGLE-DIMS:  amdaie.circular_dma_cpy_nd(%{{.+}}[0, 0, 0, 0] [2, 2, 8, 8] [256, 64, 16, 1], %{{.+}}[0, 0, 0, 0] [2, 2, 8, 16] [128, 16, 8, 1])
 func.func @circular_dma_cpy_nd_no_linear(%arg0: !amdaie.logicalobjectfifo<memref<1x1x8x16xi32, 1>>, %arg1: !amdaie.logicalobjectfifo<memref<8x16xi32, 1>>) {
   %0 = amdaie.circular_dma_cpy_nd(%arg0[0, 0, 0, 0] [2, 2, 8, 8] [256, 64, 16, 1], %arg1[0, 0, 0, 0] [2, 2, 8, 16] [128, 16, 8, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x16xi32, 1>>, !amdaie.logicalobjectfifo<memref<8x16xi32, 1>>)
-  amdaie.logicalobjectfifo.consume(%0)
+  "iree.keep"(%0) : (index) -> ()
   return
 }
 
@@ -90,7 +90,7 @@ func.func @circular_dma_cpy_nd_no_linear(%arg0: !amdaie.logicalobjectfifo<memref
 // FOLD-SINGLE-DIMS:      amdaie.circular_dma_cpy_nd(%{{.+}}[] [] [], %{{.+}}[%[[C0]], %[[C0]], %[[C0]]] [%[[C2]], %[[C8]], %[[C8]]] [%[[C8]], %[[C16]], %[[C1]]])
 func.func @circular_dma_cpy_nd_unit(%arg0: !amdaie.logicalobjectfifo<memref<1x1x2x2x4x8xi32, 1>>, %arg1: !amdaie.logicalobjectfifo<memref<1x1x8x16xi32, 1>>) {
   %0 = amdaie.circular_dma_cpy_nd(%arg0[0, 0, 0, 0, 0, 0] [1, 1, 2, 2, 4, 8] [128, 128, 64, 32, 8, 1], %arg1[0, 0, 0, 0, 0, 0] [1, 1, 2, 2, 4, 8] [128, 128, 8, 64, 16, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x2x2x4x8xi32, 1>>, !amdaie.logicalobjectfifo<memref<1x1x8x16xi32, 1>>)
-  amdaie.logicalobjectfifo.consume(%0)
+  "iree.keep"(%0) : (index) -> ()
   return
 }
 
@@ -104,7 +104,7 @@ func.func @circular_dma_cpy_nd_unit(%arg0: !amdaie.logicalobjectfifo<memref<1x1x
 // FOLD-SINGLE-DIMS:  amdaie.circular_dma_cpy_nd(%{{.+}}[] [] [], %{{.+}}[] [] [])
 func.func @circular_dma_cpy_nd_unit_between_linear(%arg0: !amdaie.logicalobjectfifo<memref<1x1x2x2x4x8xi32, 1>>, %arg1: !amdaie.logicalobjectfifo<memref<1x1x8x16xi32, 1>>) {
   %0 = amdaie.circular_dma_cpy_nd(%arg0[0, 0, 0, 0, 0, 0] [1, 2, 2, 4, 1, 8] [128, 64, 32, 8, 8, 1], %arg1[0, 0, 0, 0, 0, 0] [2, 2, 1, 4, 8, 1] [64, 32, 32, 8, 1, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x2x2x4x8xi32, 1>>, !amdaie.logicalobjectfifo<memref<1x1x8x16xi32, 1>>)
-  amdaie.logicalobjectfifo.consume(%0)
+  "iree.keep"(%0) : (index) -> ()
   return
 }
 
@@ -115,7 +115,7 @@ func.func @circular_dma_cpy_nd_unit_between_linear(%arg0: !amdaie.logicalobjectf
 // FOLD-SINGLE-DIMS:  amdaie.circular_dma_cpy_nd(%{{.+}}[1, 1, 1, 1] [1, 1, 8, 16] [128, 128, 16, 1], %{{.+}}[1, 1, 1, 1] [1, 4, 2, 8] [64, 16, 8, 1])
 func.func @circular_dma_cpy_nd_non_zero_offset(%arg0: !amdaie.logicalobjectfifo<memref<1x1x8x16xi32, 1>>, %arg1: !amdaie.logicalobjectfifo<memref<8x16xi32, 1>>) {
   %0 = amdaie.circular_dma_cpy_nd(%arg0[1, 1, 1, 1] [1, 1, 8, 16] [128, 128, 16, 1], %arg1[1, 1, 1, 1] [1, 4, 2, 8] [64, 16, 8, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x16xi32, 1>>, !amdaie.logicalobjectfifo<memref<8x16xi32, 1>>)
-  amdaie.logicalobjectfifo.consume(%0)
+  "iree.keep"(%0) : (index) -> ()
   return
 }
 
@@ -132,7 +132,7 @@ func.func @circular_dma_cpy_nd_non_zero_offset(%arg0: !amdaie.logicalobjectfifo<
 // FOLD-SINGLE-DIMS:      amdaie.circular_dma_cpy_nd(%{{.+}}[%[[C1]]] [%[[C128]]] [%[[C1]]], %{{.+}}[%[[C1]]] [%[[C64]]] [%[[C1]]])
 func.func @circular_dma_cpy_nd_partial_non_zero_offset(%arg0: !amdaie.logicalobjectfifo<memref<1x1x8x16xi32, 1>>, %arg1: !amdaie.logicalobjectfifo<memref<8x16xi32, 1>>) {
   %0 = amdaie.circular_dma_cpy_nd(%arg0[0, 0, 0, 1] [1, 1, 8, 16] [128, 128, 16, 1], %arg1[0, 0, 0, 1] [1, 4, 2, 8] [64, 16, 8, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x16xi32, 1>>, !amdaie.logicalobjectfifo<memref<8x16xi32, 1>>)
-  amdaie.logicalobjectfifo.consume(%0)
+  "iree.keep"(%0) : (index) -> ()
   return
 }
 
@@ -155,7 +155,7 @@ func.func @circular_dma_cpy_nd_partial_non_zero_offset(%arg0: !amdaie.logicalobj
 // FOLD-SINGLE-DIMS:        amdaie.dma_cpy_nd(%[[ARG0]][] [] [], %[[ARG1]][] [] [])
 func.func @dma_cpy_nd_source_target(%arg0: !amdaie.logicalobjectfifo<memref<1x1x8x16xi32, 1>>, %arg1: !amdaie.logicalobjectfifo<memref<8x16xi32, 1>>) {
   %0 = amdaie.dma_cpy_nd(%arg0[0, 0, 0, 0] [1, 1, 8, 16] [128, 128, 16, 1], %arg1[0, 0, 0, 0] [1, 4, 2, 8] [64, 16, 8, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x16xi32, 1>>, !amdaie.logicalobjectfifo<memref<8x16xi32, 1>>)
-  amdaie.logicalobjectfifo.consume(%0)
+  "iree.keep"(%0) : (index) -> ()
   return
 }
 
@@ -170,7 +170,7 @@ func.func @dma_cpy_nd_source_target(%arg0: !amdaie.logicalobjectfifo<memref<1x1x
 // FOLD-SINGLE-DIMS:  amdaie.dma_cpy_nd(%{{.+}}[] [] [], %{{.+}}[] [] [])
 func.func @dma_cpy_nd_linear_implicit(%arg0: !amdaie.logicalobjectfifo<memref<1x1x8x16xi32, 1>>, %arg1: !amdaie.logicalobjectfifo<memref<8x16xi32, 1>>) {
   %0 = amdaie.dma_cpy_nd(%arg0[0, 0, 0, 0] [1, 1, 8, 16] [128, 128, 16, 1], %arg1[0, 0, 0, 0] [1, 4, 2, 8] [64, 16, 8, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x16xi32, 1>>, !amdaie.logicalobjectfifo<memref<8x16xi32, 1>>)
-  amdaie.logicalobjectfifo.consume(%0)
+  "iree.keep"(%0) : (index) -> ()
   return
 }
 
@@ -194,7 +194,7 @@ func.func @dma_cpy_nd_linear_implicit(%arg0: !amdaie.logicalobjectfifo<memref<1x
 func.func @dma_cpy_nd_linear(%arg0: !amdaie.logicalobjectfifo<memref<1x1x8x16xi32, 1>>, %arg1: !amdaie.logicalobjectfifo<memref<8x16xi32, 1>>) {
   %c16 = arith.constant 16 : index
   %0 = amdaie.dma_cpy_nd(%arg0[0, 0, 0, 0] [1, 2, 8, 8] [256, 128, %c16, 1], %arg1[0, 0, 0, 0] [64, 16, 8, %c16] [128, %c16, %c16, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x16xi32, 1>>, !amdaie.logicalobjectfifo<memref<8x16xi32, 1>>)
-  amdaie.logicalobjectfifo.consume(%0)
+  "iree.keep"(%0) : (index) -> ()
   return
 }
 
@@ -205,7 +205,7 @@ func.func @dma_cpy_nd_linear(%arg0: !amdaie.logicalobjectfifo<memref<1x1x8x16xi3
 // FOLD-SINGLE-DIMS:  amdaie.dma_cpy_nd(%{{.+}}[0, 0, 0, 0] [2, 2, 8, 8] [256, 64, 16, 1], %{{.+}}[0, 0, 0, 0] [2, 2, 8, 16] [128, 16, 8, 1])
 func.func @dma_cpy_nd_no_linear(%arg0: !amdaie.logicalobjectfifo<memref<1x1x8x16xi32, 1>>, %arg1: !amdaie.logicalobjectfifo<memref<8x16xi32, 1>>) {
   %0 = amdaie.dma_cpy_nd(%arg0[0, 0, 0, 0] [2, 2, 8, 8] [256, 64, 16, 1], %arg1[0, 0, 0, 0] [2, 2, 8, 16] [128, 16, 8, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x16xi32, 1>>, !amdaie.logicalobjectfifo<memref<8x16xi32, 1>>)
-  amdaie.logicalobjectfifo.consume(%0)
+  "iree.keep"(%0) : (index) -> ()
   return
 }
 
@@ -227,7 +227,7 @@ func.func @dma_cpy_nd_no_linear(%arg0: !amdaie.logicalobjectfifo<memref<1x1x8x16
 // FOLD-SINGLE-DIMS:      amdaie.dma_cpy_nd(%{{.+}}[] [] [], %{{.+}}[%[[C0]], %[[C0]], %[[C0]]] [%[[C2]], %[[C8]], %[[C8]]] [%[[C8]], %[[C16]], %[[C1]]])
 func.func @dma_cpy_nd_unit(%arg0: !amdaie.logicalobjectfifo<memref<1x1x2x2x4x8xi32, 1>>, %arg1: !amdaie.logicalobjectfifo<memref<1x1x8x16xi32, 1>>) {
   %0 = amdaie.dma_cpy_nd(%arg0[0, 0, 0, 0, 0, 0] [1, 1, 2, 2, 4, 8] [128, 128, 64, 32, 8, 1], %arg1[0, 0, 0, 0, 0, 0] [1, 1, 2, 2, 4, 8] [128, 128, 8, 64, 16, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x2x2x4x8xi32, 1>>, !amdaie.logicalobjectfifo<memref<1x1x8x16xi32, 1>>)
-  amdaie.logicalobjectfifo.consume(%0)
+  "iree.keep"(%0) : (index) -> ()
   return
 }
 
@@ -241,7 +241,7 @@ func.func @dma_cpy_nd_unit(%arg0: !amdaie.logicalobjectfifo<memref<1x1x2x2x4x8xi
 // FOLD-SINGLE-DIMS:  amdaie.dma_cpy_nd(%{{.+}}[] [] [], %{{.+}}[] [] [])
 func.func @dma_cpy_nd_unit_between_linear(%arg0: !amdaie.logicalobjectfifo<memref<1x1x2x2x4x8xi32, 1>>, %arg1: !amdaie.logicalobjectfifo<memref<1x1x8x16xi32, 1>>) {
   %0 = amdaie.dma_cpy_nd(%arg0[0, 0, 0, 0, 0, 0] [2, 2, 1, 1, 4, 8] [64, 32, 32, 32, 8, 1], %arg1[0, 0, 0, 0, 0, 0] [2, 1, 2, 1, 4, 8] [64, 64, 32, 32, 8, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x2x2x4x8xi32, 1>>, !amdaie.logicalobjectfifo<memref<1x1x8x16xi32, 1>>)
-  amdaie.logicalobjectfifo.consume(%0)
+  "iree.keep"(%0) : (index) -> ()
   return
 }
 
@@ -252,7 +252,7 @@ func.func @dma_cpy_nd_unit_between_linear(%arg0: !amdaie.logicalobjectfifo<memre
 // FOLD-SINGLE-DIMS:  amdaie.dma_cpy_nd(%{{.+}}[1, 1, 1, 1] [1, 1, 8, 16] [128, 128, 16, 1], %{{.+}}[1, 1, 1, 1] [1, 4, 2, 8] [64, 16, 8, 1])
 func.func @dma_cpy_nd_non_zero_offset(%arg0: !amdaie.logicalobjectfifo<memref<1x1x8x16xi32, 1>>, %arg1: !amdaie.logicalobjectfifo<memref<8x16xi32, 1>>) {
   %0 = amdaie.dma_cpy_nd(%arg0[1, 1, 1, 1] [1, 1, 8, 16] [128, 128, 16, 1], %arg1[1, 1, 1, 1] [1, 4, 2, 8] [64, 16, 8, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x16xi32, 1>>, !amdaie.logicalobjectfifo<memref<8x16xi32, 1>>)
-  amdaie.logicalobjectfifo.consume(%0)
+  "iree.keep"(%0) : (index) -> ()
   return
 }
 
@@ -269,7 +269,7 @@ func.func @dma_cpy_nd_non_zero_offset(%arg0: !amdaie.logicalobjectfifo<memref<1x
 // FOLD-SINGLE-DIMS:      amdaie.dma_cpy_nd(%{{.+}}[%[[C1]]] [%[[C128]]] [%[[C1]]], %{{.+}}[%[[C1]]] [%[[C64]]] [%[[C1]]])
 func.func @dma_cpy_nd_partial_non_zero_offset(%arg0: !amdaie.logicalobjectfifo<memref<1x1x8x16xi32, 1>>, %arg1: !amdaie.logicalobjectfifo<memref<8x16xi32, 1>>) {
   %0 = amdaie.dma_cpy_nd(%arg0[0, 0, 0, 1] [1, 1, 8, 16] [128, 128, 16, 1], %arg1[0, 0, 0, 1] [1, 4, 2, 8] [64, 16, 8, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x16xi32, 1>>, !amdaie.logicalobjectfifo<memref<8x16xi32, 1>>)
-  amdaie.logicalobjectfifo.consume(%0)
+  "iree.keep"(%0) : (index) -> ()
   return
 }
 
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/create_aie_workgroup.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/create_aie_workgroup.mlir
index b3976a2ab..eb28060b2 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/create_aie_workgroup.mlir
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/create_aie_workgroup.mlir
@@ -40,18 +40,18 @@ func.func @circular_dma_cpy_nd(%arg0: !amdaie.logicalobjectfifo<memref<1x1x8x16x
 // CHECK:       amdaie.workgroup
 // CHECK:         %[[TILE_0:.+]] = amdaie.tile
 // CHECK:         %[[TILE_1:.+]] = amdaie.tile
-// CHECK:         %{{.+}} = amdaie.core(%[[TILE_0]])
-// CHECK:         %{{.+}} = amdaie.core(%[[TILE_1]])
+// CHECK:         %{{.+}} = amdaie.core(%[[TILE_0]], in : [], out : [])
+// CHECK:         %{{.+}} = amdaie.core(%[[TILE_1]], in : [], out : [])
 // CHECK:         amdaie.controlcode
 func.func @core() {
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
   %tile_0_0 = amdaie.tile(%c0, %c0)
   %tile_0_1 = amdaie.tile(%c0, %c1)
-  %core_0_0 = amdaie.core(%tile_0_0) {
+  %core_0_0 = amdaie.core(%tile_0_0, in : [], out : []) {
     amdaie.end
   }
-  %core_0_1 = amdaie.core(%tile_0_1) {
+  %core_0_1 = amdaie.core(%tile_0_1, in : [], out : []) {
     amdaie.end
   }
   return
@@ -195,9 +195,9 @@ func.func @for() {
 // CHECK-DAG:     %[[C8:.+]] = arith.constant 8 : index
 // CHECK-DAG:     %[[TILE_0:.+]] = amdaie.tile(%[[C0]], %[[C0]])
 // CHECK-DAG:     %[[TILE_1:.+]] = amdaie.tile(%[[C0]], %[[C1]])
-// CHECK-DAG:     %{{.+}} = amdaie.core(%[[TILE_0]])
+// CHECK-DAG:     %{{.+}} = amdaie.core(%[[TILE_0]], in : [], out : [])
 // CHECK:           scf.for %{{.*}} = %[[C0]] to %[[C8]] step %[[C1]]
-// CHECK-DAG:     %{{.+}} = amdaie.core(%[[TILE_1]])
+// CHECK-DAG:     %{{.+}} = amdaie.core(%[[TILE_1]], in : [], out : [])
 // CHECK:           scf.for %{{.*}} = %[[C0]] to %[[C8]] step %[[C1]]
 // CHECK:         amdaie.controlcode
 // CHECK-DAG:       %[[C0_1:.+]] = arith.constant 0 : index
@@ -211,10 +211,10 @@ func.func @for_cores() {
   scf.for %arg2 = %c0 to %c8 step %c1  {
     %tile_0_0 = amdaie.tile(%c0, %c0)
     %tile_0_1 = amdaie.tile(%c0, %c1)
-    %core_0_0 = amdaie.core(%tile_0_0) {
+    %core_0_0 = amdaie.core(%tile_0_0, in : [], out : []) {
       amdaie.end
     }
-    %core_0_1 = amdaie.core(%tile_0_1) {
+    %core_0_1 = amdaie.core(%tile_0_1, in : [], out : []) {
       amdaie.end
     }
   }
@@ -281,9 +281,9 @@ func.func @forall() {
 // CHECK:       amdaie.workgroup
 // CHECK:         %[[TILE_0:.+]] = amdaie.tile
 // CHECK:         %[[TILE_1:.+]] = amdaie.tile
-// CHECK:         %{{.+}} = amdaie.core(%[[TILE_0]])
+// CHECK:         %{{.+}} = amdaie.core(%[[TILE_0]], in : [], out : [])
 // CHECK:           scf.forall (%{{.*}}, %{{.*}}) in (1, 2)
-// CHECK:         %{{.+}} = amdaie.core(%[[TILE_1]])
+// CHECK:         %{{.+}} = amdaie.core(%[[TILE_1]], in : [], out : [])
 // CHECK:           scf.forall (%{{.*}}, %{{.*}}) in (1, 2)
 // CHECK:         amdaie.controlcode
 // CHECK:           scf.forall (%{{.*}}, %{{.*}}) in (1, 2)
@@ -293,10 +293,10 @@ func.func @forall_cores() {
   scf.forall (%arg2, %arg3) in (1, 2) {
     %tile_0_0 = amdaie.tile(%c0, %c0)
     %tile_0_1 = amdaie.tile(%c0, %c1)
-    %core_0_0 = amdaie.core(%tile_0_0) {
+    %core_0_0 = amdaie.core(%tile_0_0, in : [], out : []) {
       amdaie.end
     }
-    %core_0_1 = amdaie.core(%tile_0_1) {
+    %core_0_1 = amdaie.core(%tile_0_1, in : [], out : []) {
       amdaie.end
     }
   }
@@ -351,11 +351,11 @@ func.func @forall_dmas(%arg0: memref<1x1x8x16xi32>, %arg1: memref<8x16xi32, 1>)
 // CHECK-SAME:    %[[FROMMEMREF0]][] [] []
 // CHECK-SAME:    %[[FROMMEMREF1]][0, 0, 0, 0] [1, 1, 8, 16] [128, 16, 16, 1]
 // CHECK-SAME:    (!amdaie.logicalobjectfifo<memref<1x1x8x16xi32>>, !amdaie.logicalobjectfifo<memref<8x16xi32, 1>>)
-// CHECK-DAG:     %{{.+}} = amdaie.core(%[[TILE_0]])
-// CHECK:           amdaie.logicalobjectfifo.consume(%[[DMA]])
+// CHECK-DAG:     %{{.+}} = amdaie.core(%[[TILE_0]], in : [], out : [])
+// CHECK:           amdaie.logicalobjectfifo.access(%[[FROMMEMREF0]], Read)
 // CHECK:           scf.for %{{.*}} = %[[C0]] to %[[C8]] step %[[C1]]
-// CHECK-DAG:     %{{.+}} = amdaie.core(%[[TILE_1]])
-// CHECK:           amdaie.logicalobjectfifo.consume(%[[DMA]])
+// CHECK-DAG:     %{{.+}} = amdaie.core(%[[TILE_1]], in : [], out : [])
+// CHECK:           amdaie.logicalobjectfifo.access(%[[FROMMEMREF0]], Read)
 // CHECK:           scf.for %{{.*}} = %[[C0]] to %[[C8]] step %[[C1]]
 // CHECK:         amdaie.controlcode
 // CHECK-DAG:       %[[C0_1:.+]] = arith.constant 0 : index
@@ -375,19 +375,19 @@ func.func @merge_cores(%arg0: memref<1x1x8x16xi32>, %arg1: memref<8x16xi32, 1>)
   %0 = amdaie.logicalobjectfifo.from_memref %arg0, {} : memref<1x1x8x16xi32> -> !amdaie.logicalobjectfifo<memref<1x1x8x16xi32>>
   %1 = amdaie.logicalobjectfifo.from_memref %arg1, {} : memref<8x16xi32, 1> -> !amdaie.logicalobjectfifo<memref<8x16xi32, 1>>
   %2 = amdaie.dma_cpy_nd(%0[] [] [], %1[0, 0, 0, 0] [1, 1, 8, 16] [128, 16, 16, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x16xi32>>, !amdaie.logicalobjectfifo<memref<8x16xi32, 1>>)
-  %core_0_0_0 = amdaie.core(%tile_0_0) {
-    amdaie.logicalobjectfifo.consume(%2)
+  %core_0_0_0 = amdaie.core(%tile_0_0, in : [], out : []) {
+    amdaie.logicalobjectfifo.access(%0, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x16xi32>> -> memref<1x1x8x16xi32>
     amdaie.end
   }
-  %core_0_1_0 = amdaie.core(%tile_0_1) {
-    amdaie.logicalobjectfifo.consume(%2)
+  %core_0_1_0 = amdaie.core(%tile_0_1, in : [], out : []) {
+    amdaie.logicalobjectfifo.access(%0, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x16xi32>> -> memref<1x1x8x16xi32>
     amdaie.end
   }
   scf.for %arg2 = %c0 to %c8 step %c1  {
-    %core_0_0_1 = amdaie.core(%tile_0_0) {
+    %core_0_0_1 = amdaie.core(%tile_0_0, in : [], out : []) {
       amdaie.end
     }
-    %core_0_1_1 = amdaie.core(%tile_0_1) {
+    %core_0_1_1 = amdaie.core(%tile_0_1, in : [], out : []) {
       amdaie.end
     }
   }
@@ -427,15 +427,15 @@ func.func @merge_cores(%arg0: memref<1x1x8x16xi32>, %arg1: memref<8x16xi32, 1>)
 // CHECK-SAME:    %[[FROMMEMREF4]][] [] []
 // CHECK-SAME:    %[[FROMMEMREF5]][0, 0, 0, 0] [1, 1, 32, 16] [128, 16, 8, 1]
 // CHECK-SAME:    (!amdaie.logicalobjectfifo<memref<1x1x32x16xi32>>, !amdaie.logicalobjectfifo<memref<32x16xi32, 1>>)
-// CHECK-DAG:     %{{.+}} = amdaie.core(%[[TILE_0]])
-// CHECK:           amdaie.logicalobjectfifo.consume(%[[DMA0]])
+// CHECK-DAG:     %{{.+}} = amdaie.core(%[[TILE_0]], in : [], out : [])
+// CHECK:           amdaie.logicalobjectfifo.access(%[[FROMMEMREF0]], Read)
 // CHECK:           scf.for %{{.*}} = %[[C0]] to %[[C8]] step %[[C1]]
-// CHECK:             amdaie.logicalobjectfifo.consume(%[[DMA1]])
+// CHECK:             amdaie.logicalobjectfifo.access(%[[FROMMEMREF2]], Read)
 // CHECK:             linalg.fill
-// CHECK-DAG:     %{{.+}} = amdaie.core(%[[TILE_1]])
-// CHECK:           amdaie.logicalobjectfifo.consume(%[[DMA0]])
+// CHECK-DAG:     %{{.+}} = amdaie.core(%[[TILE_1]], in : [], out : [])
+// CHECK:           amdaie.logicalobjectfifo.access(%[[FROMMEMREF0]], Read)
 // CHECK:           scf.for %{{.*}} = %[[C0]] to %[[C8]] step %[[C1]]
-// CHECK:             amdaie.logicalobjectfifo.consume(%[[DMA2]])
+// CHECK:             amdaie.logicalobjectfifo.access(%[[FROMMEMREF4]], Read)
 // CHECK:             linalg.fill
 // CHECK:         amdaie.controlcode
 // CHECK-DAG:       %[[C0_1:.+]] = arith.constant 0 : index
@@ -468,24 +468,24 @@ func.func @complex_example(%arg0: memref<1x1x8x16xi32>, %arg1: memref<8x16xi32,
   %4 = amdaie.logicalobjectfifo.from_memref %arg4, {} : memref<1x1x32x16xi32> -> !amdaie.logicalobjectfifo<memref<1x1x32x16xi32>>
   %5 = amdaie.logicalobjectfifo.from_memref %arg5, {} : memref<32x16xi32, 1> -> !amdaie.logicalobjectfifo<memref<32x16xi32, 1>>
   %dma_0 = amdaie.dma_cpy_nd(%0[] [] [], %1[0, 0, 0, 0] [1, 1, 8, 16] [128, 16, 16, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x16xi32>>, !amdaie.logicalobjectfifo<memref<8x16xi32, 1>>)
-  %core_0_0_0 = amdaie.core(%tile_0_0) {
-    amdaie.logicalobjectfifo.consume(%dma_0)
+  %core_0_0_0 = amdaie.core(%tile_0_0, in : [], out : []) {
+    amdaie.logicalobjectfifo.access(%0, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x16xi32>> -> memref<1x1x8x16xi32>
     amdaie.end
   }
-  %core_0_1_0 = amdaie.core(%tile_0_1) {
-    amdaie.logicalobjectfifo.consume(%dma_0)
+  %core_0_1_0 = amdaie.core(%tile_0_1, in : [], out : []) {
+    amdaie.logicalobjectfifo.access(%0, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x16xi32>> -> memref<1x1x8x16xi32>
     amdaie.end
   }
   scf.for %iv0 = %c0 to %c8 step %c1  {
     %dma_1 = amdaie.dma_cpy_nd(%2[] [] [], %3[0, 0, 0, 0] [1, 1, 16, 16] [128, 16, 8, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x16x16xi32>>, !amdaie.logicalobjectfifo<memref<16x16xi32, 1>>)
     %dma_2 = amdaie.dma_cpy_nd(%4[] [] [], %5[0, 0, 0, 0] [1, 1, 32, 16] [128, 16, 8, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x32x16xi32>>, !amdaie.logicalobjectfifo<memref<32x16xi32, 1>>)
-    %core_0_0_1 = amdaie.core(%tile_0_0) {
-      amdaie.logicalobjectfifo.consume(%dma_1)
+    %core_0_0_1 = amdaie.core(%tile_0_0, in : [], out : []) {
+      amdaie.logicalobjectfifo.access(%2, Read) : !amdaie.logicalobjectfifo<memref<1x1x16x16xi32>> -> memref<1x1x16x16xi32>
       linalg.fill ins(%c0_i32 : i32) outs(%arg2 : memref<1x1x16x16xi32>)
       amdaie.end
     }
-    %core_0_1_1 = amdaie.core(%tile_0_1) {
-      amdaie.logicalobjectfifo.consume(%dma_2)
+    %core_0_1_1 = amdaie.core(%tile_0_1, in : [], out : []) {
+      amdaie.logicalobjectfifo.access(%4, Read) : !amdaie.logicalobjectfifo<memref<1x1x32x16xi32>> -> memref<1x1x32x16xi32>
       linalg.fill ins(%c0_i32 : i32) outs(%arg4 : memref<1x1x32x16xi32>)
       amdaie.end
     }
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/distribute_cores_and_objectfifos.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/distribute_cores_and_objectfifos.mlir
index db150b117..3e1a28e8c 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/distribute_cores_and_objectfifos.mlir
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/distribute_cores_and_objectfifos.mlir
@@ -11,20 +11,20 @@
 // CHECK-DAG:   %[[C3:.*]] = arith.constant 3 : index
 // CHECK:       scf.forall (%[[ARG0:.*]], %[[ARG1:.*]]) in (1, 1) {
 // CHECK:         %[[TILE_0:.*]] = amdaie.tile(%[[C0]], %[[C2]])
-// CHECK:         %{{.*}} = amdaie.core(%[[TILE_0]])
+// CHECK:         %{{.*}} = amdaie.core(%[[TILE_0]], in : [], out : [])
 // CHECK:         %[[TILE_1:.*]] = amdaie.tile(%[[C1]], %[[C2]])
-// CHECK:         %{{.*}} = amdaie.core(%[[TILE_1]])
+// CHECK:         %{{.*}} = amdaie.core(%[[TILE_1]], in : [], out : [])
 // CHECK:         %[[TILE_2:.*]] = amdaie.tile(%[[C2]], %[[C2]])
-// CHECK:         %{{.*}} = amdaie.core(%[[TILE_2]])
+// CHECK:         %{{.*}} = amdaie.core(%[[TILE_2]], in : [], out : [])
 // CHECK:         %[[TILE_3:.*]] = amdaie.tile(%[[C3]], %[[C2]])
-// CHECK:         %{{.*}} = amdaie.core(%[[TILE_3]])
+// CHECK:         %{{.*}} = amdaie.core(%[[TILE_3]], in : [], out : [])
 module {
   func.func @distribute_cores_and_objectfifos_1x4() {
     %c2 = arith.constant 2 : index
     scf.forall (%arg0, %arg1) in (1, 1) {
       scf.forall (%arg2, %arg3) in (1, 4) {
         %tile = amdaie.tile(%arg3, %c2)
-        %21 = amdaie.core(%tile) {
+        %21 = amdaie.core(%tile, in : [], out : []) {
           amdaie.end
         }
       } {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
@@ -43,19 +43,19 @@ module {
 // CHECK-DAG:   %[[C1:.*]] = arith.constant 1 : index
 // CHECK:       scf.forall
 // CHECK-DAG:     %[[TILE_0_0:.*]] = amdaie.tile(%[[C0]], %[[C0]])
-// CHECK-DAG:     %[[CORE_0_0:.*]] = amdaie.core(%[[TILE_0_0]])
+// CHECK-DAG:     %[[CORE_0_0:.*]] = amdaie.core(%[[TILE_0_0]], in : [], out : [])
 // CHECK-DAG:     %[[TILE_0_1:.*]] = amdaie.tile(%[[C0]], %[[C1]])
-// CHECK-DAG:     %[[CORE_0_1:.*]] = amdaie.core(%[[TILE_0_1]])
+// CHECK-DAG:     %[[CORE_0_1:.*]] = amdaie.core(%[[TILE_0_1]], in : [], out : [])
 // CHECK-DAG:     %[[TILE_1_0:.*]] = amdaie.tile(%[[C1]], %[[C0]])
-// CHECK-DAG:     %[[CORE_1_0:.*]] = amdaie.core(%[[TILE_1_0]])
+// CHECK-DAG:     %[[CORE_1_0:.*]] = amdaie.core(%[[TILE_1_0]], in : [], out : [])
 // CHECK-DAG:     %[[TILE_1_1:.*]] = amdaie.tile(%[[C1]], %[[C1]])
-// CHECK-DAG:     %[[CORE_1_1:.*]] = amdaie.core(%[[TILE_1_1]])
+// CHECK-DAG:     %[[CORE_1_1:.*]] = amdaie.core(%[[TILE_1_1]], in : [], out : [])
 module {
   func.func @distribute_cores_and_objectfifos_2x2() {
     scf.forall (%arg0, %arg1) in (1, 1) {
       scf.forall (%arg2, %arg3) in (2, 2) {
         %tile = amdaie.tile(%arg3, %arg2)
-        %0 = amdaie.core(%tile) {
+        %0 = amdaie.core(%tile, in : [], out : []) {
           amdaie.end
         }
       } {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
@@ -85,14 +85,12 @@ module {
 // CHECK-DAG:     %[[FROM_MEMREF_2:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_2]], {%[[TILE_0_2]]}
 // CHECK-DAG:     %[[DMA_0:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_2]]
 // CHECK-SAME:    %[[FROM_MEMREF_0]]
-// CHECK-DAG:     %[[CORE_0:.*]] = amdaie.core(%[[TILE_0_2]])
-// CHECK:           amdaie.logicalobjectfifo.consume(%[[DMA_0]])
+// CHECK-DAG:     %[[CORE_0:.*]] = amdaie.core(%[[TILE_0_2]], in : [%[[DMA_0]]], out : [])
 // CHECK:           %[[VAL_0:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_2]], Read)
 // CHECK:           linalg.fill ins(%{{.+}} : i32) outs(%[[VAL_0]] : memref<32x64xi32, 2>)
 // CHECK-DAG:     %[[DMA_1:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_1]]
 // CHECK-SAME:    %[[FROM_MEMREF_0]]
-// CHECK-DAG:     %[[CORE_1:.*]] = amdaie.core(%[[TILE_1_2]])
-// CHECK:           amdaie.logicalobjectfifo.consume(%[[DMA_1]])
+// CHECK-DAG:     %[[CORE_1:.*]] = amdaie.core(%[[TILE_1_2]], in : [%[[DMA_1]]], out : [])
 // CHECK:           %[[VAL_1:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_1]], Read)
 // CHECK:           linalg.fill ins(%{{.+}} : i32) outs(%[[VAL_1]] : memref<32x64xi32, 2>)
 module {
@@ -107,8 +105,7 @@ module {
       scf.forall (%arg2, %arg3) in (1, 2) {
         %2 = amdaie.dma_cpy_nd(%1[] [] [], %0[%arg3, %arg3] [%arg3, %arg3] [%arg3, %arg3]) : (!amdaie.logicalobjectfifo<memref<32x64xi32, 2>>, !amdaie.logicalobjectfifo<memref<32x1024xi32, 1>>)
         %tile = amdaie.tile(%arg3, %c2)
-        %3 = amdaie.core(%tile) {
-          amdaie.logicalobjectfifo.consume(%2)
+        %3 = amdaie.core(%tile, in : [%2], out : []) {
           linalg.fill ins(%c0_i32 : i32) outs(%alloc_1 : memref<32x64xi32, 2>)
           amdaie.end
         }
@@ -140,12 +137,10 @@ module {
 // CHECK-DAG:     %[[FROM_MEMREF_1:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_1]], {%[[TILE_0_2]], %[[TILE_1_2]]}
 // CHECK-DAG:     %[[DMA_0:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_1]]
 // CHECK-SAME:    %[[FROM_MEMREF_0]]
-// CHECK-DAG:     %[[CORE_0:.*]] = amdaie.core(%[[TILE_0_2]])
-// CHECK:           amdaie.logicalobjectfifo.consume(%[[DMA_0]])
+// CHECK-DAG:     %[[CORE_0:.*]] = amdaie.core(%[[TILE_0_2]], in : [%[[DMA_0]]], out : [])
 // CHECK:           %[[VAL_0:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_1]], Read)
 // CHECK:           linalg.fill ins(%{{.+}} : i32) outs(%[[VAL_0]] : memref<32x64xi32, 2>)
-// CHECK-DAG:     %[[CORE_1:.*]] = amdaie.core(%[[TILE_1_2]])
-// CHECK:           amdaie.logicalobjectfifo.consume(%[[DMA_0]])
+// CHECK-DAG:     %[[CORE_1:.*]] = amdaie.core(%[[TILE_1_2]], in : [%[[DMA_0]]], out : [])
 // CHECK:           %[[VAL_0:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_1]], Read)
 // CHECK:           linalg.fill ins(%{{.+}} : i32) outs(%[[VAL_0]] : memref<32x64xi32, 2>)
 module {
@@ -160,8 +155,7 @@ module {
       scf.forall (%arg2, %arg3) in (1, 2) {
         %2 = amdaie.dma_cpy_nd(%1[] [] [], %0[] [] []) : (!amdaie.logicalobjectfifo<memref<32x64xi32, 2>>, !amdaie.logicalobjectfifo<memref<32x1024xi32, 1>>)
         %tile = amdaie.tile(%arg3, %c2)
-        %3 = amdaie.core(%tile) {
-          amdaie.logicalobjectfifo.consume(%2)
+        %3 = amdaie.core(%tile, in : [%2], out : []) {
           linalg.fill ins(%c0_i32 : i32) outs(%alloc_1 : memref<32x64xi32, 2>)
           amdaie.end
         }
@@ -198,9 +192,9 @@ module {
 // CHECK-DAG:     %[[FROM_MEMREF_1:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_1]], {%[[TILE_0_2]], %[[TILE_0_3]]}
 // CHECK-DAG:     %[[DMA_0:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_1]]
 // CHECK-NOT:     amdaie.dma_cpy_nd
-// CHECK-DAG:     amdaie.core(%[[TILE_0_2]])
+// CHECK-DAG:     amdaie.core(%[[TILE_0_2]], in : [%[[DMA_0]]], out : [])
 // CHECK:           %[[VAL_0:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_1]], Read)
-// CHECK-DAG:     amdaie.core(%[[TILE_0_3]])
+// CHECK-DAG:     amdaie.core(%[[TILE_0_3]], in : [%[[DMA_0]]], out : [])
 // CHECK:           %[[VAL_0:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_1]], Read)
 #map = affine_map<(d0) -> (d0 * 32)>
 module {
@@ -217,8 +211,7 @@ module {
         %2 = amdaie.dma_cpy_nd(%1[] [] [], %0[%apply] [%c2] [%c2]) : (!amdaie.logicalobjectfifo<memref<32x64xi32, 2>>, !amdaie.logicalobjectfifo<memref<32x1024xi32, 1>>)
         %add = arith.addi %arg2, %c2 : index
         %tile = amdaie.tile(%arg3, %add)
-        %3 = amdaie.core(%tile) {
-          amdaie.logicalobjectfifo.consume(%2)
+        %3 = amdaie.core(%tile, in : [%2], out : []) {
           linalg.fill ins(%c0_i32 : i32) outs(%alloc_1 : memref<32x64xi32, 2>)
           amdaie.end
         }
@@ -252,12 +245,12 @@ module {
 // CHECK-DAG:     %[[FROM_MEMREF_0:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_0]], {%[[TILE_0_1]]}
 // CHECK-DAG:     %[[FROM_MEMREF_1:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_1]], {%[[TILE_0_3]]}
 // CHECK-DAG:     %[[FROM_MEMREF_2:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_1]], {%[[TILE_0_2]]}
-// CHECK-DAG:     %[[DMA_0:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_1]]
-// CHECK-DAG:     amdaie.core(%[[TILE_0_2]])
-// CHECK-DAG:       %[[VAL_0:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_1]], Read)
-// CHECK-DAG:     %[[DMA_1:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_2]]
-// CHECK-DAG:     amdaie.core(%[[TILE_0_3]])
+// CHECK-DAG:     %[[DMA_0:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_2]]
+// CHECK-DAG:     amdaie.core(%[[TILE_0_2]], in : [%[[DMA_0]]], out : [])
 // CHECK-DAG:       %[[VAL_0:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_2]], Read)
+// CHECK-DAG:     %[[DMA_1:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_1]]
+// CHECK-DAG:     amdaie.core(%[[TILE_0_3]], in : [%[[DMA_1]]], out : [])
+// CHECK-DAG:       %[[VAL_0:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_1]], Read)
 #map = affine_map<(d0) -> (d0 * 32)>
 module {
   func.func @unroll_dma_and_affine_single_loop() {
@@ -273,8 +266,7 @@ module {
         %2 = amdaie.dma_cpy_nd(%1[] [] [], %0[%apply] [%c2] [%c2]) : (!amdaie.logicalobjectfifo<memref<32x64xi32, 2>>, !amdaie.logicalobjectfifo<memref<32x1024xi32, 1>>)
         %add = arith.addi %arg2, %c2 : index
         %tile = amdaie.tile(%arg3, %add)
-        %3 = amdaie.core(%tile) {
-          amdaie.logicalobjectfifo.consume(%2)
+        %3 = amdaie.core(%tile, in : [%2], out : []) {
           linalg.fill ins(%c0_i32 : i32) outs(%alloc_1 : memref<32x64xi32, 2>)
           amdaie.end
         }
@@ -309,18 +301,14 @@ module {
 // CHECK-DAG:     %[[FROM_MEMREF_1:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_1]], {%[[TILE_0_2]], %[[TILE_0_3]], %[[TILE_1_2]], %[[TILE_1_3]]}
 // CHECK-DAG:     %[[DMA_0:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_1]]
 // CHECK-SAME:    %[[FROM_MEMREF_0]]
-// CHECK-DAG:     %[[CORE_0_2:.*]] = amdaie.core(%[[TILE_0_2]])
+// CHECK-DAG:     %[[CORE_0_2:.*]] = amdaie.core(%[[TILE_0_2]], in : [%[[DMA_0]]], out : [])
 // CHECK-DAG:       %[[VAL_0:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_1]], Read)
-// CHECK-DAG:       amdaie.logicalobjectfifo.consume(%[[DMA_0]])
-// CHECK-DAG:     %[[CORE_0_3:.*]] = amdaie.core(%[[TILE_0_3]])
+// CHECK-DAG:     %[[CORE_0_3:.*]] = amdaie.core(%[[TILE_0_3]], in : [%[[DMA_0]]], out : [])
 // CHECK-DAG:       %[[VAL_0:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_1]], Read)
-// CHECK-DAG:       amdaie.logicalobjectfifo.consume(%[[DMA_0]])
-// CHECK-DAG:     %[[CORE_1_2:.*]] = amdaie.core(%[[TILE_1_2]])
+// CHECK-DAG:     %[[CORE_1_2:.*]] = amdaie.core(%[[TILE_1_2]], in : [%[[DMA_0]]], out : [])
 // CHECK-DAG:       %[[VAL_0:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_1]], Read)
-// CHECK-DAG:       amdaie.logicalobjectfifo.consume(%[[DMA_0]])
-// CHECK-DAG:     %[[CORE_1_3:.*]] = amdaie.core(%[[TILE_1_3]])
+// CHECK-DAG:     %[[CORE_1_3:.*]] = amdaie.core(%[[TILE_1_3]], in : [%[[DMA_0]]], out : [])
 // CHECK-DAG:       %[[VAL_0:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_1]], Read)
-// CHECK-DAG:       amdaie.logicalobjectfifo.consume(%[[DMA_0]])
 module {
   func.func @hoist_dma_multi_loop() {
     %c0_i32 = arith.constant 0 : i32
@@ -334,8 +322,7 @@ module {
         %2 = amdaie.dma_cpy_nd(%1[] [] [], %0[] [] []) : (!amdaie.logicalobjectfifo<memref<32x64xi32, 2>>, !amdaie.logicalobjectfifo<memref<32x1024xi32, 1>>)
         %add = arith.addi %arg2, %c2 : index
         %tile = amdaie.tile(%arg3, %add)
-        %3 = amdaie.core(%tile) {
-          amdaie.logicalobjectfifo.consume(%2)
+        %3 = amdaie.core(%tile, in : [%2], out : []) {
           linalg.fill ins(%c0_i32 : i32) outs(%alloc_1 : memref<32x64xi32, 2>)
           amdaie.end
         }
@@ -374,14 +361,14 @@ module {
 // CHECK-SAME:    %[[FROM_MEMREF_0]]
 // CHECK-DAG:     %[[DMA_1:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_1]]
 // CHECK-SAME:    %[[FROM_MEMREF_0]]
-// CHECK-DAG:     %[[CORE_0_2:.*]] = amdaie.core(%[[TILE_0_2]])
-// CHECK-DAG:       amdaie.logicalobjectfifo.consume(%[[DMA_0]])
-// CHECK-DAG:     %[[CORE_0_3:.*]] = amdaie.core(%[[TILE_0_3]])
-// CHECK-DAG:       amdaie.logicalobjectfifo.consume(%[[DMA_1]])
-// CHECK-DAG:     %[[CORE_1_2:.*]] = amdaie.core(%[[TILE_1_2]])
-// CHECK-DAG:       amdaie.logicalobjectfifo.consume(%[[DMA_0]])
-// CHECK-DAG:     %[[CORE_1_3:.*]] = amdaie.core(%[[TILE_1_3]])
-// CHECK-DAG:       amdaie.logicalobjectfifo.consume(%[[DMA_1]])
+// CHECK-DAG:     %[[CORE_0_2:.*]] = amdaie.core(%[[TILE_0_2]], in : [%[[DMA_0]]], out : [])
+// CHECK-DAG:       amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_2]], Read)
+// CHECK-DAG:     %[[CORE_0_3:.*]] = amdaie.core(%[[TILE_0_3]], in : [%[[DMA_0]]], out : [])
+// CHECK-DAG:       amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_2]], Read)
+// CHECK-DAG:     %[[CORE_1_2:.*]] = amdaie.core(%[[TILE_1_2]], in : [%[[DMA_1]]], out : [])
+// CHECK-DAG:       amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_1]], Read)
+// CHECK-DAG:     %[[CORE_1_3:.*]] = amdaie.core(%[[TILE_1_3]], in : [%[[DMA_1]]], out : [])
+// CHECK-DAG:       amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_1]], Read)
 module {
   func.func @hoist_dma_one_of_multi_loop() {
     %c0_i32 = arith.constant 0 : i32
@@ -395,8 +382,7 @@ module {
         %2 = amdaie.dma_cpy_nd(%1[] [] [], %0[%arg3] [%arg3] [%arg3]) : (!amdaie.logicalobjectfifo<memref<32x64xi32, 2>>, !amdaie.logicalobjectfifo<memref<32x1024xi32, 1>>)
         %add = arith.addi %arg2, %c2 : index
         %tile = amdaie.tile(%arg3, %add)
-        %3 = amdaie.core(%tile) {
-          amdaie.logicalobjectfifo.consume(%2)
+        %3 = amdaie.core(%tile, in : [%2], out : []) {
           linalg.fill ins(%c0_i32 : i32) outs(%alloc_1 : memref<32x64xi32, 2>)
           amdaie.end
         }
@@ -445,21 +431,17 @@ module {
 // CHECK-SAME:    %[[FROM_MEMREF_0]]
 // CHECK-DAG:     %[[DMA_3:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_3]]
 // CHECK-SAME:    %[[FROM_MEMREF_1]]
-// CHECK-DAG:     %[[CORE_0_2:.*]] = amdaie.core(%[[TILE_0_2]])
+// CHECK-DAG:     %[[CORE_0_2:.*]] = amdaie.core(%[[TILE_0_2]], in : [%[[DMA_1]]], out : [])
 // CHECK-DAG:       %[[VAL_0:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_4]], Read)
-// CHECK-DAG:       amdaie.logicalobjectfifo.consume(%[[DMA_1]])
 // CHECK-DAG:       linalg.fill ins(%{{.+}} : i32) outs(%[[VAL_0]] : memref<32x64xi32, 2>)
-// CHECK-DAG:     %[[CORE_1_2:.*]] = amdaie.core(%[[TILE_1_2]])
+// CHECK-DAG:     %[[CORE_1_2:.*]] = amdaie.core(%[[TILE_1_2]], in : [%[[DMA_3]]], out : [])
 // CHECK-DAG:       %[[VAL_0:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_3]], Read)
-// CHECK-DAG:       amdaie.logicalobjectfifo.consume(%[[DMA_3]])
 // CHECK-DAG:       linalg.fill ins(%{{.+}} : i32) outs(%[[VAL_0]] : memref<32x64xi32, 2>)
-// CHECK-DAG:     %[[CORE_0_3:.*]] = amdaie.core(%[[TILE_0_3]])
+// CHECK-DAG:     %[[CORE_0_3:.*]] = amdaie.core(%[[TILE_0_3]], in : [%[[DMA_1]]], out : [])
 // CHECK-DAG:       %[[VAL_0:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_4]], Read)
-// CHECK-DAG:       amdaie.logicalobjectfifo.consume(%[[DMA_1]])
 // CHECK-DAG:       linalg.fill ins(%{{.+}} : i32) outs(%[[VAL_0]] : memref<32x64xi32, 2>)
-// CHECK-DAG:     %[[CORE_1_3:.*]] = amdaie.core(%[[TILE_1_3]])
+// CHECK-DAG:     %[[CORE_1_3:.*]] = amdaie.core(%[[TILE_1_3]], in : [%[[DMA_3]]], out : [])
 // CHECK-DAG:       %[[VAL_0:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_3]], Read)
-// CHECK-DAG:       amdaie.logicalobjectfifo.consume(%[[DMA_3]])
 // CHECK-DAG:       linalg.fill ins(%{{.+}} : i32) outs(%[[VAL_0]] : memref<32x64xi32, 2>)
 module {
   func.func @hoist_dma_dependencies() {
@@ -477,8 +459,7 @@ module {
         %4 = amdaie.dma_cpy_nd(%2[] [] [], %1[] [] []) : (!amdaie.logicalobjectfifo<memref<32x64xi32, 2>>, !amdaie.logicalobjectfifo<memref<32x64xi32, 1>>)
         %add = arith.addi %arg2, %c2 : index
         %tile = amdaie.tile(%arg3, %add)
-        %core = amdaie.core(%tile) {
-          amdaie.logicalobjectfifo.consume(%4)
+        %core = amdaie.core(%tile, in : [%4], out : []) {
           linalg.fill ins(%c0_i32 : i32) outs(%alloc_2 : memref<32x64xi32, 2>)
           amdaie.end
         }
@@ -526,35 +507,31 @@ module {
 // CHECK-DAG:     %[[DMA_0:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_1]][] [] [], %[[FROM_MEMREF_0]][%[[ARG1]]]
 // CHECK-DAG:     %[[DMA_1:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_3]][] [] [], %[[FROM_MEMREF_1]]
 // CHECK-DAG:     %[[DMA_2:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_8]][%c0, %c0] [%c1, %c1] [%c1, %c1], %[[FROM_MEMREF_4]]
-// CHECK:         %[[CORE_0_2:.*]] = amdaie.core(%[[TILE_0_2]])
-// CHECK:           amdaie.logicalobjectfifo.consume(%[[DMA_1]])
-// CHECK:           %[[VAL_1:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_3]], Read)
-// CHECK:           linalg.fill ins(%{{.+}} : i32) outs(%[[VAL_1]] : memref<32x64xi32, 2>)
-// CHECK:           %[[VAL_0:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_4]], Write)
-// CHECK:           linalg.fill ins(%{{.+}} : i32) outs(%[[VAL_0]] : memref<32x32xi32, 2>)
-// CHECK:         %[[DMA_3:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_8]][%c0, %c1] [%c1, %c1] [%c1, %c1], %[[FROM_MEMREF_5]]
-// CHECK:         %[[CORE_1_2:.*]] = amdaie.core(%[[TILE_1_2]])
-// CHECK:           amdaie.logicalobjectfifo.consume(%[[DMA_1]])
-// CHECK:           %[[VAL_1:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_3]], Read)
-// CHECK:           linalg.fill ins(%{{.+}} : i32) outs(%[[VAL_1]] : memref<32x64xi32, 2>)
-// CHECK:           %[[VAL_0:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_5]], Write)
-// CHECK:           linalg.fill ins(%{{.+}} : i32) outs(%[[VAL_0]] : memref<32x32xi32, 2>)
+// CHECK-DAG:     %[[CORE_0_2:.*]] = amdaie.core(%[[TILE_0_2]], in : [%[[DMA_1]]], out : [%[[DMA_2]]])
+// CHECK-DAG:       %[[VAL_0:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_4]], Write)
+// CHECK-DAG:       %[[VAL_1:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_3]], Read)
+// CHECK-DAG:       linalg.fill ins(%{{.+}} : i32) outs(%[[VAL_1]] : memref<32x64xi32, 2>)
+// CHECK-DAG:       linalg.fill ins(%{{.+}} : i32) outs(%[[VAL_0]] : memref<32x32xi32, 2>)
+// CHECK-DAG:     %[[DMA_3:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_8]][%c0, %c1] [%c1, %c1] [%c1, %c1], %[[FROM_MEMREF_5]]
+// CHECK-DAG:     %[[CORE_1_2:.*]] = amdaie.core(%[[TILE_1_2]], in : [%[[DMA_1]]], out : [%[[DMA_3]]])
+// CHECK-DAG:       %[[VAL_0:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_5]], Write)
+// CHECK-DAG:       %[[VAL_1:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_3]], Read)
+// CHECK-DAG:       linalg.fill ins(%{{.+}} : i32) outs(%[[VAL_1]] : memref<32x64xi32, 2>)
+// CHECK-DAG:       linalg.fill ins(%{{.+}} : i32) outs(%[[VAL_0]] : memref<32x32xi32, 2>)
 // CHECK-DAG:     %[[DMA_4:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_2]][] [] [], %[[FROM_MEMREF_1]]
 // CHECK-DAG:     %[[DMA_5:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_8]][%c1, %c0] [%c1, %c1] [%c1, %c1], %[[FROM_MEMREF_6]]
-// CHECK:         %[[CORE_0_3:.*]] = amdaie.core(%[[TILE_0_3]])
-// CHECK:           amdaie.logicalobjectfifo.consume(%[[DMA_4]])
-// CHECK:           %[[VAL_1:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_2]], Read)
-// CHECK:           linalg.fill ins(%{{.+}} : i32) outs(%[[VAL_1]] : memref<32x64xi32, 2>)
-// CHECK:           %[[VAL_0:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_6]], Write)
-// CHECK:           linalg.fill ins(%{{.+}} : i32) outs(%[[VAL_0]] : memref<32x32xi32, 2>)
-// CHECK:         %[[DMA_6:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_8]][%c1, %c1] [%c1, %c1] [%c1, %c1], %[[FROM_MEMREF_7]]
-// CHECK:         %[[CORE_1_3:.*]] = amdaie.core(%[[TILE_1_3]])
-// CHECK:           amdaie.logicalobjectfifo.consume(%[[DMA_4]])
-// CHECK:           %[[VAL_1:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_2]], Read)
-// CHECK:           linalg.fill ins(%{{.+}} : i32) outs(%[[VAL_1]] : memref<32x64xi32, 2>)
-// CHECK:           %[[VAL_0:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_7]], Write)
-// CHECK:           linalg.fill ins(%{{.+}} : i32) outs(%[[VAL_0]] : memref<32x32xi32, 2>)
-// CHECK:         %[[DMA_7:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_9]][%[[ARG1]]] [%c1] [%c1], %[[FROM_MEMREF_8]]
+// CHECK-DAG:     %[[CORE_0_3:.*]] = amdaie.core(%[[TILE_0_3]], in : [%[[DMA_4]]], out : [%[[DMA_5]]])
+// CHECK-DAG:       %[[VAL_0:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_6]], Write)
+// CHECK-DAG:       %[[VAL_1:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_2]], Read)
+// CHECK-DAG:       linalg.fill ins(%{{.+}} : i32) outs(%[[VAL_1]] : memref<32x64xi32, 2>)
+// CHECK-DAG:       linalg.fill ins(%{{.+}} : i32) outs(%[[VAL_0]] : memref<32x32xi32, 2>)
+// CHECK-DAG:     %[[DMA_6:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_8]][%c1, %c1] [%c1, %c1] [%c1, %c1], %[[FROM_MEMREF_7]]
+// CHECK-DAG:     %[[CORE_1_3:.*]] = amdaie.core(%[[TILE_1_3]], in : [%[[DMA_4]]], out : [%[[DMA_6]]])
+// CHECK-DAG:       %[[VAL_0:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_7]], Write)
+// CHECK-DAG:       %[[VAL_1:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_2]], Read)
+// CHECK-DAG:       linalg.fill ins(%{{.+}} : i32) outs(%[[VAL_1]] : memref<32x64xi32, 2>)
+// CHECK-DAG:       linalg.fill ins(%{{.+}} : i32) outs(%[[VAL_0]] : memref<32x32xi32, 2>)
+// CHECK-DAG:     %[[DMA_7:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_9]][%[[ARG1]]] [%c1] [%c1], %[[FROM_MEMREF_8]]
 module {
   func.func @nested_dma_dependencies() {
     %c0_i32 = arith.constant 0 : i32
@@ -579,11 +556,9 @@ module {
         %8 = amdaie.dma_cpy_nd(%4[%arg2, %arg3] [%c1, %c1] [%c1, %c1], %3[] [] []) : (!amdaie.logicalobjectfifo<memref<2x2x32x32xi32, 1>>, !amdaie.logicalobjectfifo<memref<32x32xi32, 2>>)
         %add = arith.addi %arg2, %c2 : index
         %tile = amdaie.tile(%arg3, %add)
-        %core = amdaie.core(%tile) {
-          amdaie.logicalobjectfifo.consume(%7)
+        %core = amdaie.core(%tile, in : [%7], out : [%8]) {
           linalg.fill ins(%c0_i32 : i32) outs(%alloc_2 : memref<32x64xi32, 2>)
           linalg.fill ins(%c0_i32 : i32) outs(%alloc_3 : memref<32x32xi32, 2>)
-          amdaie.logicalobjectfifo.produce(%8)
           amdaie.end
         }
       } {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
@@ -625,25 +600,21 @@ module {
 // CHECK-DAG:     %[[FROM_MEMREF_4:.+]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_1]], {%[[TILE_0_1]]}
 // CHECK-DAG:     %[[FROM_MEMREF_5:.+]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_2]], {%[[TILE_0_0]]}
 // CHECK-DAG:     %[[DMA_0:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_4]][%c0, %c0] [%c1, %c1] [%c1, %c1], %[[FROM_MEMREF_0]][%c0, %c0] [%c32, %c32] [%c32, %c1]
-// CHECK-DAG:     %[[CORE_0_2:.*]] = amdaie.core(%[[TILE_0_2]])
+// CHECK-DAG:     %[[CORE_0_2:.*]] = amdaie.core(%[[TILE_0_2]], in : [], out : [%[[DMA_0]]])
 // CHECK-DAG:       %[[VAL_0:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_0]], Write)
 // CHECK-DAG:       linalg.fill ins(%{{.+}} : i32) outs(%[[VAL_0]] : memref<1x1x32x32xi32, 2>)
-// CHECK-DAG:       amdaie.logicalobjectfifo.produce(%[[DMA_0]])
 // CHECK-DAG:     %[[DMA_1:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_4]][%c0, %c1] [%c1, %c1] [%c1, %c1], %[[FROM_MEMREF_1]][%c0, %c0] [%c32, %c32] [%c32, %c1]
-// CHECK-DAG:     %[[CORE_1_2:.*]] = amdaie.core(%[[TILE_1_2]])
+// CHECK-DAG:     %[[CORE_1_2:.*]] = amdaie.core(%[[TILE_1_2]], in : [], out : [%[[DMA_1]]])
 // CHECK-DAG:       %[[VAL_0:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_1]], Write)
 // CHECK-DAG:       linalg.fill ins(%{{.+}} : i32) outs(%[[VAL_0]] : memref<1x1x32x32xi32, 2>)
-// CHECK-DAG:       amdaie.logicalobjectfifo.produce(%[[DMA_1]])
 // CHECK-DAG:     %[[DMA_2:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_4]][%c1, %c0] [%c1, %c1] [%c1, %c1], %[[FROM_MEMREF_2]][%c0, %c0] [%c32, %c32] [%c32, %c1]
-// CHECK-DAG:     %[[CORE_0_3:.*]] = amdaie.core(%[[TILE_0_3]])
+// CHECK-DAG:     %[[CORE_0_3:.*]] = amdaie.core(%[[TILE_0_3]], in : [], out : [%[[DMA_2]]])
 // CHECK-DAG:       %[[VAL_0:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_2]], Write)
 // CHECK-DAG:       linalg.fill ins(%{{.+}} : i32) outs(%[[VAL_0]] : memref<1x1x32x32xi32, 2>)
-// CHECK-DAG:       amdaie.logicalobjectfifo.produce(%[[DMA_2]])
 // CHECK-DAG:     %[[DMA_3:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_4]][%c1, %c1] [%c1, %c1] [%c1, %c1], %[[FROM_MEMREF_3]][%c0, %c0] [%c32, %c32] [%c32, %c1]
-// CHECK-DAG:     %[[CORE_1_3:.*]] = amdaie.core(%[[TILE_1_3]])
+// CHECK-DAG:     %[[CORE_1_3:.*]] = amdaie.core(%[[TILE_1_3]], in : [], out : [%[[DMA_3]]])
 // CHECK-DAG:       %[[VAL_0:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_3]], Write)
 // CHECK-DAG:       linalg.fill ins(%{{.+}} : i32) outs(%[[VAL_0]] : memref<1x1x32x32xi32, 2>)
-// CHECK-DAG:       amdaie.logicalobjectfifo.produce(%[[DMA_3]])
 module {
   func.func @local_subview_output() {
     %c0_i32 = arith.constant 0 : i32
@@ -661,9 +632,8 @@ module {
         %8 = amdaie.dma_cpy_nd(%1[%arg2, %arg3] [%c1, %c1] [%c1, %c1], %0[%arg2, %arg3, 0, 0] [1, 1, 32, 32] [1, 1, 32, 1]) : (!amdaie.logicalobjectfifo<memref<2x2x32x32xi32, 1>>, !amdaie.logicalobjectfifo<memref<2x2x32x32xi32, 2>>)
         %add = arith.addi %arg2, %c2 : index
         %tile = amdaie.tile(%arg3, %add)
-        %core = amdaie.core(%tile) {
+        %core = amdaie.core(%tile, in : [], out : [%8]) {
           linalg.fill ins(%c0_i32 : i32) outs(%subview : memref<1x1x32x32xi32, strided<[2048, 1024, 32, 1], offset: ?>, 2>)
-          amdaie.logicalobjectfifo.produce(%8)
           amdaie.end
         }
       } {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
@@ -702,7 +672,7 @@ func.func @l1_temporary_buffer_for_matmul_elem() {
         %subview = memref.subview %alloc_6[%arg2, %arg3, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : memref<2x2x8x8x4x4xi32, 2 : i32> to memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
         %26 = arith.addi %arg2, %c2 : index
         %tile = amdaie.tile(%arg3, %26)
-        %27 = amdaie.core(%tile) {
+        %27 = amdaie.core(%tile, in : [], out : []) {
             linalg.fill ins(%c0_i32 : i32) outs(%subview : memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>)
             amdaie.end
         }
@@ -757,13 +727,10 @@ func.func @l1_temporary_buffer_for_matmul_elem() {
 // CHECK-SAME:        %[[FROM_MEMREF_6]]
 // CHECK-DAG:         %[[DMA_5:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_10]]
 // CHECK-SAME:        %[[FROM_MEMREF_4]]
-// CHECK-DAG:         %[[CORE_0:.*]] = amdaie.core(%[[TILE_0_2]])
+// CHECK-DAG:         %[[CORE_0:.*]] = amdaie.core(%[[TILE_0_2]], in : [%[[DMA_1]], %[[DMA_3]]], out : [%[[DMA_4]]])
 // CHECK-DAG:           %[[VAL_0:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_7]], Read)
 // CHECK-DAG:           %[[VAL_1:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_9]], Read)
 // CHECK-DAG:           %[[VAL_2:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_6]], Write)
-// CHECK-DAG:           amdaie.logicalobjectfifo.consume(%[[DMA_1]])
-// CHECK-DAG:           amdaie.logicalobjectfifo.consume(%[[DMA_3]])
-// CHECK-DAG:           amdaie.logicalobjectfifo.produce(%[[DMA_4]])
 // CHECK-DAG:         %[[DMA_6:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_1]]
 // CHECK-SAME:        %[[FROM_MEMREF_12]]
 // CHECK-DAG:         %[[DMA_7:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_8]]
@@ -772,13 +739,10 @@ func.func @l1_temporary_buffer_for_matmul_elem() {
 // CHECK-SAME:        %[[FROM_MEMREF_5]]
 // CHECK-DAG:         %[[DMA_9:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_10]]
 // CHECK-SAME:        %[[FROM_MEMREF_3]]
-// CHECK-DAG:         %[[CORE_1:.*]] = amdaie.core(%[[TILE_1_2]])
+// CHECK-DAG:         %[[CORE_1:.*]] = amdaie.core(%[[TILE_1_2]], in : [%[[DMA_1]], %[[DMA_7]]], out : [%[[DMA_8]]])
 // CHECK-DAG:           %[[VAL_0:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_7]], Read)
 // CHECK-DAG:           %[[VAL_1:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_8]], Read)
 // CHECK-DAG:           %[[VAL_2:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_5]], Write)
-// CHECK-DAG:           amdaie.logicalobjectfifo.consume(%[[DMA_1]])
-// CHECK-DAG:           amdaie.logicalobjectfifo.consume(%[[DMA_7]])
-// CHECK-DAG:           amdaie.logicalobjectfifo.produce(%[[DMA_8]])
 #map = affine_map<(d0) -> (d0 * 32)>
 #map1 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d2, d0, d3, d5)>
 #map2 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d2, d5, d4)>
@@ -829,16 +793,13 @@ module {
         %19 = amdaie.dma_cpy_nd(%9[%12, %13] [%c32, %c32] [%c64, %c1], %5[] [] []) : (!amdaie.logicalobjectfifo<memref<32x64xi32>>, !amdaie.logicalobjectfifo<memref<32x32xi32, 1>>)
         %20 = arith.addi %arg2, %c2 : index
         %tile = amdaie.tile(%arg3, %20)
-        %21 = amdaie.core(%tile) {
-          amdaie.logicalobjectfifo.consume(%16)
-          amdaie.logicalobjectfifo.consume(%17)
+        %21 = amdaie.core(%tile, in : [%16, %17], out : [%18]) {
           linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%alloc_0, %alloc : memref<8x8x4x8xi32, 2>, memref<4x8x8x8xi32, 2>) outs(%alloc_1 : memref<4x8x4x8xi32, 2>) {
           ^bb0(%in: i32, %in_5: i32, %out: i32):
             %22 = arith.muli %in, %in_5 : i32
             %23 = arith.addi %out, %22 : i32
             linalg.yield %23 : i32
           }
-          amdaie.logicalobjectfifo.produce(%18)
           amdaie.end
         }
       } {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
@@ -880,10 +841,7 @@ module {
 // CHECK-SAME:        %[[FROM_MEMREF_0]]
 // CHECK-DAG:         %[[DMA_2:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_2]]
 // CHECK-SAME:        %[[FROM_MEMREF_5]]
-// CHECK-DAG:         %[[CORE_0:.*]] = amdaie.core(%[[TILE_0_2]])
-// CHECK-DAG:           amdaie.logicalobjectfifo.consume(%[[DMA_0]])
-// CHECK-DAG:           amdaie.logicalobjectfifo.consume(%[[DMA_1]])
-// CHECK-DAG:           amdaie.logicalobjectfifo.produce(%[[DMA_2]])
+// CHECK-DAG:         %[[CORE_0:.*]] = amdaie.core(%[[TILE_0_2]], in : [%[[DMA_0]], %[[DMA_1]]], out : [%[[DMA_2]]])
 // CHECK-DAG:           %[[VAL_0:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_3]], Read)
 // CHECK-DAG:           %[[VAL_1:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_4]], Read)
 // CHECK-DAG:           %[[VAL_2:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_5]], Write)
@@ -941,9 +899,7 @@ module {
         %21 = amdaie.dma_cpy_nd(%2[%arg2, %arg3, %c0, %c0] [%c1, %c1, %c64, %c64] [%c8192, %c4096, %c64, %c1], %17[%arg2, %arg3, %c0, %c0, %c0, %c0] [%c1, %c1, %c16, %c4, %c16, %c4] [%c8192, %c4096, %c16, %c4, %c256, %c1]) : (!amdaie.logicalobjectfifo<memref<2x2x64x64xf32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<2x2x16x16x4x4xf32, 2 : i32>>)
         %22 = arith.addi %arg2, %c2 : index
         %tile = amdaie.tile(%arg3, %22)
-        %23 = amdaie.core(%tile) {
-          amdaie.logicalobjectfifo.consume(%19)
-          amdaie.logicalobjectfifo.consume(%20)
+        %23 = amdaie.core(%tile, in : [%19, %20], out : [%21]) {
           scf.for %arg4 = %c0 to %c16 step %c1 {
             scf.for %arg5 = %c0 to %c16 step %c1 {
               scf.for %arg6 = %c0 to %c8 step %c1 {
@@ -957,7 +913,6 @@ module {
               }
             }
           }
-          amdaie.logicalobjectfifo.produce(%21)
           amdaie.end
         }
       } {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
@@ -995,12 +950,10 @@ module {
 // CHECK-DAG:         %[[DMA_1:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_2]]
 // CHECK-SAME:        %[[FROM_MEMREF_0]]
 // CHECK-DAG:         %[[FROM_MEMREF_4:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_5]], {%[[TILE_0_2]]}
-// CHECK-DAG:         %[[CORE_0:.*]] = amdaie.core(%[[TILE_0_2]])
+// CHECK-DAG:         %[[CORE_0:.*]] = amdaie.core(%[[TILE_0_2]], in : [%[[DMA_0]], %[[DMA_1]]], out : [])
 // CHECK-DAG:           %[[VAL_0:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_2]], Read)
 // CHECK-DAG:           %[[VAL_1:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_3]], Read)
 // CHECK-DAG:           %[[VAL_2:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_4]], None)
-// CHECK-DAG:           amdaie.logicalobjectfifo.consume(%[[DMA_0]])
-// CHECK-DAG:           amdaie.logicalobjectfifo.consume(%[[DMA_1]])
 // CHECK-DAG:           linalg.fill
 // CHECK-DAG:           memref.extract_strided_metadata %[[VAL_1]]
 // CHECK-DAG:           memref.extract_strided_metadata %[[VAL_0]]
@@ -1043,9 +996,7 @@ module {
         %subview = memref.subview %alloc_3[%arg2, %arg3, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : memref<2x2x8x8x4x4xi32, 2 : i32> to memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
         %21 = arith.addi %arg2, %c2 : index
         %tile = amdaie.tile(%arg3, %21)
-        %22 = amdaie.core(%tile) {
-          amdaie.logicalobjectfifo.consume(%19)
-          amdaie.logicalobjectfifo.consume(%20)
+        %22 = amdaie.core(%tile, in : [%19, %20], out : []) {
           linalg.fill ins(%c0_i32 : i32) outs(%subview : memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>)
           %base_buffer, %offset, %sizes:6, %strides:6 = memref.extract_strided_metadata %alloc_0 : memref<1x1x4x8x4x8xi32, 2 : i32> -> memref<i32, 2 : i32>, index, index, index, index, index, index, index, index, index, index, index, index, index
           %base_buffer_5, %offset_6, %sizes_7:6, %strides_8:6 = memref.extract_strided_metadata %alloc : memref<1x1x8x4x8x4xi32, 2 : i32> -> memref<i32, 2 : i32>, index, index, index, index, index, index, index, index, index, index, index, index, index
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/flatten_logical_objectfifo.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/flatten_logical_objectfifo.mlir
index ac3c01cf2..60e6330c2 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/flatten_logical_objectfifo.mlir
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/flatten_logical_objectfifo.mlir
@@ -37,7 +37,7 @@ module {
       %1 = amdaie.logicalobjectfifo.from_memref %alloc_0, {%tile} : memref<1x2x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x2x32x32xi32, 1 : i32>>
       %2 = amdaie.circular_dma_cpy_nd(%0[%c0] [%c1024] [%c1], %1[%c0, %c0, %c0] [%c8, %c32, %c4] [%c4, %c32, %c1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<1x2x32x32xi32, 1 : i32>>)
       %3 = amdaie.logicalobjectfifo.from_memref %alloc_1, {%tile_2} : memref<1x1x8x8x4x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>
-      %4 = amdaie.core(%tile_2) {
+      %4 = amdaie.core(%tile_2, in : [%2], out : []) {
         scf.forall (%arg0, %arg1) in (2, 2) {
           %5 = amdaie.logicalobjectfifo.access(%0, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xi32, 2 : i32>> -> memref<1x1x8x4x8x4xi32, 2 : i32>
           %6 = amdaie.logicalobjectfifo.access(%3, None) : !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>> -> memref<1x1x8x8x4x4xi32, 2 : i32>
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/insert_cores.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/insert_cores.mlir
index aba2deaaa..6a043cfc1 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/insert_cores.mlir
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/insert_cores.mlir
@@ -22,10 +22,8 @@ func.func @insert_cores_with_non_normalized_forall() {
 // CHECK:           %[[C2:.*]] = arith.constant 2 : index
 // CHECK:           %[[ADD:.*]] = arith.addi %[[ARG2]], %[[C2]] : index
 // CHECK:           %[[TILE0:.*]] = amdaie.tile(%[[ARG3]], %[[ADD]])
-// CHECK:           %[[CORE0:.*]] = amdaie.core(%[[TILE0]]) {
+// CHECK:           %[[CORE0:.*]] = amdaie.core(%[[TILE0]], in : [%[[DMA_CPY2]], %[[DMA_CPY3]]], out : []) {
 // CHECK:             linalg.fill
-// CHECK:             amdaie.logicalobjectfifo.consume(%[[DMA_CPY2]])
-// CHECK:             amdaie.logicalobjectfifo.consume(%[[DMA_CPY3]])
 // CHECK:             linalg.generic
 // CHECK:             amdaie.end
 // CHECK:           }
@@ -40,12 +38,9 @@ func.func @insert_cores_with_non_normalized_forall() {
 // CHECK:           %[[C2:.*]] = arith.constant 2 : index
 // CHECK:           %[[ADD:.*]] = arith.addi %[[ARG2]], %[[C2]] : index
 // CHECK:           %[[TILE1:.*]] = amdaie.tile(%[[ARG3]], %[[ADD]])
-// CHECK:           %[[CORE1:.*]] = amdaie.core(%[[TILE1]]) {
+// CHECK:           %[[CORE1:.*]] = amdaie.core(%[[TILE1]], in : [%[[DMA_CPY2]], %[[DMA_CPY3]]], out : [%[[DMA_CPY4]]]) {
 // CHECK:             linalg.fill ins(%c0_i32 : i32) outs(%alloc_1 : memref<4x8x4x8xi32, 2>)
-// CHECK:             amdaie.logicalobjectfifo.consume(%[[DMA_CPY2]])
-// CHECK:             amdaie.logicalobjectfifo.consume(%[[DMA_CPY3]])
 // CHECK:             linalg.generic
-// CHECK:             amdaie.logicalobjectfifo.produce(%[[DMA_CPY4]])
 // CHECK:             amdaie.end
 // CHECK:           }
 // CHECK:         } {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
@@ -142,9 +137,7 @@ module {
 // CHECK:           %[[C2:.*]] = arith.constant 2 : index
 // CHECK:           %[[ADD:.*]] = arith.addi %[[ARG2]], %[[C2]] : index
 // CHECK:           %[[TILE0:.*]] = amdaie.tile(%[[ARG3]], %[[ADD]])
-// CHECK:           amdaie.core(%[[TILE0]]) {
-// CHECK:             amdaie.logicalobjectfifo.consume(%[[DMA_CPY0]])
-// CHECK:             amdaie.logicalobjectfifo.consume(%[[DMA_CPY1]])
+// CHECK:           amdaie.core(%[[TILE0]], in : [%[[DMA_CPY0]], %[[DMA_CPY1]]], out : []) {
 // CHECK:             linalg.fill
 // CHECK:             scf.for
 // CHECK:               scf.for
@@ -226,9 +219,7 @@ module {
 // CHECK:           %[[C2:.*]] = arith.constant 2 : index
 // CHECK:           %[[ADD:.*]] = arith.addi %[[ARG2]], %[[C2]] : index
 // CHECK:           %[[TILE0:.*]] = amdaie.tile(%[[ARG3]], %[[ADD]])
-// CHECK:           amdaie.core(%[[TILE0]]) {
-// CHECK:             amdaie.logicalobjectfifo.consume(%[[DMA_CPY0]])
-// CHECK:             amdaie.logicalobjectfifo.consume(%[[DMA_CPY1]])
+// CHECK:           amdaie.core(%[[TILE0]], in : [%[[DMA_CPY0]], %[[DMA_CPY1]]], out : []) {
 // CHECK:             linalg.fill
 // CHECK:             memref.extract_strided_metadata
 // CHECK:             memref.extract_strided_metadata
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/lower_to_aie.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/lower_to_aie.mlir
index 4c0555334..139d0b5bf 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/lower_to_aie.mlir
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/lower_to_aie.mlir
@@ -192,7 +192,7 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb}
       %obj2 = amdaie.logicalobjectfifo.from_memref %alloc_2, {%tile_0_2} : memref<4x8x4x8xi32, 2> -> !amdaie.logicalobjectfifo<memref<4x8x4x8xi32, 2>>
       %dma0 = amdaie.circular_dma_cpy_nd(%obj1[] [] [], %obj2[] [] []) : (!amdaie.logicalobjectfifo<memref<32x32xi32, 1>>, !amdaie.logicalobjectfifo<memref<4x8x4x8xi32, 2>>)
       %dma1 = amdaie.circular_dma_cpy_nd(%obj0[] [] [], %obj1[] [] []) : (!amdaie.logicalobjectfifo<memref<32x64xi32>>, !amdaie.logicalobjectfifo<memref<32x32xi32, 1>>)
-      %core_0_0 = amdaie.core(%tile_0_2) {
+      %core_0_0 = amdaie.core(%tile_0_2, in : [], out : [%dma0]) {
         %0 = amdaie.logicalobjectfifo.acquire(%dma0, Produce) {size = 1 : i32} -> !amdaie.logicalobjectfifo<memref<32x32xi32, 1>>
         %1 = amdaie.logicalobjectfifo.access(%0, Write) : !amdaie.logicalobjectfifo<memref<32x32xi32, 1>> -> memref<32x32xi32, 1>
         linalg.fill ins(%c0_i32 : i32) outs(%1 : memref<32x32xi32, 1>)
@@ -258,7 +258,7 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb}
       %obj4 = amdaie.logicalobjectfifo.from_memref %alloc_4, {%tile_0_2} : memref<4x8x4x8xf32, 1> -> !amdaie.logicalobjectfifo<memref<4x8x4x8xf32, 1>>
       %dma0 = amdaie.circular_dma_cpy_nd(%obj1[] [] [], %obj2[] [] []) : (!amdaie.logicalobjectfifo<memref<32x32xi32, 2>>, !amdaie.logicalobjectfifo<memref<4x8x4x8xi32, 1>>)
       %dma1 = amdaie.circular_dma_cpy_nd(%obj3[] [] [], %obj4[] [] []) : (!amdaie.logicalobjectfifo<memref<32x32xf32, 2>>, !amdaie.logicalobjectfifo<memref<4x8x4x8xf32, 1>>)
-      %core_0_0 = amdaie.core(%tile_0_2) {
+      %core_0_0 = amdaie.core(%tile_0_2, in : [], out : [%dma0, %dma1]) {
         %0 = amdaie.logicalobjectfifo.acquire(%dma0, Produce) {size = 1 : i32} -> !amdaie.logicalobjectfifo<memref<32x32xi32, 2>>
         %1 = amdaie.logicalobjectfifo.access(%0, Write) : !amdaie.logicalobjectfifo<memref<32x32xi32, 2>> -> memref<32x32xi32, 2>
         %2 = amdaie.logicalobjectfifo.acquire(%dma1, Produce) {size = 1 : i32} -> !amdaie.logicalobjectfifo<memref<32x32xf32, 2>>
@@ -323,11 +323,11 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb}
       %obj2 = amdaie.logicalobjectfifo.from_memref %alloc_2, {%tile_0_2, %tile_1_2} : memref<4x8x4x8xi32, 2> -> !amdaie.logicalobjectfifo<memref<4x8x4x8xi32, 2>>
       %dma0 = amdaie.circular_dma_cpy_nd(%obj1[] [] [], %obj0[] [] []) : (!amdaie.logicalobjectfifo<memref<32x32xi32, 1>>, !amdaie.logicalobjectfifo<memref<32x64xi32>>)
       %dma1 = amdaie.circular_dma_cpy_nd(%obj2[] [] [], %obj1[] [] []) : (!amdaie.logicalobjectfifo<memref<4x8x4x8xi32, 2>>, !amdaie.logicalobjectfifo<memref<32x32xi32, 1>>)
-      %core_0_2 = amdaie.core(%tile_0_2) {
+      %core_0_2 = amdaie.core(%tile_0_2, in : [%dma1], out : []) {
         %0 = amdaie.logicalobjectfifo.acquire(%dma1, Consume) {size = 1 : i32} -> !amdaie.logicalobjectfifo<memref<32x32xi32, 1>>
         amdaie.end
       }
-      %core_1_2 = amdaie.core(%tile_1_2) {
+      %core_1_2 = amdaie.core(%tile_1_2, in : [%dma1], out : []) {
         %0 = amdaie.logicalobjectfifo.acquire(%dma1, Consume) {size = 1 : i32} -> !amdaie.logicalobjectfifo<memref<32x32xi32, 1>>
         amdaie.end
       }
@@ -372,7 +372,7 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb}
       %obj2 = amdaie.logicalobjectfifo.from_memref %alloc_2, {%tile_0_2} : memref<4x8x4x8xi32, 2> -> !amdaie.logicalobjectfifo<memref<4x8x4x8xi32, 2>>
       %dma0 = amdaie.circular_dma_cpy_nd(%obj1[] [] [], %obj2[] [] []) : (!amdaie.logicalobjectfifo<memref<32x32xi32, 1>>, !amdaie.logicalobjectfifo<memref<4x8x4x8xi32, 2>>)
       %dma1 = amdaie.circular_dma_cpy_nd(%obj0[] [] [], %obj1[] [] []) : (!amdaie.logicalobjectfifo<memref<32x64xi32>>, !amdaie.logicalobjectfifo<memref<32x32xi32, 1>>)
-      %core_0_0 = amdaie.core(%tile_0_2) {
+      %core_0_0 = amdaie.core(%tile_0_2, in : [], out : [%dma0]) {
         amdaie.logicalobjectfifo.release(%dma0, Produce) {size = 1 : i32}
         amdaie.end
       }
@@ -875,7 +875,7 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb}
       %dma0 = amdaie.circular_dma_cpy_nd(%obj1[] [] [], %obj0[] [] []) : (!amdaie.logicalobjectfifo<memref<32x32xi32, 1>>, !amdaie.logicalobjectfifo<memref<32x64xi32>>)
       %dma1 = amdaie.circular_dma_cpy_nd(%obj2[] [] [], %obj1[] [] []) : (!amdaie.logicalobjectfifo<memref<4x8x4x8xi32, 2>>, !amdaie.logicalobjectfifo<memref<32x32xi32, 1>>)
       amdaie.logicalobjectfifo.link[%dma0] -> [%dma1] ()
-      %core_0_2 = amdaie.core(%tile_0_2) {
+      %core_0_2 = amdaie.core(%tile_0_2, in : [%dma1], out : []) {
         %1 = amdaie.logicalobjectfifo.acquire(%dma1, Consume) {size = 1 : i32} -> !amdaie.logicalobjectfifo<memref<4x8x4x8xi32, 2>>
         %2 = amdaie.logicalobjectfifo.access(%1, Read) : !amdaie.logicalobjectfifo<memref<4x8x4x8xi32, 2>> -> memref<4x8x4x8xi32, 2>
         scf.for %arg2 = %c0 to %c8 step %c1  {
@@ -884,7 +884,7 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb}
         amdaie.logicalobjectfifo.release(%dma1, Consume) {size = 1 : i32}
         amdaie.end
       }
-      %core_1_2 = amdaie.core(%tile_1_2) {
+      %core_1_2 = amdaie.core(%tile_1_2, in : [%dma1], out : []) {
         %1 = amdaie.logicalobjectfifo.acquire(%dma1, Consume) {size = 1 : i32} -> !amdaie.logicalobjectfifo<memref<4x8x4x8xi32, 2>>
         %2 = amdaie.logicalobjectfifo.access(%1, Read) : !amdaie.logicalobjectfifo<memref<4x8x4x8xi32, 2>> -> memref<4x8x4x8xi32, 2>
         scf.for %arg2 = %c0 to %c8 step %c1  {
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/none_access_to_temporary_buffer.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/none_access_to_temporary_buffer.mlir
index 2780da1b6..502f51f54 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/none_access_to_temporary_buffer.mlir
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/none_access_to_temporary_buffer.mlir
@@ -11,7 +11,7 @@ func.func @none_access_to_buffer(%arg0: !amdaie.logicalobjectfifo<memref<1x1x8x1
   %c0_i32 = arith.constant 0 : i32
   %c0 = arith.constant 0 : index
   %tile = amdaie.tile(%c0, %c0)
-  %core = amdaie.core(%tile) {
+  %core = amdaie.core(%tile, in : [], out : []) {
     %3 = amdaie.logicalobjectfifo.access(%arg0, None) : !amdaie.logicalobjectfifo<memref<1x1x8x16xi32, 2>> -> memref<1x1x8x16xi32, 2>
     linalg.fill ins(%c0_i32 : i32) outs(%3 : memref<1x1x8x16xi32, 2>)
     amdaie.end
@@ -43,20 +43,18 @@ func.func @single_none_access_multiple_users(%arg0: !amdaie.logicalobjectfifo<me
   %0 = amdaie.circular_dma_cpy_nd(%arg0[] [] [], %arg1[] [] []) : (!amdaie.logicalobjectfifo<memref<1x1x8x16xi32, 2>>, !amdaie.logicalobjectfifo<memref<8x16xi32, 1>>)
   %1 = amdaie.circular_dma_cpy_nd(%arg3[] [] [], %arg2[] [] []) : (!amdaie.logicalobjectfifo<memref<8x16xi32, 1>>, !amdaie.logicalobjectfifo<memref<1x1x8x16xi32, 2>>)
   %2 = amdaie.logicalobjectfifo.from_memref %arg4, {%tile} : memref<1x1x8x16xi32, 2> -> !amdaie.logicalobjectfifo<memref<1x1x8x16xi32, 2>>
-  %3 = amdaie.core(%tile) {
+  %3 = amdaie.core(%tile, in : [%0], out : [%1]) {
     %4 = amdaie.logicalobjectfifo.acquire(%0, Consume) {size = 1 : i32} -> !amdaie.logicalobjectfifo<memref<1x1x8x16xi32, 2>>
     %5 = amdaie.logicalobjectfifo.access(%4, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x16xi32, 2>> -> memref<1x1x8x16xi32, 2>
     %6 = amdaie.logicalobjectfifo.access(%2, None) : !amdaie.logicalobjectfifo<memref<1x1x8x16xi32, 2>> -> memref<1x1x8x16xi32, 2>
     %7 = amdaie.logicalobjectfifo.acquire(%1, Produce) {size = 1 : i32} -> !amdaie.logicalobjectfifo<memref<1x1x8x16xi32, 2>>
     %8 = amdaie.logicalobjectfifo.access(%7, Write) : !amdaie.logicalobjectfifo<memref<1x1x8x16xi32, 2>> -> memref<1x1x8x16xi32, 2>
-    amdaie.logicalobjectfifo.consume(%0)
     linalg.fill ins(%c0_i32 : i32) outs(%5 : memref<1x1x8x16xi32, 2>)
     linalg.fill ins(%c0_i32 : i32) outs(%6 : memref<1x1x8x16xi32, 2>)
     linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%6 : memref<1x1x8x16xi32, 2>) outs(%8 : memref<1x1x8x16xi32, 2>) {
     ^bb0(%in: i32, %out: i32):
       linalg.yield %in : i32
     }
-    amdaie.logicalobjectfifo.produce(%1)
     amdaie.logicalobjectfifo.release(%0, Consume) {size = 1 : i32}
     amdaie.logicalobjectfifo.release(%1, Produce) {size = 1 : i32}
     amdaie.end
@@ -97,31 +95,27 @@ func.func @multiple_none_access_multiple_users(%arg0: !amdaie.logicalobjectfifo<
   %0 = amdaie.circular_dma_cpy_nd(%arg0[] [] [], %arg1[] [] []) : (!amdaie.logicalobjectfifo<memref<1x1x8x16xi32, 2>>, !amdaie.logicalobjectfifo<memref<8x16xi32, 1>>)
   %1 = amdaie.circular_dma_cpy_nd(%arg3[] [] [], %arg2[] [] []) : (!amdaie.logicalobjectfifo<memref<8x16xi32, 1>>, !amdaie.logicalobjectfifo<memref<1x1x8x16xi32, 2>>)
   %2 = amdaie.logicalobjectfifo.from_memref %arg4, {%tile} : memref<1x1x8x16xi32, 2> -> !amdaie.logicalobjectfifo<memref<1x1x8x16xi32, 2>>
-  %3 = amdaie.core(%tile) {
+  %3 = amdaie.core(%tile, in : [%0], out : [%1]) {
     %4 = amdaie.logicalobjectfifo.acquire(%0, Consume) {size = 1 : i32} -> !amdaie.logicalobjectfifo<memref<1x1x8x16xi32, 2>>
     %5 = amdaie.logicalobjectfifo.access(%4, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x16xi32, 2>> -> memref<1x1x8x16xi32, 2>
     %6 = amdaie.logicalobjectfifo.access(%2, None) : !amdaie.logicalobjectfifo<memref<1x1x8x16xi32, 2>> -> memref<1x1x8x16xi32, 2>
-    amdaie.logicalobjectfifo.consume(%0)
     linalg.fill ins(%c0_i32 : i32) outs(%5 : memref<1x1x8x16xi32, 2>)
     linalg.fill ins(%c0_i32 : i32) outs(%6 : memref<1x1x8x16xi32, 2>)
     scf.for %arg5 = %c0 to %c8 step %c1 {
       amdaie.logicalobjectfifo.release(%0, Consume) {size = 1 : i32}
       %10 = amdaie.logicalobjectfifo.acquire(%0, Consume) {size = 1 : i32} -> !amdaie.logicalobjectfifo<memref<1x1x8x16xi32, 2>>
       %11 = amdaie.logicalobjectfifo.access(%10, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x16xi32, 2>> -> memref<1x1x8x16xi32, 2>
-      amdaie.logicalobjectfifo.consume(%0)
       linalg.fill ins(%c0_i32 : i32) outs(%11 : memref<1x1x8x16xi32, 2>)
     }
     amdaie.logicalobjectfifo.release(%0, Consume) {size = 1 : i32}
     %7 = amdaie.logicalobjectfifo.access(%2, None) : !amdaie.logicalobjectfifo<memref<1x1x8x16xi32, 2>> -> memref<1x1x8x16xi32, 2>
     %8 = amdaie.logicalobjectfifo.acquire(%1, Produce) {size = 1 : i32} -> !amdaie.logicalobjectfifo<memref<1x1x8x16xi32, 2>>
     %9 = amdaie.logicalobjectfifo.access(%8, Write) : !amdaie.logicalobjectfifo<memref<1x1x8x16xi32, 2>> -> memref<1x1x8x16xi32, 2>
-    amdaie.logicalobjectfifo.consume(%0)
     linalg.fill ins(%c0_i32 : i32) outs(%7 : memref<1x1x8x16xi32, 2>)
     linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%7 : memref<1x1x8x16xi32, 2>) outs(%9 : memref<1x1x8x16xi32, 2>) {
     ^bb0(%in: i32, %out: i32):
       linalg.yield %in : i32
     }
-    amdaie.logicalobjectfifo.produce(%1)
     amdaie.logicalobjectfifo.release(%1, Produce) {size = 1 : i32}
     amdaie.end
   }