diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.cpp index 1c6e4866e..309e610fb 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.cpp @@ -52,16 +52,27 @@ LogicalResult ControlCodeOp::verify() { // AMDAIE_CoreOp //===----------------------------------------------------------------------===// + +void CoreOp::build(OpBuilder &b, OperationState &result, AMDAIE::TileOp tileOp, + ValueRange inputDmas, ValueRange outputDmas) { + build(b, result, b.getIndexType(), tileOp, inputDmas, outputDmas, nullptr); +} + /// Hardcoded row_offset == 2 -> AIE core rows start from 2 /// TODO(jornt): avoid hardcoding here. Add a device model/identifier to loop up /// core offset. This will be handled in a follow-up. void CoreOp::build(OpBuilder &b, OperationState &result, Value coreCol, - Value coreRow) { + Value coreRow, ValueRange inputDmas, ValueRange outputDmas) { auto rowOffset = b.create(b.getUnknownLoc(), 2); auto row = b.createOrFold(b.getUnknownLoc(), rowOffset, coreRow); auto tileOp = b.create(b.getUnknownLoc(), coreCol, row); - build(b, result, b.getIndexType(), tileOp, nullptr); + build(b, result, tileOp, inputDmas, outputDmas, nullptr); +} + +void CoreOp::build(OpBuilder &b, OperationState &result, Value coreCol, + Value coreRow) { + build(b, result, coreCol, coreRow, {}, {}); } LogicalResult CoreOp::verify() { diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.td b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.td index 2d5a729b1..5c897859a 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.td +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.td @@ -47,7 +47,7 @@ def AMDAIE_ControlCodeOp : AMDAIE_Op<"controlcode", [HasParent<"WorkgroupOp">, let hasVerifier = 1; } -def AMDAIE_CoreOp: AMDAIE_Op<"core", [SingleBlock]>, Results<(outs Index)> { +def AMDAIE_CoreOp: AMDAIE_Op<"core", [SingleBlock, AttrSizedOperandSegments]>, Results<(outs Index)> { let summary = "The AIE core operator"; let description = [{ This operation represents an AIE core op, containing a sequence of operations @@ -62,15 +62,20 @@ def AMDAIE_CoreOp: AMDAIE_Op<"core", [SingleBlock]>, Results<(outs Index)> { let arguments = ( ins Index:$tile, - OptionalAttr:$link_with + Variadic:$input_dmas, + Variadic:$output_dmas, + OptionalAttr:$link_with ); let regions = (region SizedRegion<1>:$region); - let assemblyFormat = [{ `(` $tile `)` regions attr-dict }]; + let assemblyFormat = [{ `(` $tile `,` `in` `:` `[` $input_dmas `]` `,` `out` `:` `[` $output_dmas `]` `)` regions attr-dict }]; let builders = [ OpBuilder<(ins "mlir::Value":$coreCol, "mlir::Value":$coreRow)>, + OpBuilder<(ins "mlir::Value":$coreCol, "mlir::Value":$coreRow, + "ValueRange":$input_dmas, "ValueRange":$output_dmas)>, + OpBuilder<(ins "TileOp":$tile, "ValueRange":$input_dmas, "ValueRange":$output_dmas)> ]; let extraClassDeclaration = [{ @@ -438,7 +443,7 @@ def AMDAIE_LogicalObjectFifoAccessOp : AMDAIE_Op<"logicalobjectfifo.access"> { %alloc = memref.alloc() : memref<8x16xi32, 2> %0 = amdaie.logicalobjectfifo.from_memref %alloc, {%tile} : memref<8x16xi32, 2> -> !amdaie.logicalobjectfifo> - %core = amdaie.core(%tile) { + %core = amdaie.core(%tile, in : [], out : []) { %1 = amdaie.logicalobjectfifo.access(%0, Read) : !amdaie.logicalobjectfifo> -> memref<8x16xi32, 2> ``` @@ -511,47 +516,6 @@ def AMDAIE_LogicalObjectFifoAcquire: ]; } -def AMDAIE_LogicalObjectFifoConsume: AMDAIE_Op<"logicalobjectfifo.consume", []> { - let summary = "Consume a DMA logical objectFifo result."; - let description = [{ - Consumes the result of a DMA operation. This is a blocking operation, - waiting for the DMA to produce data. Typically, this operation will reside - inside a `CoreOp` to synchronize with external DMA operations producing data - into the respective core's local memory. - - Example: - ```mlir - %2 = amdaie.dma_cpy_nd( - %1[%c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c16] [%c128, %c128, %c16, %c1], - %0[%c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c16] [%c128, %c16, %c16, %c1]) - : (!amdaie.logicalobjectfifo>, - !amdaie.logicalobjectfifo>) - %3 = amdaie.logicalobjectfifo.consume(%2) - ``` - }]; - - let arguments = ( - ins Index:$dma - ); - - let assemblyFormat = [{ - `(` $dma `)` attr-dict - }]; - - let extraClassDeclaration = [{ - DmaCpyNdOp getDmaCpyNdOp() { - return dyn_cast(getDma().getDefiningOp()); - } - Value getLogicalObjectfifo() { - return dyn_cast(getDma().getDefiningOp()).getTarget(); - } - // Return the port of this operation. - LogicalObjectFifoPort getPort() { - return LogicalObjectFifoPort::Consume; - } - }]; -} - def AMDAIE_LogicalObjectFifoFromMemrefOp : AMDAIE_Op<"logicalobjectfifo.from_memref", [Pure]> { let summary = "Create a logical objectFifo from a memref"; @@ -654,48 +618,6 @@ def AMDAIE_LogicalObjectFifoLink }]; } -def AMDAIE_LogicalObjectFifoProduce: AMDAIE_Op<"logicalobjectfifo.produce", []> { - let summary = "Produce a DMA logicalobjectfifo input."; - let description = [{ - Produces the input of a DMA operation. This is a release-type operation, - where the DMA will be waiting for the data to be produced. Typically, this - operation will reside inside a `CoreOp` to synchronize with external DMA - operations waiting for data from the respective core's local memory to be - released. - - Example: - ```mlir - %2 = amdaie.dma_cpy_nd( - %1[%c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c16] [%c128, %c128, %c16, %c1], - %0[%c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c16] [%c128, %c16, %c16, %c1]) - : (!amdaie.logicalobjectfifo>, - !amdaie.logicalobjectfifo>) - %3 = amdaie.logicalobjectfifo.produce(%2) - ``` - }]; - - let arguments = ( - ins Index:$dma - ); - - let assemblyFormat = [{ - `(` $dma `)` attr-dict - }]; - - let extraClassDeclaration = [{ - DmaCpyNdOp getDmaCpyNdOp() { - return dyn_cast(getDma().getDefiningOp()); - } - Value getLogicalObjectfifo() { - return dyn_cast(getDma().getDefiningOp()).getSource(); - } - // Return the port of this operation. - LogicalObjectFifoPort getPort() { - return LogicalObjectFifoPort::Produce; - } - }]; -} - def AMDAIE_LogicalObjectFifoRelease: AMDAIE_Op<"logicalobjectfifo.release", []> { let summary = "Semaphore operation to release objects from a logical" diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/test/invalid.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/test/invalid.mlir index 195f90adc..589f103bc 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/test/invalid.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/test/invalid.mlir @@ -6,7 +6,7 @@ func.func @core_invalid_terminator() { %tile = amdaie.tile(%c0, %c0) // expected-note @+2 {{in custom textual format, the absence of terminator implies 'amdaie.end'}} // expected-error @+1 {{'amdaie.core' op expects regions to end with 'amdaie.end', found 'arith.constant'}} - %core = amdaie.core(%tile) { + %core = amdaie.core(%tile, in : [], out : []) { %c1 = arith.constant 0 : index } return diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/test/roundtrip.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/test/roundtrip.mlir index 82b20c05c..d5d85ede0 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/test/roundtrip.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/test/roundtrip.mlir @@ -16,12 +16,12 @@ func.func @bd_id() { // CHECK-LABEL: func.func @core // CHECK: %[[C0:.*]] = arith.constant 0 : index // CHECK: %[[TILE_0:.*]] = amdaie.tile(%[[C0]], %[[C0]]) -// CHECK: %[[CORE_0:.*]] = amdaie.core(%[[TILE_0]]) +// CHECK: %[[CORE_0:.*]] = amdaie.core(%[[TILE_0]], in : [], out : []) // CHECK: amdaie.end func.func @core() { %c0 = arith.constant 0 : index %tile = amdaie.tile(%c0, %c0) - %core = amdaie.core(%tile) { + %core = amdaie.core(%tile, in : [], out : []) { amdaie.end } return @@ -150,17 +150,6 @@ func.func @logicalobjectfifo_acquire(%arg0: !amdaie.logicalobjectfifo>, %arg1: !amdaie.logicalobjectfifo>) { - %0 = amdaie.dma_cpy_nd(%arg0[0, 0, 0, 0] [1, 1, 8, 16] [128, 128, 16, 1], %arg1[0, 0, 0, 0] [1, 1, 8, 16] [128, 16, 16, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - amdaie.logicalobjectfifo.consume(%0) - return -} - -// ----- - // CHECK-LABEL: func.func @logicalobjectfifo_link // CHECK: %[[DMA0:.+]] = amdaie.circular_dma_cpy_nd // CHECK: %[[DMA1:.+]] = amdaie.circular_dma_cpy_nd @@ -176,17 +165,6 @@ func.func @logicalobjectfifo_link(%arg0: !amdaie.logicalobjectfifo>, %arg1: !amdaie.logicalobjectfifo>) { - %0 = amdaie.dma_cpy_nd(%arg0[0, 0, 0, 0] [1, 1, 8, 16] [128, 128, 16, 1], %arg1[0, 0, 0, 0] [1, 1, 8, 16] [128, 16, 16, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - amdaie.logicalobjectfifo.produce(%0) - return -} - -// ----- - // CHECK-LABEL: func.func @logicalobjectfifo_release // CHECK: %[[DMA:.+]] = amdaie.dma_cpy_nd // CHECK: amdaie.logicalobjectfifo.release @@ -300,11 +278,11 @@ func.func @workgroup() { %c1 = arith.constant 1 : index amdaie.workgroup { %tile_0_0 = amdaie.tile(%c0, %c0) - %core_0 = amdaie.core(%tile_0_0) { + %core_0 = amdaie.core(%tile_0_0, in : [], out : []) { amdaie.end } %tile_0_1 = amdaie.tile(%c0, %c1) - %core_1 = amdaie.core(%tile_0_1) { + %core_1 = amdaie.core(%tile_0_1, in : [], out : []) { amdaie.end } amdaie.controlcode { diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIECreateAIEWorkgroup.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIECreateAIEWorkgroup.cpp index 0b0649eae..cc1888370 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIECreateAIEWorkgroup.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIECreateAIEWorkgroup.cpp @@ -18,16 +18,35 @@ namespace mlir::iree_compiler::AMDAIE { -/// Merge the 'source' core operations in the end of the 'dest' core operation. -void CoreContext::mergeCoreOps(AMDAIE::CoreOp source, AMDAIE::CoreOp dest) { +/// Merge the 'source' and 'dest' core operations into a new `amdaie.core` +/// operation and combine the input and output DMAs. +AMDAIE::CoreOp CoreContext::mergeCoreOps(AMDAIE::CoreOp source, + AMDAIE::CoreOp dest) { OpBuilder::InsertionGuard guard(rewriter); - Block::iterator insertIt = dest.getBody()->getTerminator()->getIterator(); - Block::iterator sourceBegin = source.getBody()->begin(); - Block::iterator sourceEnd = source.getBody()->getTerminator()->getIterator(); - dest.getBody()->getOperations().splice( - insertIt, source.getBody()->getOperations(), sourceBegin, sourceEnd); - rewriter.moveOpBefore(dest, source); - rewriter.replaceOp(source, dest); + AMDAIE::TileOp tile = dest.getTileOp(); + SmallVector sourceInputDmas = source.getInputDmas(); + SmallVector destInputDmas = dest.getInputDmas(); + llvm::SmallSetVector inputDmas(destInputDmas.begin(), + destInputDmas.end()); + inputDmas.insert(sourceInputDmas.begin(), sourceInputDmas.end()); + SmallVector sourceOutputDmas = source.getOutputDmas(); + SmallVector destOutputDmas = dest.getOutputDmas(); + llvm::SmallSetVector outputDmas(destOutputDmas.begin(), + destOutputDmas.end()); + outputDmas.insert(sourceOutputDmas.begin(), sourceOutputDmas.end()); + rewriter.setInsertionPoint(source); + auto newCoreOp = rewriter.create(rewriter.getUnknownLoc(), + tile, inputDmas.takeVector(), + outputDmas.takeVector()); + Region ®ion = newCoreOp.getRegion(); + Block *newBlock = rewriter.createBlock(®ion); + rewriter.setInsertionPointToStart(newBlock); + rewriter.eraseOp(dest.getBody()->getTerminator()); + rewriter.mergeBlocks(dest.getBody(), newBlock); + rewriter.mergeBlocks(source.getBody(), newBlock); + rewriter.eraseOp(dest); + rewriter.eraseOp(source); + return newCoreOp; } /// Clone CoreOp and add to or merge with coreContext. diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIECreateAIEWorkgroup.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIECreateAIEWorkgroup.h index 445592647..5fddc61b6 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIECreateAIEWorkgroup.h +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIECreateAIEWorkgroup.h @@ -168,7 +168,7 @@ class CoreContext { if (!existingCoreOp) { coreMap[coordinate] = coreOp; } else { - mergeCoreOps(coreOp, existingCoreOp); + coreMap[coordinate] = mergeCoreOps(coreOp, existingCoreOp); } } @@ -179,9 +179,8 @@ class CoreContext { } private: - /// Merge the 'source' core operations in the end of the 'dest' core - /// operation. - void mergeCoreOps(AMDAIE::CoreOp source, AMDAIE::CoreOp dest); + /// Merge the 'source' and 'dest' core operations into a new one. + AMDAIE::CoreOp mergeCoreOps(AMDAIE::CoreOp source, AMDAIE::CoreOp dest); /// The rewriter to be used. IRRewriterAndMapper &rewriter; diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDistributeCoresAndObjectFifos.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDistributeCoresAndObjectFifos.cpp index bfb24ecba..77370de67 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDistributeCoresAndObjectFifos.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDistributeCoresAndObjectFifos.cpp @@ -527,25 +527,26 @@ LogicalResult insertLogicalObjectFifoAccess(ModuleOp moduleOp) { DenseMap> memrefToLogicalObjectFifo; - // First walk to collect consume/produce DMA accesses and map respective - // memrefs to logical objectifos. - coreOp->walk([&](Operation *op) { - // TODO(jornt): can we avoid produce/consume? - if (auto consumeOp = dyn_cast(op)) { - Value targetMemref = - consumeOp.getDmaCpyNdOp().getTargetObjectFifo().getMemref(); - memrefToLogicalObjectFifo[targetMemref] = - std::make_pair(consumeOp.getDmaCpyNdOp().getTargetObjectFifo(), - AMDAIE::MemoryAccess::Read); - } else if (auto produceOp = - dyn_cast(op)) { - Value sourceMemref = - produceOp.getDmaCpyNdOp().getSourceObjectFifo().getMemref(); + + SmallVector inputDmaOps = + llvm::map_to_vector(coreOp.getInputDmas(), [](Value inputDma) { + return cast(inputDma.getDefiningOp()); + }); + for (AMDAIE::DmaCpyNdOp inputDmaOp : inputDmaOps) { + Value targetMemref = inputDmaOp.getTargetObjectFifo().getMemref(); + memrefToLogicalObjectFifo[targetMemref] = std::make_pair( + inputDmaOp.getTargetObjectFifo(), AMDAIE::MemoryAccess::Read); + } + SmallVector outputDmaOps = + llvm::map_to_vector(coreOp.getOutputDmas(), [](Value outputDma) { + return cast(outputDma.getDefiningOp()); + }); + for (AMDAIE::DmaCpyNdOp outputDmaOp : outputDmaOps) { + Value sourceMemref = outputDmaOp.getSourceObjectFifo().getMemref(); memrefToLogicalObjectFifo[sourceMemref] = - std::make_pair(produceOp.getDmaCpyNdOp().getSourceObjectFifo(), + std::make_pair(outputDmaOp.getSourceObjectFifo(), AMDAIE::MemoryAccess::Write); - } - }); + } // We maintain a map from AllocOp to LogicalObjectFifoAccessOp in order to // avoid creating a new LogicalObjectFifoAccessOp for the same AllocOp being diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEInsertCores.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEInsertCores.cpp index 33d1ffbf1..25cc85c2a 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEInsertCores.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEInsertCores.cpp @@ -71,45 +71,43 @@ LogicalResult insertCoreOps(mlir::ModuleOp moduleOp) { } Value threadX = attrMapping[gpu::threadX(forallOp->getContext())]; Value threadY = attrMapping[gpu::threadY(forallOp->getContext())]; + + // Find input and output DMAs that need to be added to the core. + SmallVector inputDmas; + SmallVector outputDmas; + WalkResult dmaRes = forallOp->walk([&](AMDAIE::DmaCpyNdOp dmaOp) { + uint8_t sourceMemspace = + dmaOp.getSourceObjectFifo().getMemorySpaceAsUInt(); + uint8_t targetMemspace = + dmaOp.getTargetObjectFifo().getMemorySpaceAsUInt(); + if (sourceMemspace == 2 && targetMemspace == 2) { + dmaOp->emitOpError() + << "dma op with both source and target on L1 is not supported"; + return WalkResult::interrupt(); + } else if (sourceMemspace == 2) { + outputDmas.push_back(dmaOp); + } else if (targetMemspace == 2) { + inputDmas.push_back(dmaOp); + } + return WalkResult::advance(); + }); + if (dmaRes.wasInterrupted()) return WalkResult::interrupt(); + // Create CoreOp at the end of the innermost forall rewriter.setInsertionPoint(forallOp.getBody()->getTerminator()); - auto coreOp = rewriter.create(rewriter.getUnknownLoc(), - threadX, threadY); + auto coreOp = rewriter.create( + rewriter.getUnknownLoc(), threadX, threadY, inputDmas, outputDmas); Region ®ion = coreOp.getRegion(); Block *newBlock = rewriter.createBlock(®ion); rewriter.setInsertionPointToStart(newBlock); auto endOp = rewriter.create(rewriter.getUnknownLoc()); // Walk all operations in the workgroup and fill in the CoreOp with - // computational ops (linalg) and synchronization ops to synchronize - // with the workgroup DMA ops. + // computational ops. WalkResult forallRes = forallOp->walk([&](Operation *op) { // Skip operations already inside core ops if (op->getParentOfType()) return WalkResult::advance(); - if (auto dmaOp = dyn_cast(op)) { - auto sourceMemspace = dmaOp.getSourceObjectFifo().getMemorySpace(); - auto targetMemspace = dmaOp.getTargetObjectFifo().getMemorySpace(); - if (sourceMemspace && - dyn_cast(sourceMemspace).getInt() == 2 && - targetMemspace && - dyn_cast(targetMemspace).getInt() == 2) { - dmaOp->emitOpError() - << "dma op with both source and target on L1 is not supported"; - return WalkResult::interrupt(); - } else if (sourceMemspace && - dyn_cast(sourceMemspace).getInt() == 2) { - // From L1, so insert a logical objectFifo produce op - rewriter.setInsertionPoint(endOp); - rewriter.create( - rewriter.getUnknownLoc(), SmallVector{}, dmaOp); - } else if (targetMemspace && - dyn_cast(targetMemspace).getInt() == 2) { - // To L1, so insert a logical objectFifo consume op - rewriter.setInsertionPoint(endOp); - rewriter.create( - rewriter.getUnknownLoc(), SmallVector{}, dmaOp); - } - } else if (auto linalgOp = dyn_cast(op)) { + if (auto linalgOp = dyn_cast(op)) { rewriter.setInsertionPoint(endOp); rewriter.moveOpBefore(linalgOp, endOp); } else if (isa(op)) { diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELowerToAIE.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELowerToAIE.cpp index 8c1f21e48..da63935e3 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELowerToAIE.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELowerToAIE.cpp @@ -363,16 +363,6 @@ LogicalResult coreToAIE(IRRewriter &rewriter, AMDAIE::CoreOp coreOp, .Case([&](auto acquireOp) { return acquireOpToAIE(rewriter, acquireOp, mapper, toBeErased); }) - .Case([&](auto consumeOp) { - // TODO(jornt): get rid of LogicalObjectFifoConsume before this - rewriter.eraseOp(consumeOp); - return success(); - }) - .Case([&](auto produceOp) { - // TODO(jornt): get rid of LogicalObjectFifoProduce before this - rewriter.eraseOp(produceOp); - return success(); - }) .Case([&](auto releaseOp) { return coreReleaseOpToAIE(rewriter, releaseOp, mapper, toBeErased); diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/access_to_acquire_release.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/access_to_acquire_release.mlir index afdd2fb4e..3dce5d5bb 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/access_to_acquire_release.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/access_to_acquire_release.mlir @@ -12,9 +12,8 @@ func.func @read_access(%arg0: !amdaie.logicalobjectfifo> %c0 = arith.constant 0 : index %tile = amdaie.tile(%c0, %c0) %2 = amdaie.circular_dma_cpy_nd(%arg0[] [] [], %arg1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - %core = amdaie.core(%tile) { + %core = amdaie.core(%tile, in : [%2], out : []) { %3 = amdaie.logicalobjectfifo.access(%arg0, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x8x16xi32, 2> - amdaie.logicalobjectfifo.consume(%2) linalg.fill ins(%c0_i32 : i32) outs(%3 : memref<1x1x8x16xi32, 2>) amdaie.end } @@ -35,10 +34,9 @@ func.func @write_access(%arg0: !amdaie.logicalobjectfifo %c0_i32 = arith.constant 0 : i32 %tile = amdaie.tile(%c0, %c0) %2 = amdaie.circular_dma_cpy_nd(%arg1[] [] [], %arg0[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - %core = amdaie.core(%tile) { + %core = amdaie.core(%tile, in : [], out : [%2]) { %3 = amdaie.logicalobjectfifo.access(%arg0, Write) : !amdaie.logicalobjectfifo> -> memref<1x1x8x16xi32, 2> linalg.fill ins(%c0_i32 : i32) outs(%3 : memref<1x1x8x16xi32, 2>) - amdaie.logicalobjectfifo.produce(%2) amdaie.end } return @@ -55,7 +53,7 @@ func.func @none_access(%arg0: !amdaie.logicalobjectfifo> %c0_i32 = arith.constant 0 : i32 %c0 = arith.constant 0 : index %tile = amdaie.tile(%c0, %c0) - %core = amdaie.core(%tile) { + %core = amdaie.core(%tile, in : [], out : []) { %3 = amdaie.logicalobjectfifo.access(%arg0, None) : !amdaie.logicalobjectfifo> -> memref<1x1x8x16xi32, 2> linalg.fill ins(%c0_i32 : i32) outs(%3 : memref<1x1x8x16xi32, 2>) amdaie.end @@ -74,7 +72,7 @@ func.func @any_access(%arg0: !amdaie.logicalobjectfifo>) %c0_i32 = arith.constant 0 : i32 %c0 = arith.constant 0 : index %tile = amdaie.tile(%c0, %c0) - %core = amdaie.core(%tile) { + %core = amdaie.core(%tile, in : [], out : []) { %3 = amdaie.logicalobjectfifo.access(%arg0, Any) : !amdaie.logicalobjectfifo> -> memref<1x1x8x16xi32, 2> linalg.fill ins(%c0_i32 : i32) outs(%3 : memref<1x1x8x16xi32, 2>) amdaie.end @@ -102,13 +100,11 @@ func.func @read_and_write(%arg0: !amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) %3 = amdaie.circular_dma_cpy_nd(%arg3[] [] [], %arg2[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - %core = amdaie.core(%tile) { + %core = amdaie.core(%tile, in : [%2], out : [%3]) { %4 = amdaie.logicalobjectfifo.access(%arg0, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x8x16xi32, 2> %5 = amdaie.logicalobjectfifo.access(%arg2, Write) : !amdaie.logicalobjectfifo> -> memref<1x1x8x16xi32, 2> - amdaie.logicalobjectfifo.consume(%2) linalg.fill ins(%c0_i32 : i32) outs(%4 : memref<1x1x8x16xi32, 2>) linalg.fill ins(%c0_i32 : i32) outs(%5 : memref<1x1x8x16xi32, 2>) - amdaie.logicalobjectfifo.produce(%3) amdaie.end } return @@ -146,21 +142,17 @@ func.func @read_write_multiple_blocks(%arg0: !amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) %3 = amdaie.circular_dma_cpy_nd(%arg3[] [] [], %arg2[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - %core = amdaie.core(%tile) { + %core = amdaie.core(%tile, in : [%2], out : [%3]) { %4 = amdaie.logicalobjectfifo.access(%arg0, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x8x16xi32, 2> - amdaie.logicalobjectfifo.consume(%2) linalg.fill ins(%c0_i32 : i32) outs(%4 : memref<1x1x8x16xi32, 2>) scf.for %arg = %c0 to %c8 step %c1 { %5 = amdaie.logicalobjectfifo.access(%arg0, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x8x16xi32, 2> - amdaie.logicalobjectfifo.consume(%2) linalg.fill ins(%c0_i32 : i32) outs(%5 : memref<1x1x8x16xi32, 2>) } %6 = amdaie.logicalobjectfifo.access(%arg0, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x8x16xi32, 2> %7 = amdaie.logicalobjectfifo.access(%arg2, Write) : !amdaie.logicalobjectfifo> -> memref<1x1x8x16xi32, 2> - amdaie.logicalobjectfifo.consume(%2) linalg.fill ins(%c0_i32 : i32) outs(%6 : memref<1x1x8x16xi32, 2>) linalg.fill ins(%c0_i32 : i32) outs(%7 : memref<1x1x8x16xi32, 2>) - amdaie.logicalobjectfifo.produce(%3) amdaie.end } return @@ -187,11 +179,9 @@ func.func @multiple_reads_deterministic_order(%arg0: !amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) %3 = amdaie.circular_dma_cpy_nd(%arg2[] [] [], %arg3[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - %core = amdaie.core(%tile) { + %core = amdaie.core(%tile, in : [%2, %3], out : []) { %4 = amdaie.logicalobjectfifo.access(%arg0, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x8x16xi32, 2> %5 = amdaie.logicalobjectfifo.access(%arg2, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x8x16xi32, 2> - amdaie.logicalobjectfifo.consume(%2) - amdaie.logicalobjectfifo.consume(%3) linalg.fill ins(%c0_i32 : i32) outs(%4 : memref<1x1x8x16xi32, 2>) linalg.fill ins(%c0_i32 : i32) outs(%5 : memref<1x1x8x16xi32, 2>) amdaie.end @@ -220,13 +210,11 @@ func.func @multiple_writes_deterministic_order(%arg0: !amdaie.logicalobjectfifo< %tile = amdaie.tile(%c0, %c0) %2 = amdaie.circular_dma_cpy_nd(%arg1[] [] [], %arg0[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) %3 = amdaie.circular_dma_cpy_nd(%arg3[] [] [], %arg2[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - %core = amdaie.core(%tile) { + %core = amdaie.core(%tile, in : [], out : [%2, %3]) { %4 = amdaie.logicalobjectfifo.access(%arg0, Write) : !amdaie.logicalobjectfifo> -> memref<1x1x8x16xi32, 2> %5 = amdaie.logicalobjectfifo.access(%arg2, Write) : !amdaie.logicalobjectfifo> -> memref<1x1x8x16xi32, 2> linalg.fill ins(%c0_i32 : i32) outs(%4 : memref<1x1x8x16xi32, 2>) linalg.fill ins(%c0_i32 : i32) outs(%5 : memref<1x1x8x16xi32, 2>) - amdaie.logicalobjectfifo.produce(%2) - amdaie.logicalobjectfifo.produce(%3) amdaie.end } return diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/canonicalize_doubly_strided_op.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/canonicalize_doubly_strided_op.mlir index 77950fd7b..951f9f6d8 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/canonicalize_doubly_strided_op.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/canonicalize_doubly_strided_op.mlir @@ -1,5 +1,5 @@ -// RUN: iree-opt --pass-pipeline="builtin.module(func.func(iree-amdaie-canonicalize-doubly-strided-op,canonicalize))" %s | FileCheck %s -// RUN: iree-opt --pass-pipeline="builtin.module(func.func(iree-amdaie-canonicalize-doubly-strided-op{fold-single-dims=true},canonicalize))" %s | FileCheck %s --check-prefix=FOLD-SINGLE-DIMS +// RUN: iree-opt --pass-pipeline="builtin.module(func.func(iree-amdaie-canonicalize-doubly-strided-op,canonicalize))" -allow-unregistered-dialect %s | FileCheck %s +// RUN: iree-opt --pass-pipeline="builtin.module(func.func(iree-amdaie-canonicalize-doubly-strided-op{fold-single-dims=true},canonicalize))" -allow-unregistered-dialect %s | FileCheck %s --check-prefix=FOLD-SINGLE-DIMS // Verify that source and target of `amdaie.circular_dma_cpy_nd` is still correct after canonicalization. // @@ -18,7 +18,7 @@ // FOLD-SINGLE-DIMS: amdaie.circular_dma_cpy_nd(%[[ARG0]][] [] [], %[[ARG1]][] [] []) func.func @circular_dma_cpy_nd_source_target(%arg0: !amdaie.logicalobjectfifo>, %arg1: !amdaie.logicalobjectfifo>) { %0 = amdaie.circular_dma_cpy_nd(%arg0[0, 0, 0, 0] [1, 1, 8, 16] [128, 128, 16, 1], %arg1[0, 0, 0, 0] [1, 4, 2, 8] [64, 16, 8, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - amdaie.logicalobjectfifo.consume(%0) + "iree.keep"(%0) : (index) -> () return } @@ -33,7 +33,7 @@ func.func @circular_dma_cpy_nd_source_target(%arg0: !amdaie.logicalobjectfifo>, %arg1: !amdaie.logicalobjectfifo>) { %0 = amdaie.circular_dma_cpy_nd(%arg0[0, 0, 0, 0] [1, 1, 8, 16] [128, 128, 16, 1], %arg1[0, 0, 0, 0] [1, 4, 2, 8] [64, 16, 8, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - amdaie.logicalobjectfifo.consume(%0) + "iree.keep"(%0) : (index) -> () return } @@ -57,7 +57,7 @@ func.func @circular_dma_cpy_nd_linear_implicit(%arg0: !amdaie.logicalobjectfifo< func.func @circular_dma_cpy_nd_linear(%arg0: !amdaie.logicalobjectfifo>, %arg1: !amdaie.logicalobjectfifo>) { %c16 = arith.constant 16 : index %0 = amdaie.circular_dma_cpy_nd(%arg0[0, 0, 0, 0] [1, 2, 8, 8] [256, 128, %c16, 1], %arg1[0, 0, 0, 0] [64, 16, 8, %c16] [128, %c16, %c16, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - amdaie.logicalobjectfifo.consume(%0) + "iree.keep"(%0) : (index) -> () return } @@ -68,7 +68,7 @@ func.func @circular_dma_cpy_nd_linear(%arg0: !amdaie.logicalobjectfifo>, %arg1: !amdaie.logicalobjectfifo>) { %0 = amdaie.circular_dma_cpy_nd(%arg0[0, 0, 0, 0] [2, 2, 8, 8] [256, 64, 16, 1], %arg1[0, 0, 0, 0] [2, 2, 8, 16] [128, 16, 8, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - amdaie.logicalobjectfifo.consume(%0) + "iree.keep"(%0) : (index) -> () return } @@ -90,7 +90,7 @@ func.func @circular_dma_cpy_nd_no_linear(%arg0: !amdaie.logicalobjectfifo>, %arg1: !amdaie.logicalobjectfifo>) { %0 = amdaie.circular_dma_cpy_nd(%arg0[0, 0, 0, 0, 0, 0] [1, 1, 2, 2, 4, 8] [128, 128, 64, 32, 8, 1], %arg1[0, 0, 0, 0, 0, 0] [1, 1, 2, 2, 4, 8] [128, 128, 8, 64, 16, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - amdaie.logicalobjectfifo.consume(%0) + "iree.keep"(%0) : (index) -> () return } @@ -104,7 +104,7 @@ func.func @circular_dma_cpy_nd_unit(%arg0: !amdaie.logicalobjectfifo>, %arg1: !amdaie.logicalobjectfifo>) { %0 = amdaie.circular_dma_cpy_nd(%arg0[0, 0, 0, 0, 0, 0] [1, 2, 2, 4, 1, 8] [128, 64, 32, 8, 8, 1], %arg1[0, 0, 0, 0, 0, 0] [2, 2, 1, 4, 8, 1] [64, 32, 32, 8, 1, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - amdaie.logicalobjectfifo.consume(%0) + "iree.keep"(%0) : (index) -> () return } @@ -115,7 +115,7 @@ func.func @circular_dma_cpy_nd_unit_between_linear(%arg0: !amdaie.logicalobjectf // FOLD-SINGLE-DIMS: amdaie.circular_dma_cpy_nd(%{{.+}}[1, 1, 1, 1] [1, 1, 8, 16] [128, 128, 16, 1], %{{.+}}[1, 1, 1, 1] [1, 4, 2, 8] [64, 16, 8, 1]) func.func @circular_dma_cpy_nd_non_zero_offset(%arg0: !amdaie.logicalobjectfifo>, %arg1: !amdaie.logicalobjectfifo>) { %0 = amdaie.circular_dma_cpy_nd(%arg0[1, 1, 1, 1] [1, 1, 8, 16] [128, 128, 16, 1], %arg1[1, 1, 1, 1] [1, 4, 2, 8] [64, 16, 8, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - amdaie.logicalobjectfifo.consume(%0) + "iree.keep"(%0) : (index) -> () return } @@ -132,7 +132,7 @@ func.func @circular_dma_cpy_nd_non_zero_offset(%arg0: !amdaie.logicalobjectfifo< // FOLD-SINGLE-DIMS: amdaie.circular_dma_cpy_nd(%{{.+}}[%[[C1]]] [%[[C128]]] [%[[C1]]], %{{.+}}[%[[C1]]] [%[[C64]]] [%[[C1]]]) func.func @circular_dma_cpy_nd_partial_non_zero_offset(%arg0: !amdaie.logicalobjectfifo>, %arg1: !amdaie.logicalobjectfifo>) { %0 = amdaie.circular_dma_cpy_nd(%arg0[0, 0, 0, 1] [1, 1, 8, 16] [128, 128, 16, 1], %arg1[0, 0, 0, 1] [1, 4, 2, 8] [64, 16, 8, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - amdaie.logicalobjectfifo.consume(%0) + "iree.keep"(%0) : (index) -> () return } @@ -155,7 +155,7 @@ func.func @circular_dma_cpy_nd_partial_non_zero_offset(%arg0: !amdaie.logicalobj // FOLD-SINGLE-DIMS: amdaie.dma_cpy_nd(%[[ARG0]][] [] [], %[[ARG1]][] [] []) func.func @dma_cpy_nd_source_target(%arg0: !amdaie.logicalobjectfifo>, %arg1: !amdaie.logicalobjectfifo>) { %0 = amdaie.dma_cpy_nd(%arg0[0, 0, 0, 0] [1, 1, 8, 16] [128, 128, 16, 1], %arg1[0, 0, 0, 0] [1, 4, 2, 8] [64, 16, 8, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - amdaie.logicalobjectfifo.consume(%0) + "iree.keep"(%0) : (index) -> () return } @@ -170,7 +170,7 @@ func.func @dma_cpy_nd_source_target(%arg0: !amdaie.logicalobjectfifo>, %arg1: !amdaie.logicalobjectfifo>) { %0 = amdaie.dma_cpy_nd(%arg0[0, 0, 0, 0] [1, 1, 8, 16] [128, 128, 16, 1], %arg1[0, 0, 0, 0] [1, 4, 2, 8] [64, 16, 8, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - amdaie.logicalobjectfifo.consume(%0) + "iree.keep"(%0) : (index) -> () return } @@ -194,7 +194,7 @@ func.func @dma_cpy_nd_linear_implicit(%arg0: !amdaie.logicalobjectfifo>, %arg1: !amdaie.logicalobjectfifo>) { %c16 = arith.constant 16 : index %0 = amdaie.dma_cpy_nd(%arg0[0, 0, 0, 0] [1, 2, 8, 8] [256, 128, %c16, 1], %arg1[0, 0, 0, 0] [64, 16, 8, %c16] [128, %c16, %c16, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - amdaie.logicalobjectfifo.consume(%0) + "iree.keep"(%0) : (index) -> () return } @@ -205,7 +205,7 @@ func.func @dma_cpy_nd_linear(%arg0: !amdaie.logicalobjectfifo>, %arg1: !amdaie.logicalobjectfifo>) { %0 = amdaie.dma_cpy_nd(%arg0[0, 0, 0, 0] [2, 2, 8, 8] [256, 64, 16, 1], %arg1[0, 0, 0, 0] [2, 2, 8, 16] [128, 16, 8, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - amdaie.logicalobjectfifo.consume(%0) + "iree.keep"(%0) : (index) -> () return } @@ -227,7 +227,7 @@ func.func @dma_cpy_nd_no_linear(%arg0: !amdaie.logicalobjectfifo>, %arg1: !amdaie.logicalobjectfifo>) { %0 = amdaie.dma_cpy_nd(%arg0[0, 0, 0, 0, 0, 0] [1, 1, 2, 2, 4, 8] [128, 128, 64, 32, 8, 1], %arg1[0, 0, 0, 0, 0, 0] [1, 1, 2, 2, 4, 8] [128, 128, 8, 64, 16, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - amdaie.logicalobjectfifo.consume(%0) + "iree.keep"(%0) : (index) -> () return } @@ -241,7 +241,7 @@ func.func @dma_cpy_nd_unit(%arg0: !amdaie.logicalobjectfifo>, %arg1: !amdaie.logicalobjectfifo>) { %0 = amdaie.dma_cpy_nd(%arg0[0, 0, 0, 0, 0, 0] [2, 2, 1, 1, 4, 8] [64, 32, 32, 32, 8, 1], %arg1[0, 0, 0, 0, 0, 0] [2, 1, 2, 1, 4, 8] [64, 64, 32, 32, 8, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - amdaie.logicalobjectfifo.consume(%0) + "iree.keep"(%0) : (index) -> () return } @@ -252,7 +252,7 @@ func.func @dma_cpy_nd_unit_between_linear(%arg0: !amdaie.logicalobjectfifo>, %arg1: !amdaie.logicalobjectfifo>) { %0 = amdaie.dma_cpy_nd(%arg0[1, 1, 1, 1] [1, 1, 8, 16] [128, 128, 16, 1], %arg1[1, 1, 1, 1] [1, 4, 2, 8] [64, 16, 8, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - amdaie.logicalobjectfifo.consume(%0) + "iree.keep"(%0) : (index) -> () return } @@ -269,7 +269,7 @@ func.func @dma_cpy_nd_non_zero_offset(%arg0: !amdaie.logicalobjectfifo>, %arg1: !amdaie.logicalobjectfifo>) { %0 = amdaie.dma_cpy_nd(%arg0[0, 0, 0, 1] [1, 1, 8, 16] [128, 128, 16, 1], %arg1[0, 0, 0, 1] [1, 4, 2, 8] [64, 16, 8, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - amdaie.logicalobjectfifo.consume(%0) + "iree.keep"(%0) : (index) -> () return } diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/create_aie_workgroup.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/create_aie_workgroup.mlir index b3976a2ab..eb28060b2 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/create_aie_workgroup.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/create_aie_workgroup.mlir @@ -40,18 +40,18 @@ func.func @circular_dma_cpy_nd(%arg0: !amdaie.logicalobjectfifo, %arg1: memref<8x16xi32, 1>) // CHECK-SAME: %[[FROMMEMREF0]][] [] [] // CHECK-SAME: %[[FROMMEMREF1]][0, 0, 0, 0] [1, 1, 8, 16] [128, 16, 16, 1] // CHECK-SAME: (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) -// CHECK-DAG: %{{.+}} = amdaie.core(%[[TILE_0]]) -// CHECK: amdaie.logicalobjectfifo.consume(%[[DMA]]) +// CHECK-DAG: %{{.+}} = amdaie.core(%[[TILE_0]], in : [], out : []) +// CHECK: amdaie.logicalobjectfifo.access(%[[FROMMEMREF0]], Read) // CHECK: scf.for %{{.*}} = %[[C0]] to %[[C8]] step %[[C1]] -// CHECK-DAG: %{{.+}} = amdaie.core(%[[TILE_1]]) -// CHECK: amdaie.logicalobjectfifo.consume(%[[DMA]]) +// CHECK-DAG: %{{.+}} = amdaie.core(%[[TILE_1]], in : [], out : []) +// CHECK: amdaie.logicalobjectfifo.access(%[[FROMMEMREF0]], Read) // CHECK: scf.for %{{.*}} = %[[C0]] to %[[C8]] step %[[C1]] // CHECK: amdaie.controlcode // CHECK-DAG: %[[C0_1:.+]] = arith.constant 0 : index @@ -375,19 +375,19 @@ func.func @merge_cores(%arg0: memref<1x1x8x16xi32>, %arg1: memref<8x16xi32, 1>) %0 = amdaie.logicalobjectfifo.from_memref %arg0, {} : memref<1x1x8x16xi32> -> !amdaie.logicalobjectfifo> %1 = amdaie.logicalobjectfifo.from_memref %arg1, {} : memref<8x16xi32, 1> -> !amdaie.logicalobjectfifo> %2 = amdaie.dma_cpy_nd(%0[] [] [], %1[0, 0, 0, 0] [1, 1, 8, 16] [128, 16, 16, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - %core_0_0_0 = amdaie.core(%tile_0_0) { - amdaie.logicalobjectfifo.consume(%2) + %core_0_0_0 = amdaie.core(%tile_0_0, in : [], out : []) { + amdaie.logicalobjectfifo.access(%0, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x8x16xi32> amdaie.end } - %core_0_1_0 = amdaie.core(%tile_0_1) { - amdaie.logicalobjectfifo.consume(%2) + %core_0_1_0 = amdaie.core(%tile_0_1, in : [], out : []) { + amdaie.logicalobjectfifo.access(%0, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x8x16xi32> amdaie.end } scf.for %arg2 = %c0 to %c8 step %c1 { - %core_0_0_1 = amdaie.core(%tile_0_0) { + %core_0_0_1 = amdaie.core(%tile_0_0, in : [], out : []) { amdaie.end } - %core_0_1_1 = amdaie.core(%tile_0_1) { + %core_0_1_1 = amdaie.core(%tile_0_1, in : [], out : []) { amdaie.end } } @@ -427,15 +427,15 @@ func.func @merge_cores(%arg0: memref<1x1x8x16xi32>, %arg1: memref<8x16xi32, 1>) // CHECK-SAME: %[[FROMMEMREF4]][] [] [] // CHECK-SAME: %[[FROMMEMREF5]][0, 0, 0, 0] [1, 1, 32, 16] [128, 16, 8, 1] // CHECK-SAME: (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) -// CHECK-DAG: %{{.+}} = amdaie.core(%[[TILE_0]]) -// CHECK: amdaie.logicalobjectfifo.consume(%[[DMA0]]) +// CHECK-DAG: %{{.+}} = amdaie.core(%[[TILE_0]], in : [], out : []) +// CHECK: amdaie.logicalobjectfifo.access(%[[FROMMEMREF0]], Read) // CHECK: scf.for %{{.*}} = %[[C0]] to %[[C8]] step %[[C1]] -// CHECK: amdaie.logicalobjectfifo.consume(%[[DMA1]]) +// CHECK: amdaie.logicalobjectfifo.access(%[[FROMMEMREF2]], Read) // CHECK: linalg.fill -// CHECK-DAG: %{{.+}} = amdaie.core(%[[TILE_1]]) -// CHECK: amdaie.logicalobjectfifo.consume(%[[DMA0]]) +// CHECK-DAG: %{{.+}} = amdaie.core(%[[TILE_1]], in : [], out : []) +// CHECK: amdaie.logicalobjectfifo.access(%[[FROMMEMREF0]], Read) // CHECK: scf.for %{{.*}} = %[[C0]] to %[[C8]] step %[[C1]] -// CHECK: amdaie.logicalobjectfifo.consume(%[[DMA2]]) +// CHECK: amdaie.logicalobjectfifo.access(%[[FROMMEMREF4]], Read) // CHECK: linalg.fill // CHECK: amdaie.controlcode // CHECK-DAG: %[[C0_1:.+]] = arith.constant 0 : index @@ -468,24 +468,24 @@ func.func @complex_example(%arg0: memref<1x1x8x16xi32>, %arg1: memref<8x16xi32, %4 = amdaie.logicalobjectfifo.from_memref %arg4, {} : memref<1x1x32x16xi32> -> !amdaie.logicalobjectfifo> %5 = amdaie.logicalobjectfifo.from_memref %arg5, {} : memref<32x16xi32, 1> -> !amdaie.logicalobjectfifo> %dma_0 = amdaie.dma_cpy_nd(%0[] [] [], %1[0, 0, 0, 0] [1, 1, 8, 16] [128, 16, 16, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - %core_0_0_0 = amdaie.core(%tile_0_0) { - amdaie.logicalobjectfifo.consume(%dma_0) + %core_0_0_0 = amdaie.core(%tile_0_0, in : [], out : []) { + amdaie.logicalobjectfifo.access(%0, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x8x16xi32> amdaie.end } - %core_0_1_0 = amdaie.core(%tile_0_1) { - amdaie.logicalobjectfifo.consume(%dma_0) + %core_0_1_0 = amdaie.core(%tile_0_1, in : [], out : []) { + amdaie.logicalobjectfifo.access(%0, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x8x16xi32> amdaie.end } scf.for %iv0 = %c0 to %c8 step %c1 { %dma_1 = amdaie.dma_cpy_nd(%2[] [] [], %3[0, 0, 0, 0] [1, 1, 16, 16] [128, 16, 8, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) %dma_2 = amdaie.dma_cpy_nd(%4[] [] [], %5[0, 0, 0, 0] [1, 1, 32, 16] [128, 16, 8, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - %core_0_0_1 = amdaie.core(%tile_0_0) { - amdaie.logicalobjectfifo.consume(%dma_1) + %core_0_0_1 = amdaie.core(%tile_0_0, in : [], out : []) { + amdaie.logicalobjectfifo.access(%2, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x16x16xi32> linalg.fill ins(%c0_i32 : i32) outs(%arg2 : memref<1x1x16x16xi32>) amdaie.end } - %core_0_1_1 = amdaie.core(%tile_0_1) { - amdaie.logicalobjectfifo.consume(%dma_2) + %core_0_1_1 = amdaie.core(%tile_0_1, in : [], out : []) { + amdaie.logicalobjectfifo.access(%4, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x32x16xi32> linalg.fill ins(%c0_i32 : i32) outs(%arg4 : memref<1x1x32x16xi32>) amdaie.end } diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/distribute_cores_and_objectfifos.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/distribute_cores_and_objectfifos.mlir index db150b117..3e1a28e8c 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/distribute_cores_and_objectfifos.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/distribute_cores_and_objectfifos.mlir @@ -11,20 +11,20 @@ // CHECK-DAG: %[[C3:.*]] = arith.constant 3 : index // CHECK: scf.forall (%[[ARG0:.*]], %[[ARG1:.*]]) in (1, 1) { // CHECK: %[[TILE_0:.*]] = amdaie.tile(%[[C0]], %[[C2]]) -// CHECK: %{{.*}} = amdaie.core(%[[TILE_0]]) +// CHECK: %{{.*}} = amdaie.core(%[[TILE_0]], in : [], out : []) // CHECK: %[[TILE_1:.*]] = amdaie.tile(%[[C1]], %[[C2]]) -// CHECK: %{{.*}} = amdaie.core(%[[TILE_1]]) +// CHECK: %{{.*}} = amdaie.core(%[[TILE_1]], in : [], out : []) // CHECK: %[[TILE_2:.*]] = amdaie.tile(%[[C2]], %[[C2]]) -// CHECK: %{{.*}} = amdaie.core(%[[TILE_2]]) +// CHECK: %{{.*}} = amdaie.core(%[[TILE_2]], in : [], out : []) // CHECK: %[[TILE_3:.*]] = amdaie.tile(%[[C3]], %[[C2]]) -// CHECK: %{{.*}} = amdaie.core(%[[TILE_3]]) +// CHECK: %{{.*}} = amdaie.core(%[[TILE_3]], in : [], out : []) module { func.func @distribute_cores_and_objectfifos_1x4() { %c2 = arith.constant 2 : index scf.forall (%arg0, %arg1) in (1, 1) { scf.forall (%arg2, %arg3) in (1, 4) { %tile = amdaie.tile(%arg3, %c2) - %21 = amdaie.core(%tile) { + %21 = amdaie.core(%tile, in : [], out : []) { amdaie.end } } {mapping = [#gpu.thread, #gpu.thread]} @@ -43,19 +43,19 @@ module { // CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index // CHECK: scf.forall // CHECK-DAG: %[[TILE_0_0:.*]] = amdaie.tile(%[[C0]], %[[C0]]) -// CHECK-DAG: %[[CORE_0_0:.*]] = amdaie.core(%[[TILE_0_0]]) +// CHECK-DAG: %[[CORE_0_0:.*]] = amdaie.core(%[[TILE_0_0]], in : [], out : []) // CHECK-DAG: %[[TILE_0_1:.*]] = amdaie.tile(%[[C0]], %[[C1]]) -// CHECK-DAG: %[[CORE_0_1:.*]] = amdaie.core(%[[TILE_0_1]]) +// CHECK-DAG: %[[CORE_0_1:.*]] = amdaie.core(%[[TILE_0_1]], in : [], out : []) // CHECK-DAG: %[[TILE_1_0:.*]] = amdaie.tile(%[[C1]], %[[C0]]) -// CHECK-DAG: %[[CORE_1_0:.*]] = amdaie.core(%[[TILE_1_0]]) +// CHECK-DAG: %[[CORE_1_0:.*]] = amdaie.core(%[[TILE_1_0]], in : [], out : []) // CHECK-DAG: %[[TILE_1_1:.*]] = amdaie.tile(%[[C1]], %[[C1]]) -// CHECK-DAG: %[[CORE_1_1:.*]] = amdaie.core(%[[TILE_1_1]]) +// CHECK-DAG: %[[CORE_1_1:.*]] = amdaie.core(%[[TILE_1_1]], in : [], out : []) module { func.func @distribute_cores_and_objectfifos_2x2() { scf.forall (%arg0, %arg1) in (1, 1) { scf.forall (%arg2, %arg3) in (2, 2) { %tile = amdaie.tile(%arg3, %arg2) - %0 = amdaie.core(%tile) { + %0 = amdaie.core(%tile, in : [], out : []) { amdaie.end } } {mapping = [#gpu.thread, #gpu.thread]} @@ -85,14 +85,12 @@ module { // CHECK-DAG: %[[FROM_MEMREF_2:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_2]], {%[[TILE_0_2]]} // CHECK-DAG: %[[DMA_0:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_2]] // CHECK-SAME: %[[FROM_MEMREF_0]] -// CHECK-DAG: %[[CORE_0:.*]] = amdaie.core(%[[TILE_0_2]]) -// CHECK: amdaie.logicalobjectfifo.consume(%[[DMA_0]]) +// CHECK-DAG: %[[CORE_0:.*]] = amdaie.core(%[[TILE_0_2]], in : [%[[DMA_0]]], out : []) // CHECK: %[[VAL_0:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_2]], Read) // CHECK: linalg.fill ins(%{{.+}} : i32) outs(%[[VAL_0]] : memref<32x64xi32, 2>) // CHECK-DAG: %[[DMA_1:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_1]] // CHECK-SAME: %[[FROM_MEMREF_0]] -// CHECK-DAG: %[[CORE_1:.*]] = amdaie.core(%[[TILE_1_2]]) -// CHECK: amdaie.logicalobjectfifo.consume(%[[DMA_1]]) +// CHECK-DAG: %[[CORE_1:.*]] = amdaie.core(%[[TILE_1_2]], in : [%[[DMA_1]]], out : []) // CHECK: %[[VAL_1:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_1]], Read) // CHECK: linalg.fill ins(%{{.+}} : i32) outs(%[[VAL_1]] : memref<32x64xi32, 2>) module { @@ -107,8 +105,7 @@ module { scf.forall (%arg2, %arg3) in (1, 2) { %2 = amdaie.dma_cpy_nd(%1[] [] [], %0[%arg3, %arg3] [%arg3, %arg3] [%arg3, %arg3]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) %tile = amdaie.tile(%arg3, %c2) - %3 = amdaie.core(%tile) { - amdaie.logicalobjectfifo.consume(%2) + %3 = amdaie.core(%tile, in : [%2], out : []) { linalg.fill ins(%c0_i32 : i32) outs(%alloc_1 : memref<32x64xi32, 2>) amdaie.end } @@ -140,12 +137,10 @@ module { // CHECK-DAG: %[[FROM_MEMREF_1:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_1]], {%[[TILE_0_2]], %[[TILE_1_2]]} // CHECK-DAG: %[[DMA_0:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_1]] // CHECK-SAME: %[[FROM_MEMREF_0]] -// CHECK-DAG: %[[CORE_0:.*]] = amdaie.core(%[[TILE_0_2]]) -// CHECK: amdaie.logicalobjectfifo.consume(%[[DMA_0]]) +// CHECK-DAG: %[[CORE_0:.*]] = amdaie.core(%[[TILE_0_2]], in : [%[[DMA_0]]], out : []) // CHECK: %[[VAL_0:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_1]], Read) // CHECK: linalg.fill ins(%{{.+}} : i32) outs(%[[VAL_0]] : memref<32x64xi32, 2>) -// CHECK-DAG: %[[CORE_1:.*]] = amdaie.core(%[[TILE_1_2]]) -// CHECK: amdaie.logicalobjectfifo.consume(%[[DMA_0]]) +// CHECK-DAG: %[[CORE_1:.*]] = amdaie.core(%[[TILE_1_2]], in : [%[[DMA_0]]], out : []) // CHECK: %[[VAL_0:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_1]], Read) // CHECK: linalg.fill ins(%{{.+}} : i32) outs(%[[VAL_0]] : memref<32x64xi32, 2>) module { @@ -160,8 +155,7 @@ module { scf.forall (%arg2, %arg3) in (1, 2) { %2 = amdaie.dma_cpy_nd(%1[] [] [], %0[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) %tile = amdaie.tile(%arg3, %c2) - %3 = amdaie.core(%tile) { - amdaie.logicalobjectfifo.consume(%2) + %3 = amdaie.core(%tile, in : [%2], out : []) { linalg.fill ins(%c0_i32 : i32) outs(%alloc_1 : memref<32x64xi32, 2>) amdaie.end } @@ -198,9 +192,9 @@ module { // CHECK-DAG: %[[FROM_MEMREF_1:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_1]], {%[[TILE_0_2]], %[[TILE_0_3]]} // CHECK-DAG: %[[DMA_0:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_1]] // CHECK-NOT: amdaie.dma_cpy_nd -// CHECK-DAG: amdaie.core(%[[TILE_0_2]]) +// CHECK-DAG: amdaie.core(%[[TILE_0_2]], in : [%[[DMA_0]]], out : []) // CHECK: %[[VAL_0:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_1]], Read) -// CHECK-DAG: amdaie.core(%[[TILE_0_3]]) +// CHECK-DAG: amdaie.core(%[[TILE_0_3]], in : [%[[DMA_0]]], out : []) // CHECK: %[[VAL_0:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_1]], Read) #map = affine_map<(d0) -> (d0 * 32)> module { @@ -217,8 +211,7 @@ module { %2 = amdaie.dma_cpy_nd(%1[] [] [], %0[%apply] [%c2] [%c2]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) %add = arith.addi %arg2, %c2 : index %tile = amdaie.tile(%arg3, %add) - %3 = amdaie.core(%tile) { - amdaie.logicalobjectfifo.consume(%2) + %3 = amdaie.core(%tile, in : [%2], out : []) { linalg.fill ins(%c0_i32 : i32) outs(%alloc_1 : memref<32x64xi32, 2>) amdaie.end } @@ -252,12 +245,12 @@ module { // CHECK-DAG: %[[FROM_MEMREF_0:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_0]], {%[[TILE_0_1]]} // CHECK-DAG: %[[FROM_MEMREF_1:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_1]], {%[[TILE_0_3]]} // CHECK-DAG: %[[FROM_MEMREF_2:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_1]], {%[[TILE_0_2]]} -// CHECK-DAG: %[[DMA_0:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_1]] -// CHECK-DAG: amdaie.core(%[[TILE_0_2]]) -// CHECK-DAG: %[[VAL_0:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_1]], Read) -// CHECK-DAG: %[[DMA_1:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_2]] -// CHECK-DAG: amdaie.core(%[[TILE_0_3]]) +// CHECK-DAG: %[[DMA_0:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_2]] +// CHECK-DAG: amdaie.core(%[[TILE_0_2]], in : [%[[DMA_0]]], out : []) // CHECK-DAG: %[[VAL_0:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_2]], Read) +// CHECK-DAG: %[[DMA_1:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_1]] +// CHECK-DAG: amdaie.core(%[[TILE_0_3]], in : [%[[DMA_1]]], out : []) +// CHECK-DAG: %[[VAL_0:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_1]], Read) #map = affine_map<(d0) -> (d0 * 32)> module { func.func @unroll_dma_and_affine_single_loop() { @@ -273,8 +266,7 @@ module { %2 = amdaie.dma_cpy_nd(%1[] [] [], %0[%apply] [%c2] [%c2]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) %add = arith.addi %arg2, %c2 : index %tile = amdaie.tile(%arg3, %add) - %3 = amdaie.core(%tile) { - amdaie.logicalobjectfifo.consume(%2) + %3 = amdaie.core(%tile, in : [%2], out : []) { linalg.fill ins(%c0_i32 : i32) outs(%alloc_1 : memref<32x64xi32, 2>) amdaie.end } @@ -309,18 +301,14 @@ module { // CHECK-DAG: %[[FROM_MEMREF_1:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_1]], {%[[TILE_0_2]], %[[TILE_0_3]], %[[TILE_1_2]], %[[TILE_1_3]]} // CHECK-DAG: %[[DMA_0:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_1]] // CHECK-SAME: %[[FROM_MEMREF_0]] -// CHECK-DAG: %[[CORE_0_2:.*]] = amdaie.core(%[[TILE_0_2]]) +// CHECK-DAG: %[[CORE_0_2:.*]] = amdaie.core(%[[TILE_0_2]], in : [%[[DMA_0]]], out : []) // CHECK-DAG: %[[VAL_0:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_1]], Read) -// CHECK-DAG: amdaie.logicalobjectfifo.consume(%[[DMA_0]]) -// CHECK-DAG: %[[CORE_0_3:.*]] = amdaie.core(%[[TILE_0_3]]) +// CHECK-DAG: %[[CORE_0_3:.*]] = amdaie.core(%[[TILE_0_3]], in : [%[[DMA_0]]], out : []) // CHECK-DAG: %[[VAL_0:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_1]], Read) -// CHECK-DAG: amdaie.logicalobjectfifo.consume(%[[DMA_0]]) -// CHECK-DAG: %[[CORE_1_2:.*]] = amdaie.core(%[[TILE_1_2]]) +// CHECK-DAG: %[[CORE_1_2:.*]] = amdaie.core(%[[TILE_1_2]], in : [%[[DMA_0]]], out : []) // CHECK-DAG: %[[VAL_0:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_1]], Read) -// CHECK-DAG: amdaie.logicalobjectfifo.consume(%[[DMA_0]]) -// CHECK-DAG: %[[CORE_1_3:.*]] = amdaie.core(%[[TILE_1_3]]) +// CHECK-DAG: %[[CORE_1_3:.*]] = amdaie.core(%[[TILE_1_3]], in : [%[[DMA_0]]], out : []) // CHECK-DAG: %[[VAL_0:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_1]], Read) -// CHECK-DAG: amdaie.logicalobjectfifo.consume(%[[DMA_0]]) module { func.func @hoist_dma_multi_loop() { %c0_i32 = arith.constant 0 : i32 @@ -334,8 +322,7 @@ module { %2 = amdaie.dma_cpy_nd(%1[] [] [], %0[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) %add = arith.addi %arg2, %c2 : index %tile = amdaie.tile(%arg3, %add) - %3 = amdaie.core(%tile) { - amdaie.logicalobjectfifo.consume(%2) + %3 = amdaie.core(%tile, in : [%2], out : []) { linalg.fill ins(%c0_i32 : i32) outs(%alloc_1 : memref<32x64xi32, 2>) amdaie.end } @@ -374,14 +361,14 @@ module { // CHECK-SAME: %[[FROM_MEMREF_0]] // CHECK-DAG: %[[DMA_1:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_1]] // CHECK-SAME: %[[FROM_MEMREF_0]] -// CHECK-DAG: %[[CORE_0_2:.*]] = amdaie.core(%[[TILE_0_2]]) -// CHECK-DAG: amdaie.logicalobjectfifo.consume(%[[DMA_0]]) -// CHECK-DAG: %[[CORE_0_3:.*]] = amdaie.core(%[[TILE_0_3]]) -// CHECK-DAG: amdaie.logicalobjectfifo.consume(%[[DMA_1]]) -// CHECK-DAG: %[[CORE_1_2:.*]] = amdaie.core(%[[TILE_1_2]]) -// CHECK-DAG: amdaie.logicalobjectfifo.consume(%[[DMA_0]]) -// CHECK-DAG: %[[CORE_1_3:.*]] = amdaie.core(%[[TILE_1_3]]) -// CHECK-DAG: amdaie.logicalobjectfifo.consume(%[[DMA_1]]) +// CHECK-DAG: %[[CORE_0_2:.*]] = amdaie.core(%[[TILE_0_2]], in : [%[[DMA_0]]], out : []) +// CHECK-DAG: amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_2]], Read) +// CHECK-DAG: %[[CORE_0_3:.*]] = amdaie.core(%[[TILE_0_3]], in : [%[[DMA_0]]], out : []) +// CHECK-DAG: amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_2]], Read) +// CHECK-DAG: %[[CORE_1_2:.*]] = amdaie.core(%[[TILE_1_2]], in : [%[[DMA_1]]], out : []) +// CHECK-DAG: amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_1]], Read) +// CHECK-DAG: %[[CORE_1_3:.*]] = amdaie.core(%[[TILE_1_3]], in : [%[[DMA_1]]], out : []) +// CHECK-DAG: amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_1]], Read) module { func.func @hoist_dma_one_of_multi_loop() { %c0_i32 = arith.constant 0 : i32 @@ -395,8 +382,7 @@ module { %2 = amdaie.dma_cpy_nd(%1[] [] [], %0[%arg3] [%arg3] [%arg3]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) %add = arith.addi %arg2, %c2 : index %tile = amdaie.tile(%arg3, %add) - %3 = amdaie.core(%tile) { - amdaie.logicalobjectfifo.consume(%2) + %3 = amdaie.core(%tile, in : [%2], out : []) { linalg.fill ins(%c0_i32 : i32) outs(%alloc_1 : memref<32x64xi32, 2>) amdaie.end } @@ -445,21 +431,17 @@ module { // CHECK-SAME: %[[FROM_MEMREF_0]] // CHECK-DAG: %[[DMA_3:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_3]] // CHECK-SAME: %[[FROM_MEMREF_1]] -// CHECK-DAG: %[[CORE_0_2:.*]] = amdaie.core(%[[TILE_0_2]]) +// CHECK-DAG: %[[CORE_0_2:.*]] = amdaie.core(%[[TILE_0_2]], in : [%[[DMA_1]]], out : []) // CHECK-DAG: %[[VAL_0:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_4]], Read) -// CHECK-DAG: amdaie.logicalobjectfifo.consume(%[[DMA_1]]) // CHECK-DAG: linalg.fill ins(%{{.+}} : i32) outs(%[[VAL_0]] : memref<32x64xi32, 2>) -// CHECK-DAG: %[[CORE_1_2:.*]] = amdaie.core(%[[TILE_1_2]]) +// CHECK-DAG: %[[CORE_1_2:.*]] = amdaie.core(%[[TILE_1_2]], in : [%[[DMA_3]]], out : []) // CHECK-DAG: %[[VAL_0:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_3]], Read) -// CHECK-DAG: amdaie.logicalobjectfifo.consume(%[[DMA_3]]) // CHECK-DAG: linalg.fill ins(%{{.+}} : i32) outs(%[[VAL_0]] : memref<32x64xi32, 2>) -// CHECK-DAG: %[[CORE_0_3:.*]] = amdaie.core(%[[TILE_0_3]]) +// CHECK-DAG: %[[CORE_0_3:.*]] = amdaie.core(%[[TILE_0_3]], in : [%[[DMA_1]]], out : []) // CHECK-DAG: %[[VAL_0:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_4]], Read) -// CHECK-DAG: amdaie.logicalobjectfifo.consume(%[[DMA_1]]) // CHECK-DAG: linalg.fill ins(%{{.+}} : i32) outs(%[[VAL_0]] : memref<32x64xi32, 2>) -// CHECK-DAG: %[[CORE_1_3:.*]] = amdaie.core(%[[TILE_1_3]]) +// CHECK-DAG: %[[CORE_1_3:.*]] = amdaie.core(%[[TILE_1_3]], in : [%[[DMA_3]]], out : []) // CHECK-DAG: %[[VAL_0:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_3]], Read) -// CHECK-DAG: amdaie.logicalobjectfifo.consume(%[[DMA_3]]) // CHECK-DAG: linalg.fill ins(%{{.+}} : i32) outs(%[[VAL_0]] : memref<32x64xi32, 2>) module { func.func @hoist_dma_dependencies() { @@ -477,8 +459,7 @@ module { %4 = amdaie.dma_cpy_nd(%2[] [] [], %1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) %add = arith.addi %arg2, %c2 : index %tile = amdaie.tile(%arg3, %add) - %core = amdaie.core(%tile) { - amdaie.logicalobjectfifo.consume(%4) + %core = amdaie.core(%tile, in : [%4], out : []) { linalg.fill ins(%c0_i32 : i32) outs(%alloc_2 : memref<32x64xi32, 2>) amdaie.end } @@ -526,35 +507,31 @@ module { // CHECK-DAG: %[[DMA_0:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_1]][] [] [], %[[FROM_MEMREF_0]][%[[ARG1]]] // CHECK-DAG: %[[DMA_1:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_3]][] [] [], %[[FROM_MEMREF_1]] // CHECK-DAG: %[[DMA_2:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_8]][%c0, %c0] [%c1, %c1] [%c1, %c1], %[[FROM_MEMREF_4]] -// CHECK: %[[CORE_0_2:.*]] = amdaie.core(%[[TILE_0_2]]) -// CHECK: amdaie.logicalobjectfifo.consume(%[[DMA_1]]) -// CHECK: %[[VAL_1:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_3]], Read) -// CHECK: linalg.fill ins(%{{.+}} : i32) outs(%[[VAL_1]] : memref<32x64xi32, 2>) -// CHECK: %[[VAL_0:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_4]], Write) -// CHECK: linalg.fill ins(%{{.+}} : i32) outs(%[[VAL_0]] : memref<32x32xi32, 2>) -// CHECK: %[[DMA_3:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_8]][%c0, %c1] [%c1, %c1] [%c1, %c1], %[[FROM_MEMREF_5]] -// CHECK: %[[CORE_1_2:.*]] = amdaie.core(%[[TILE_1_2]]) -// CHECK: amdaie.logicalobjectfifo.consume(%[[DMA_1]]) -// CHECK: %[[VAL_1:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_3]], Read) -// CHECK: linalg.fill ins(%{{.+}} : i32) outs(%[[VAL_1]] : memref<32x64xi32, 2>) -// CHECK: %[[VAL_0:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_5]], Write) -// CHECK: linalg.fill ins(%{{.+}} : i32) outs(%[[VAL_0]] : memref<32x32xi32, 2>) +// CHECK-DAG: %[[CORE_0_2:.*]] = amdaie.core(%[[TILE_0_2]], in : [%[[DMA_1]]], out : [%[[DMA_2]]]) +// CHECK-DAG: %[[VAL_0:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_4]], Write) +// CHECK-DAG: %[[VAL_1:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_3]], Read) +// CHECK-DAG: linalg.fill ins(%{{.+}} : i32) outs(%[[VAL_1]] : memref<32x64xi32, 2>) +// CHECK-DAG: linalg.fill ins(%{{.+}} : i32) outs(%[[VAL_0]] : memref<32x32xi32, 2>) +// CHECK-DAG: %[[DMA_3:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_8]][%c0, %c1] [%c1, %c1] [%c1, %c1], %[[FROM_MEMREF_5]] +// CHECK-DAG: %[[CORE_1_2:.*]] = amdaie.core(%[[TILE_1_2]], in : [%[[DMA_1]]], out : [%[[DMA_3]]]) +// CHECK-DAG: %[[VAL_0:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_5]], Write) +// CHECK-DAG: %[[VAL_1:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_3]], Read) +// CHECK-DAG: linalg.fill ins(%{{.+}} : i32) outs(%[[VAL_1]] : memref<32x64xi32, 2>) +// CHECK-DAG: linalg.fill ins(%{{.+}} : i32) outs(%[[VAL_0]] : memref<32x32xi32, 2>) // CHECK-DAG: %[[DMA_4:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_2]][] [] [], %[[FROM_MEMREF_1]] // CHECK-DAG: %[[DMA_5:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_8]][%c1, %c0] [%c1, %c1] [%c1, %c1], %[[FROM_MEMREF_6]] -// CHECK: %[[CORE_0_3:.*]] = amdaie.core(%[[TILE_0_3]]) -// CHECK: amdaie.logicalobjectfifo.consume(%[[DMA_4]]) -// CHECK: %[[VAL_1:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_2]], Read) -// CHECK: linalg.fill ins(%{{.+}} : i32) outs(%[[VAL_1]] : memref<32x64xi32, 2>) -// CHECK: %[[VAL_0:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_6]], Write) -// CHECK: linalg.fill ins(%{{.+}} : i32) outs(%[[VAL_0]] : memref<32x32xi32, 2>) -// CHECK: %[[DMA_6:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_8]][%c1, %c1] [%c1, %c1] [%c1, %c1], %[[FROM_MEMREF_7]] -// CHECK: %[[CORE_1_3:.*]] = amdaie.core(%[[TILE_1_3]]) -// CHECK: amdaie.logicalobjectfifo.consume(%[[DMA_4]]) -// CHECK: %[[VAL_1:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_2]], Read) -// CHECK: linalg.fill ins(%{{.+}} : i32) outs(%[[VAL_1]] : memref<32x64xi32, 2>) -// CHECK: %[[VAL_0:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_7]], Write) -// CHECK: linalg.fill ins(%{{.+}} : i32) outs(%[[VAL_0]] : memref<32x32xi32, 2>) -// CHECK: %[[DMA_7:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_9]][%[[ARG1]]] [%c1] [%c1], %[[FROM_MEMREF_8]] +// CHECK-DAG: %[[CORE_0_3:.*]] = amdaie.core(%[[TILE_0_3]], in : [%[[DMA_4]]], out : [%[[DMA_5]]]) +// CHECK-DAG: %[[VAL_0:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_6]], Write) +// CHECK-DAG: %[[VAL_1:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_2]], Read) +// CHECK-DAG: linalg.fill ins(%{{.+}} : i32) outs(%[[VAL_1]] : memref<32x64xi32, 2>) +// CHECK-DAG: linalg.fill ins(%{{.+}} : i32) outs(%[[VAL_0]] : memref<32x32xi32, 2>) +// CHECK-DAG: %[[DMA_6:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_8]][%c1, %c1] [%c1, %c1] [%c1, %c1], %[[FROM_MEMREF_7]] +// CHECK-DAG: %[[CORE_1_3:.*]] = amdaie.core(%[[TILE_1_3]], in : [%[[DMA_4]]], out : [%[[DMA_6]]]) +// CHECK-DAG: %[[VAL_0:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_7]], Write) +// CHECK-DAG: %[[VAL_1:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_2]], Read) +// CHECK-DAG: linalg.fill ins(%{{.+}} : i32) outs(%[[VAL_1]] : memref<32x64xi32, 2>) +// CHECK-DAG: linalg.fill ins(%{{.+}} : i32) outs(%[[VAL_0]] : memref<32x32xi32, 2>) +// CHECK-DAG: %[[DMA_7:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_9]][%[[ARG1]]] [%c1] [%c1], %[[FROM_MEMREF_8]] module { func.func @nested_dma_dependencies() { %c0_i32 = arith.constant 0 : i32 @@ -579,11 +556,9 @@ module { %8 = amdaie.dma_cpy_nd(%4[%arg2, %arg3] [%c1, %c1] [%c1, %c1], %3[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) %add = arith.addi %arg2, %c2 : index %tile = amdaie.tile(%arg3, %add) - %core = amdaie.core(%tile) { - amdaie.logicalobjectfifo.consume(%7) + %core = amdaie.core(%tile, in : [%7], out : [%8]) { linalg.fill ins(%c0_i32 : i32) outs(%alloc_2 : memref<32x64xi32, 2>) linalg.fill ins(%c0_i32 : i32) outs(%alloc_3 : memref<32x32xi32, 2>) - amdaie.logicalobjectfifo.produce(%8) amdaie.end } } {mapping = [#gpu.thread, #gpu.thread]} @@ -625,25 +600,21 @@ module { // CHECK-DAG: %[[FROM_MEMREF_4:.+]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_1]], {%[[TILE_0_1]]} // CHECK-DAG: %[[FROM_MEMREF_5:.+]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_2]], {%[[TILE_0_0]]} // CHECK-DAG: %[[DMA_0:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_4]][%c0, %c0] [%c1, %c1] [%c1, %c1], %[[FROM_MEMREF_0]][%c0, %c0] [%c32, %c32] [%c32, %c1] -// CHECK-DAG: %[[CORE_0_2:.*]] = amdaie.core(%[[TILE_0_2]]) +// CHECK-DAG: %[[CORE_0_2:.*]] = amdaie.core(%[[TILE_0_2]], in : [], out : [%[[DMA_0]]]) // CHECK-DAG: %[[VAL_0:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_0]], Write) // CHECK-DAG: linalg.fill ins(%{{.+}} : i32) outs(%[[VAL_0]] : memref<1x1x32x32xi32, 2>) -// CHECK-DAG: amdaie.logicalobjectfifo.produce(%[[DMA_0]]) // CHECK-DAG: %[[DMA_1:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_4]][%c0, %c1] [%c1, %c1] [%c1, %c1], %[[FROM_MEMREF_1]][%c0, %c0] [%c32, %c32] [%c32, %c1] -// CHECK-DAG: %[[CORE_1_2:.*]] = amdaie.core(%[[TILE_1_2]]) +// CHECK-DAG: %[[CORE_1_2:.*]] = amdaie.core(%[[TILE_1_2]], in : [], out : [%[[DMA_1]]]) // CHECK-DAG: %[[VAL_0:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_1]], Write) // CHECK-DAG: linalg.fill ins(%{{.+}} : i32) outs(%[[VAL_0]] : memref<1x1x32x32xi32, 2>) -// CHECK-DAG: amdaie.logicalobjectfifo.produce(%[[DMA_1]]) // CHECK-DAG: %[[DMA_2:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_4]][%c1, %c0] [%c1, %c1] [%c1, %c1], %[[FROM_MEMREF_2]][%c0, %c0] [%c32, %c32] [%c32, %c1] -// CHECK-DAG: %[[CORE_0_3:.*]] = amdaie.core(%[[TILE_0_3]]) +// CHECK-DAG: %[[CORE_0_3:.*]] = amdaie.core(%[[TILE_0_3]], in : [], out : [%[[DMA_2]]]) // CHECK-DAG: %[[VAL_0:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_2]], Write) // CHECK-DAG: linalg.fill ins(%{{.+}} : i32) outs(%[[VAL_0]] : memref<1x1x32x32xi32, 2>) -// CHECK-DAG: amdaie.logicalobjectfifo.produce(%[[DMA_2]]) // CHECK-DAG: %[[DMA_3:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_4]][%c1, %c1] [%c1, %c1] [%c1, %c1], %[[FROM_MEMREF_3]][%c0, %c0] [%c32, %c32] [%c32, %c1] -// CHECK-DAG: %[[CORE_1_3:.*]] = amdaie.core(%[[TILE_1_3]]) +// CHECK-DAG: %[[CORE_1_3:.*]] = amdaie.core(%[[TILE_1_3]], in : [], out : [%[[DMA_3]]]) // CHECK-DAG: %[[VAL_0:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_3]], Write) // CHECK-DAG: linalg.fill ins(%{{.+}} : i32) outs(%[[VAL_0]] : memref<1x1x32x32xi32, 2>) -// CHECK-DAG: amdaie.logicalobjectfifo.produce(%[[DMA_3]]) module { func.func @local_subview_output() { %c0_i32 = arith.constant 0 : i32 @@ -661,9 +632,8 @@ module { %8 = amdaie.dma_cpy_nd(%1[%arg2, %arg3] [%c1, %c1] [%c1, %c1], %0[%arg2, %arg3, 0, 0] [1, 1, 32, 32] [1, 1, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) %add = arith.addi %arg2, %c2 : index %tile = amdaie.tile(%arg3, %add) - %core = amdaie.core(%tile) { + %core = amdaie.core(%tile, in : [], out : [%8]) { linalg.fill ins(%c0_i32 : i32) outs(%subview : memref<1x1x32x32xi32, strided<[2048, 1024, 32, 1], offset: ?>, 2>) - amdaie.logicalobjectfifo.produce(%8) amdaie.end } } {mapping = [#gpu.thread, #gpu.thread]} @@ -702,7 +672,7 @@ func.func @l1_temporary_buffer_for_matmul_elem() { %subview = memref.subview %alloc_6[%arg2, %arg3, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : memref<2x2x8x8x4x4xi32, 2 : i32> to memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> %26 = arith.addi %arg2, %c2 : index %tile = amdaie.tile(%arg3, %26) - %27 = amdaie.core(%tile) { + %27 = amdaie.core(%tile, in : [], out : []) { linalg.fill ins(%c0_i32 : i32) outs(%subview : memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) amdaie.end } @@ -757,13 +727,10 @@ func.func @l1_temporary_buffer_for_matmul_elem() { // CHECK-SAME: %[[FROM_MEMREF_6]] // CHECK-DAG: %[[DMA_5:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_10]] // CHECK-SAME: %[[FROM_MEMREF_4]] -// CHECK-DAG: %[[CORE_0:.*]] = amdaie.core(%[[TILE_0_2]]) +// CHECK-DAG: %[[CORE_0:.*]] = amdaie.core(%[[TILE_0_2]], in : [%[[DMA_1]], %[[DMA_3]]], out : [%[[DMA_4]]]) // CHECK-DAG: %[[VAL_0:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_7]], Read) // CHECK-DAG: %[[VAL_1:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_9]], Read) // CHECK-DAG: %[[VAL_2:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_6]], Write) -// CHECK-DAG: amdaie.logicalobjectfifo.consume(%[[DMA_1]]) -// CHECK-DAG: amdaie.logicalobjectfifo.consume(%[[DMA_3]]) -// CHECK-DAG: amdaie.logicalobjectfifo.produce(%[[DMA_4]]) // CHECK-DAG: %[[DMA_6:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_1]] // CHECK-SAME: %[[FROM_MEMREF_12]] // CHECK-DAG: %[[DMA_7:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_8]] @@ -772,13 +739,10 @@ func.func @l1_temporary_buffer_for_matmul_elem() { // CHECK-SAME: %[[FROM_MEMREF_5]] // CHECK-DAG: %[[DMA_9:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_10]] // CHECK-SAME: %[[FROM_MEMREF_3]] -// CHECK-DAG: %[[CORE_1:.*]] = amdaie.core(%[[TILE_1_2]]) +// CHECK-DAG: %[[CORE_1:.*]] = amdaie.core(%[[TILE_1_2]], in : [%[[DMA_1]], %[[DMA_7]]], out : [%[[DMA_8]]]) // CHECK-DAG: %[[VAL_0:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_7]], Read) // CHECK-DAG: %[[VAL_1:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_8]], Read) // CHECK-DAG: %[[VAL_2:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_5]], Write) -// CHECK-DAG: amdaie.logicalobjectfifo.consume(%[[DMA_1]]) -// CHECK-DAG: amdaie.logicalobjectfifo.consume(%[[DMA_7]]) -// CHECK-DAG: amdaie.logicalobjectfifo.produce(%[[DMA_8]]) #map = affine_map<(d0) -> (d0 * 32)> #map1 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d2, d0, d3, d5)> #map2 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d2, d5, d4)> @@ -829,16 +793,13 @@ module { %19 = amdaie.dma_cpy_nd(%9[%12, %13] [%c32, %c32] [%c64, %c1], %5[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) %20 = arith.addi %arg2, %c2 : index %tile = amdaie.tile(%arg3, %20) - %21 = amdaie.core(%tile) { - amdaie.logicalobjectfifo.consume(%16) - amdaie.logicalobjectfifo.consume(%17) + %21 = amdaie.core(%tile, in : [%16, %17], out : [%18]) { linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%alloc_0, %alloc : memref<8x8x4x8xi32, 2>, memref<4x8x8x8xi32, 2>) outs(%alloc_1 : memref<4x8x4x8xi32, 2>) { ^bb0(%in: i32, %in_5: i32, %out: i32): %22 = arith.muli %in, %in_5 : i32 %23 = arith.addi %out, %22 : i32 linalg.yield %23 : i32 } - amdaie.logicalobjectfifo.produce(%18) amdaie.end } } {mapping = [#gpu.thread, #gpu.thread]} @@ -880,10 +841,7 @@ module { // CHECK-SAME: %[[FROM_MEMREF_0]] // CHECK-DAG: %[[DMA_2:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_2]] // CHECK-SAME: %[[FROM_MEMREF_5]] -// CHECK-DAG: %[[CORE_0:.*]] = amdaie.core(%[[TILE_0_2]]) -// CHECK-DAG: amdaie.logicalobjectfifo.consume(%[[DMA_0]]) -// CHECK-DAG: amdaie.logicalobjectfifo.consume(%[[DMA_1]]) -// CHECK-DAG: amdaie.logicalobjectfifo.produce(%[[DMA_2]]) +// CHECK-DAG: %[[CORE_0:.*]] = amdaie.core(%[[TILE_0_2]], in : [%[[DMA_0]], %[[DMA_1]]], out : [%[[DMA_2]]]) // CHECK-DAG: %[[VAL_0:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_3]], Read) // CHECK-DAG: %[[VAL_1:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_4]], Read) // CHECK-DAG: %[[VAL_2:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_5]], Write) @@ -941,9 +899,7 @@ module { %21 = amdaie.dma_cpy_nd(%2[%arg2, %arg3, %c0, %c0] [%c1, %c1, %c64, %c64] [%c8192, %c4096, %c64, %c1], %17[%arg2, %arg3, %c0, %c0, %c0, %c0] [%c1, %c1, %c16, %c4, %c16, %c4] [%c8192, %c4096, %c16, %c4, %c256, %c1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) %22 = arith.addi %arg2, %c2 : index %tile = amdaie.tile(%arg3, %22) - %23 = amdaie.core(%tile) { - amdaie.logicalobjectfifo.consume(%19) - amdaie.logicalobjectfifo.consume(%20) + %23 = amdaie.core(%tile, in : [%19, %20], out : [%21]) { scf.for %arg4 = %c0 to %c16 step %c1 { scf.for %arg5 = %c0 to %c16 step %c1 { scf.for %arg6 = %c0 to %c8 step %c1 { @@ -957,7 +913,6 @@ module { } } } - amdaie.logicalobjectfifo.produce(%21) amdaie.end } } {mapping = [#gpu.thread, #gpu.thread]} @@ -995,12 +950,10 @@ module { // CHECK-DAG: %[[DMA_1:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_2]] // CHECK-SAME: %[[FROM_MEMREF_0]] // CHECK-DAG: %[[FROM_MEMREF_4:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_5]], {%[[TILE_0_2]]} -// CHECK-DAG: %[[CORE_0:.*]] = amdaie.core(%[[TILE_0_2]]) +// CHECK-DAG: %[[CORE_0:.*]] = amdaie.core(%[[TILE_0_2]], in : [%[[DMA_0]], %[[DMA_1]]], out : []) // CHECK-DAG: %[[VAL_0:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_2]], Read) // CHECK-DAG: %[[VAL_1:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_3]], Read) // CHECK-DAG: %[[VAL_2:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_4]], None) -// CHECK-DAG: amdaie.logicalobjectfifo.consume(%[[DMA_0]]) -// CHECK-DAG: amdaie.logicalobjectfifo.consume(%[[DMA_1]]) // CHECK-DAG: linalg.fill // CHECK-DAG: memref.extract_strided_metadata %[[VAL_1]] // CHECK-DAG: memref.extract_strided_metadata %[[VAL_0]] @@ -1043,9 +996,7 @@ module { %subview = memref.subview %alloc_3[%arg2, %arg3, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : memref<2x2x8x8x4x4xi32, 2 : i32> to memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> %21 = arith.addi %arg2, %c2 : index %tile = amdaie.tile(%arg3, %21) - %22 = amdaie.core(%tile) { - amdaie.logicalobjectfifo.consume(%19) - amdaie.logicalobjectfifo.consume(%20) + %22 = amdaie.core(%tile, in : [%19, %20], out : []) { linalg.fill ins(%c0_i32 : i32) outs(%subview : memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) %base_buffer, %offset, %sizes:6, %strides:6 = memref.extract_strided_metadata %alloc_0 : memref<1x1x4x8x4x8xi32, 2 : i32> -> memref, index, index, index, index, index, index, index, index, index, index, index, index, index %base_buffer_5, %offset_6, %sizes_7:6, %strides_8:6 = memref.extract_strided_metadata %alloc : memref<1x1x8x4x8x4xi32, 2 : i32> -> memref, index, index, index, index, index, index, index, index, index, index, index, index, index diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/flatten_logical_objectfifo.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/flatten_logical_objectfifo.mlir index ac3c01cf2..60e6330c2 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/flatten_logical_objectfifo.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/flatten_logical_objectfifo.mlir @@ -37,7 +37,7 @@ module { %1 = amdaie.logicalobjectfifo.from_memref %alloc_0, {%tile} : memref<1x2x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo> %2 = amdaie.circular_dma_cpy_nd(%0[%c0] [%c1024] [%c1], %1[%c0, %c0, %c0] [%c8, %c32, %c4] [%c4, %c32, %c1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) %3 = amdaie.logicalobjectfifo.from_memref %alloc_1, {%tile_2} : memref<1x1x8x8x4x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo> - %4 = amdaie.core(%tile_2) { + %4 = amdaie.core(%tile_2, in : [%2], out : []) { scf.forall (%arg0, %arg1) in (2, 2) { %5 = amdaie.logicalobjectfifo.access(%0, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x8x4x8x4xi32, 2 : i32> %6 = amdaie.logicalobjectfifo.access(%3, None) : !amdaie.logicalobjectfifo> -> memref<1x1x8x8x4x4xi32, 2 : i32> diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/insert_cores.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/insert_cores.mlir index aba2deaaa..6a043cfc1 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/insert_cores.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/insert_cores.mlir @@ -22,10 +22,8 @@ func.func @insert_cores_with_non_normalized_forall() { // CHECK: %[[C2:.*]] = arith.constant 2 : index // CHECK: %[[ADD:.*]] = arith.addi %[[ARG2]], %[[C2]] : index // CHECK: %[[TILE0:.*]] = amdaie.tile(%[[ARG3]], %[[ADD]]) -// CHECK: %[[CORE0:.*]] = amdaie.core(%[[TILE0]]) { +// CHECK: %[[CORE0:.*]] = amdaie.core(%[[TILE0]], in : [%[[DMA_CPY2]], %[[DMA_CPY3]]], out : []) { // CHECK: linalg.fill -// CHECK: amdaie.logicalobjectfifo.consume(%[[DMA_CPY2]]) -// CHECK: amdaie.logicalobjectfifo.consume(%[[DMA_CPY3]]) // CHECK: linalg.generic // CHECK: amdaie.end // CHECK: } @@ -40,12 +38,9 @@ func.func @insert_cores_with_non_normalized_forall() { // CHECK: %[[C2:.*]] = arith.constant 2 : index // CHECK: %[[ADD:.*]] = arith.addi %[[ARG2]], %[[C2]] : index // CHECK: %[[TILE1:.*]] = amdaie.tile(%[[ARG3]], %[[ADD]]) -// CHECK: %[[CORE1:.*]] = amdaie.core(%[[TILE1]]) { +// CHECK: %[[CORE1:.*]] = amdaie.core(%[[TILE1]], in : [%[[DMA_CPY2]], %[[DMA_CPY3]]], out : [%[[DMA_CPY4]]]) { // CHECK: linalg.fill ins(%c0_i32 : i32) outs(%alloc_1 : memref<4x8x4x8xi32, 2>) -// CHECK: amdaie.logicalobjectfifo.consume(%[[DMA_CPY2]]) -// CHECK: amdaie.logicalobjectfifo.consume(%[[DMA_CPY3]]) // CHECK: linalg.generic -// CHECK: amdaie.logicalobjectfifo.produce(%[[DMA_CPY4]]) // CHECK: amdaie.end // CHECK: } // CHECK: } {mapping = [#gpu.thread, #gpu.thread]} @@ -142,9 +137,7 @@ module { // CHECK: %[[C2:.*]] = arith.constant 2 : index // CHECK: %[[ADD:.*]] = arith.addi %[[ARG2]], %[[C2]] : index // CHECK: %[[TILE0:.*]] = amdaie.tile(%[[ARG3]], %[[ADD]]) -// CHECK: amdaie.core(%[[TILE0]]) { -// CHECK: amdaie.logicalobjectfifo.consume(%[[DMA_CPY0]]) -// CHECK: amdaie.logicalobjectfifo.consume(%[[DMA_CPY1]]) +// CHECK: amdaie.core(%[[TILE0]], in : [%[[DMA_CPY0]], %[[DMA_CPY1]]], out : []) { // CHECK: linalg.fill // CHECK: scf.for // CHECK: scf.for @@ -226,9 +219,7 @@ module { // CHECK: %[[C2:.*]] = arith.constant 2 : index // CHECK: %[[ADD:.*]] = arith.addi %[[ARG2]], %[[C2]] : index // CHECK: %[[TILE0:.*]] = amdaie.tile(%[[ARG3]], %[[ADD]]) -// CHECK: amdaie.core(%[[TILE0]]) { -// CHECK: amdaie.logicalobjectfifo.consume(%[[DMA_CPY0]]) -// CHECK: amdaie.logicalobjectfifo.consume(%[[DMA_CPY1]]) +// CHECK: amdaie.core(%[[TILE0]], in : [%[[DMA_CPY0]], %[[DMA_CPY1]]], out : []) { // CHECK: linalg.fill // CHECK: memref.extract_strided_metadata // CHECK: memref.extract_strided_metadata diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/lower_to_aie.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/lower_to_aie.mlir index 4c0555334..139d0b5bf 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/lower_to_aie.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/lower_to_aie.mlir @@ -192,7 +192,7 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} %obj2 = amdaie.logicalobjectfifo.from_memref %alloc_2, {%tile_0_2} : memref<4x8x4x8xi32, 2> -> !amdaie.logicalobjectfifo> %dma0 = amdaie.circular_dma_cpy_nd(%obj1[] [] [], %obj2[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) %dma1 = amdaie.circular_dma_cpy_nd(%obj0[] [] [], %obj1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - %core_0_0 = amdaie.core(%tile_0_2) { + %core_0_0 = amdaie.core(%tile_0_2, in : [], out : [%dma0]) { %0 = amdaie.logicalobjectfifo.acquire(%dma0, Produce) {size = 1 : i32} -> !amdaie.logicalobjectfifo> %1 = amdaie.logicalobjectfifo.access(%0, Write) : !amdaie.logicalobjectfifo> -> memref<32x32xi32, 1> linalg.fill ins(%c0_i32 : i32) outs(%1 : memref<32x32xi32, 1>) @@ -258,7 +258,7 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} %obj4 = amdaie.logicalobjectfifo.from_memref %alloc_4, {%tile_0_2} : memref<4x8x4x8xf32, 1> -> !amdaie.logicalobjectfifo> %dma0 = amdaie.circular_dma_cpy_nd(%obj1[] [] [], %obj2[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) %dma1 = amdaie.circular_dma_cpy_nd(%obj3[] [] [], %obj4[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - %core_0_0 = amdaie.core(%tile_0_2) { + %core_0_0 = amdaie.core(%tile_0_2, in : [], out : [%dma0, %dma1]) { %0 = amdaie.logicalobjectfifo.acquire(%dma0, Produce) {size = 1 : i32} -> !amdaie.logicalobjectfifo> %1 = amdaie.logicalobjectfifo.access(%0, Write) : !amdaie.logicalobjectfifo> -> memref<32x32xi32, 2> %2 = amdaie.logicalobjectfifo.acquire(%dma1, Produce) {size = 1 : i32} -> !amdaie.logicalobjectfifo> @@ -323,11 +323,11 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} %obj2 = amdaie.logicalobjectfifo.from_memref %alloc_2, {%tile_0_2, %tile_1_2} : memref<4x8x4x8xi32, 2> -> !amdaie.logicalobjectfifo> %dma0 = amdaie.circular_dma_cpy_nd(%obj1[] [] [], %obj0[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) %dma1 = amdaie.circular_dma_cpy_nd(%obj2[] [] [], %obj1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - %core_0_2 = amdaie.core(%tile_0_2) { + %core_0_2 = amdaie.core(%tile_0_2, in : [%dma1], out : []) { %0 = amdaie.logicalobjectfifo.acquire(%dma1, Consume) {size = 1 : i32} -> !amdaie.logicalobjectfifo> amdaie.end } - %core_1_2 = amdaie.core(%tile_1_2) { + %core_1_2 = amdaie.core(%tile_1_2, in : [%dma1], out : []) { %0 = amdaie.logicalobjectfifo.acquire(%dma1, Consume) {size = 1 : i32} -> !amdaie.logicalobjectfifo> amdaie.end } @@ -372,7 +372,7 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} %obj2 = amdaie.logicalobjectfifo.from_memref %alloc_2, {%tile_0_2} : memref<4x8x4x8xi32, 2> -> !amdaie.logicalobjectfifo> %dma0 = amdaie.circular_dma_cpy_nd(%obj1[] [] [], %obj2[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) %dma1 = amdaie.circular_dma_cpy_nd(%obj0[] [] [], %obj1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - %core_0_0 = amdaie.core(%tile_0_2) { + %core_0_0 = amdaie.core(%tile_0_2, in : [], out : [%dma0]) { amdaie.logicalobjectfifo.release(%dma0, Produce) {size = 1 : i32} amdaie.end } @@ -875,7 +875,7 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} %dma0 = amdaie.circular_dma_cpy_nd(%obj1[] [] [], %obj0[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) %dma1 = amdaie.circular_dma_cpy_nd(%obj2[] [] [], %obj1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) amdaie.logicalobjectfifo.link[%dma0] -> [%dma1] () - %core_0_2 = amdaie.core(%tile_0_2) { + %core_0_2 = amdaie.core(%tile_0_2, in : [%dma1], out : []) { %1 = amdaie.logicalobjectfifo.acquire(%dma1, Consume) {size = 1 : i32} -> !amdaie.logicalobjectfifo> %2 = amdaie.logicalobjectfifo.access(%1, Read) : !amdaie.logicalobjectfifo> -> memref<4x8x4x8xi32, 2> scf.for %arg2 = %c0 to %c8 step %c1 { @@ -884,7 +884,7 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} amdaie.logicalobjectfifo.release(%dma1, Consume) {size = 1 : i32} amdaie.end } - %core_1_2 = amdaie.core(%tile_1_2) { + %core_1_2 = amdaie.core(%tile_1_2, in : [%dma1], out : []) { %1 = amdaie.logicalobjectfifo.acquire(%dma1, Consume) {size = 1 : i32} -> !amdaie.logicalobjectfifo> %2 = amdaie.logicalobjectfifo.access(%1, Read) : !amdaie.logicalobjectfifo> -> memref<4x8x4x8xi32, 2> scf.for %arg2 = %c0 to %c8 step %c1 { diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/none_access_to_temporary_buffer.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/none_access_to_temporary_buffer.mlir index 2780da1b6..502f51f54 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/none_access_to_temporary_buffer.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/none_access_to_temporary_buffer.mlir @@ -11,7 +11,7 @@ func.func @none_access_to_buffer(%arg0: !amdaie.logicalobjectfifo> -> memref<1x1x8x16xi32, 2> linalg.fill ins(%c0_i32 : i32) outs(%3 : memref<1x1x8x16xi32, 2>) amdaie.end @@ -43,20 +43,18 @@ func.func @single_none_access_multiple_users(%arg0: !amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) %1 = amdaie.circular_dma_cpy_nd(%arg3[] [] [], %arg2[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) %2 = amdaie.logicalobjectfifo.from_memref %arg4, {%tile} : memref<1x1x8x16xi32, 2> -> !amdaie.logicalobjectfifo> - %3 = amdaie.core(%tile) { + %3 = amdaie.core(%tile, in : [%0], out : [%1]) { %4 = amdaie.logicalobjectfifo.acquire(%0, Consume) {size = 1 : i32} -> !amdaie.logicalobjectfifo> %5 = amdaie.logicalobjectfifo.access(%4, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x8x16xi32, 2> %6 = amdaie.logicalobjectfifo.access(%2, None) : !amdaie.logicalobjectfifo> -> memref<1x1x8x16xi32, 2> %7 = amdaie.logicalobjectfifo.acquire(%1, Produce) {size = 1 : i32} -> !amdaie.logicalobjectfifo> %8 = amdaie.logicalobjectfifo.access(%7, Write) : !amdaie.logicalobjectfifo> -> memref<1x1x8x16xi32, 2> - amdaie.logicalobjectfifo.consume(%0) linalg.fill ins(%c0_i32 : i32) outs(%5 : memref<1x1x8x16xi32, 2>) linalg.fill ins(%c0_i32 : i32) outs(%6 : memref<1x1x8x16xi32, 2>) linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%6 : memref<1x1x8x16xi32, 2>) outs(%8 : memref<1x1x8x16xi32, 2>) { ^bb0(%in: i32, %out: i32): linalg.yield %in : i32 } - amdaie.logicalobjectfifo.produce(%1) amdaie.logicalobjectfifo.release(%0, Consume) {size = 1 : i32} amdaie.logicalobjectfifo.release(%1, Produce) {size = 1 : i32} amdaie.end @@ -97,31 +95,27 @@ func.func @multiple_none_access_multiple_users(%arg0: !amdaie.logicalobjectfifo< %0 = amdaie.circular_dma_cpy_nd(%arg0[] [] [], %arg1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) %1 = amdaie.circular_dma_cpy_nd(%arg3[] [] [], %arg2[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) %2 = amdaie.logicalobjectfifo.from_memref %arg4, {%tile} : memref<1x1x8x16xi32, 2> -> !amdaie.logicalobjectfifo> - %3 = amdaie.core(%tile) { + %3 = amdaie.core(%tile, in : [%0], out : [%1]) { %4 = amdaie.logicalobjectfifo.acquire(%0, Consume) {size = 1 : i32} -> !amdaie.logicalobjectfifo> %5 = amdaie.logicalobjectfifo.access(%4, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x8x16xi32, 2> %6 = amdaie.logicalobjectfifo.access(%2, None) : !amdaie.logicalobjectfifo> -> memref<1x1x8x16xi32, 2> - amdaie.logicalobjectfifo.consume(%0) linalg.fill ins(%c0_i32 : i32) outs(%5 : memref<1x1x8x16xi32, 2>) linalg.fill ins(%c0_i32 : i32) outs(%6 : memref<1x1x8x16xi32, 2>) scf.for %arg5 = %c0 to %c8 step %c1 { amdaie.logicalobjectfifo.release(%0, Consume) {size = 1 : i32} %10 = amdaie.logicalobjectfifo.acquire(%0, Consume) {size = 1 : i32} -> !amdaie.logicalobjectfifo> %11 = amdaie.logicalobjectfifo.access(%10, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x8x16xi32, 2> - amdaie.logicalobjectfifo.consume(%0) linalg.fill ins(%c0_i32 : i32) outs(%11 : memref<1x1x8x16xi32, 2>) } amdaie.logicalobjectfifo.release(%0, Consume) {size = 1 : i32} %7 = amdaie.logicalobjectfifo.access(%2, None) : !amdaie.logicalobjectfifo> -> memref<1x1x8x16xi32, 2> %8 = amdaie.logicalobjectfifo.acquire(%1, Produce) {size = 1 : i32} -> !amdaie.logicalobjectfifo> %9 = amdaie.logicalobjectfifo.access(%8, Write) : !amdaie.logicalobjectfifo> -> memref<1x1x8x16xi32, 2> - amdaie.logicalobjectfifo.consume(%0) linalg.fill ins(%c0_i32 : i32) outs(%7 : memref<1x1x8x16xi32, 2>) linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%7 : memref<1x1x8x16xi32, 2>) outs(%9 : memref<1x1x8x16xi32, 2>) { ^bb0(%in: i32, %out: i32): linalg.yield %in : i32 } - amdaie.logicalobjectfifo.produce(%1) amdaie.logicalobjectfifo.release(%1, Produce) {size = 1 : i32} amdaie.end }